In [5]:
# Step 2: Ingest Data from Multiple Sources

# a. CSV File
import pandas as pd
import requests

csv_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/us-weather-history/KCLT.csv"
df_csv = pd.read_csv(csv_url)
print("CSV Data Sample:")
print(df_csv.head())

# b. JSON File
json_url = "https://jsonplaceholder.typicode.com/users"
df_json = pd.read_json(json_url)
print("\nJSON Data Sample:")
print(df_json.head())

# c. REST API (Open-Meteo)
api_url = "https://api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&current_weather=true"
response = requests.get(api_url)

if response.status_code == 200:
    data = response.json()
    df_api = pd.json_normalize(data['current_weather'])
    print("\nREST API Data Sample:")
    print(df_api.head(1))
else:
    print("Failed to fetch data. Status code:", response.status_code)


CSV Data Sample:
       date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0  2014-7-1                81               70               91   
1  2014-7-2                85               74               95   
2  2014-7-3                82               71               93   
3  2014-7-4                75               64               86   
4  2014-7-5                72               60               84   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0                67                89               56              104   
1                68                89               56              101   
2                68                89               56               99   
3                68                89               55               99   
4                68                89               57              100   

   record_min_temp_year  record_max_temp_year  actual_precipitation  \
0                  1919                  2012             

Identical columns: ['source', 'location', 'date', 'temperature', 'windspeed', 'extra_info']


In [6]:
# Clean CSV (Weather Data for Charlotte, NC)
df_csv_clean = df_csv[['date', 'actual_mean_temp']].copy()
df_csv_clean['source'] = 'CSV'
df_csv_clean['location'] = 'Charlotte, NC'
df_csv_clean.rename(columns={'actual_mean_temp': 'temperature', 'date': 'date'}, inplace=True)
df_csv_clean['windspeed'] = None
df_csv_clean['extra_info'] = None


In [7]:
# Clean JSON (User Data - Simulated temperature info from fake users)
df_json_clean = df_json[['id', 'name', 'email']].copy()
df_json_clean['source'] = 'JSON'
df_json_clean['date'] = pd.Timestamp.today().strftime('%Y-%m-%d')
df_json_clean['location'] = df_json_clean['name']
df_json_clean['temperature'] = None
df_json_clean['windspeed'] = None
df_json_clean['extra_info'] = df_json_clean['email']
df_json_clean = df_json_clean[['source', 'location', 'date', 'temperature', 'windspeed', 'extra_info']]

In [8]:
# Clean API (Current Weather Data for Berlin)
df_api_clean = df_api.copy()
df_api_clean['source'] = 'API'
df_api_clean['location'] = 'Berlin'
df_api_clean.rename(columns={'time': 'date'}, inplace=True)
df_api_clean['extra_info'] = f"WeatherCode: {df_api_clean['weathercode'].iloc[0]}"
df_api_clean = df_api_clean[['source', 'location', 'date', 'temperature', 'windspeed', 'extra_info']]

In [10]:
df_unified = pd.concat([df_csv_clean, df_json_clean, df_api_clean], ignore_index=True)
print("\nUnified DataFrame:")
print(df_unified.head(10))


Unified DataFrame:
        date  temperature source       location  windspeed extra_info
0   2014-7-1         81.0    CSV  Charlotte, NC        NaN       None
1   2014-7-2         85.0    CSV  Charlotte, NC        NaN       None
2   2014-7-3         82.0    CSV  Charlotte, NC        NaN       None
3   2014-7-4         75.0    CSV  Charlotte, NC        NaN       None
4   2014-7-5         72.0    CSV  Charlotte, NC        NaN       None
5   2014-7-6         74.0    CSV  Charlotte, NC        NaN       None
6   2014-7-7         79.0    CSV  Charlotte, NC        NaN       None
7   2014-7-8         83.0    CSV  Charlotte, NC        NaN       None
8   2014-7-9         80.0    CSV  Charlotte, NC        NaN       None
9  2014-7-10         78.0    CSV  Charlotte, NC        NaN       None


  df_unified = pd.concat([df_csv_clean, df_json_clean, df_api_clean], ignore_index=True)
