<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/c422_Mounika/DAY_6_Mutiple_resocures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Step 1: Set Up Environment
!pip install pandas requests -q


In [None]:
# Step 2: Ingest Data from Multiple Sources
# Each source is loaded into its own raw DataFrame (df_csv, df_json, df_api)
# and the first rows are printed as a quick sanity check.

# a. CSV File
import pandas as pd

csv_url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
df_csv = pd.read_csv(csv_url)
print("CSV Data Sample:")
print(df_csv.head())

# b. JSON File
import json

json_url = "https://jsonplaceholder.typicode.com/users"
df_json = pd.read_json(json_url)
print("\nJSON Data Sample:")
print(df_json.head())

# c. REST API
import requests

api_url = "https://randomuser.me/api/?results=5"
# requests applies no timeout by default; set one so a stalled server cannot
# hang the kernel indefinitely.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()
# The records live under the 'results' key; json_normalize flattens the nested
# fields into dotted column names (e.g. 'name.first').
df_api = pd.json_normalize(data['results'])
print("\nREST API Data Sample:")
print(df_api.head())

CSV Data Sample:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

JSON Data Sample:
   id              name   username                      email  \
0   1     Leanne Graham       Bret          Sincere@april.biz   
1   2      Ervin Howell  Antonette          Shanna@melissa.tv   
2   3  Clementine Bauch   Samantha         Nathan@yesenia.net   
3   4  Patricia Lebsack   Karianne  Julianne.OConner@kory.org   
4   5  Chelsey Dietrich     Kamren   Lucio_Hettinger@annie.ca   

                                             address                  phone  \
0  {'street': 'Kulas Light', 'suite': 'Apt. 556',...  1-770-736-8031 x56442   
1  {'street': 'Victor Plains', 

In [None]:
# Step 2: Ingest Data from Multiple Sources
# Each source is loaded into its own raw DataFrame (df_csv, df_json, df_api).
# For the CSV source this cell prints the column index rather than sample rows.

# a. CSV File
import pandas as pd

csv_url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
df_csv = pd.read_csv(csv_url)
print("CSV Data Sample:")
print(df_csv.columns)

# b. JSON File
import json

json_url = "https://jsonplaceholder.typicode.com/users"
df_json = pd.read_json(json_url)
print("\nJSON Data Sample:")
print(df_json.head())

# c. REST API
import requests

api_url = "https://randomuser.me/api/?results=5"
# requests applies no timeout by default; set one so a stalled server cannot
# hang the kernel indefinitely.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()
# The records live under the 'results' key; json_normalize flattens the nested
# fields into dotted column names (e.g. 'name.first').
df_api = pd.json_normalize(data['results'])
print("\nREST API Data Sample:")
print(df_api.head())

CSV Data Sample:
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

JSON Data Sample:
   id              name   username                      email  \
0   1     Leanne Graham       Bret          Sincere@april.biz   
1   2      Ervin Howell  Antonette          Shanna@melissa.tv   
2   3  Clementine Bauch   Samantha         Nathan@yesenia.net   
3   4  Patricia Lebsack   Karianne  Julianne.OConner@kory.org   
4   5  Chelsey Dietrich     Kamren   Lucio_Hettinger@annie.ca   

                                             address                  phone  \
0  {'street': 'Kulas Light', 'suite': 'Apt. 556',...  1-770-736-8031 x56442   
1  {'street': 'Victor Plains', 'suite': 'Suite 87...    010-692-6593 x09125   
2  {'street': 'Douglas Extension', 'suite': 'Suit...         1-463-123-4447   
3  {'street': 'Hoeger Mall', 'suite': 'Apt. 692',...      493-170-9623 x156   
4  {'street': 'Skiles Walks', 'suite': 'Suite 351...          (254)9

In [None]:
# Step 3: Modular Cleaning/Transformation
# Clean each raw source independently and tag every row with the source it
# came from. Each result is a new DataFrame; the raw frames are left untouched.

# For CSV (Iris): drop incomplete rows and expose the species as 'name'.
# 'source' holds the origin label ('csv'), matching how the JSON and API
# frames below use that column, rather than holding the species values.
df_csv_clean = (
    df_csv
    .dropna()
    .rename(columns={'species': 'name'})
    .assign(source='csv')
)

# For JSON (User info), select name and email
df_json_clean = df_json[['name', 'email']].assign(source='json')

# For API data (Random users), grab first/last name, email
df_api_clean = pd.DataFrame({
    # json_normalize produced dotted columns; join them into one display name
    'name': df_api['name.first'] + " " + df_api['name.last'],
    'email': df_api['email'],
    'source': 'api',
})

In [None]:
# Step 4: Prepare each cleaned DataFrame with identical columns

# Iris measurement columns that only the CSV source actually provides.
iris_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
common_cols = ['name', 'email', 'source'] + iris_cols

# CSV (Iris) — rename species to name, add missing columns.
# The measurement columns already exist here, so only email/source are added.
df_csv_clean = df_csv.rename(columns={'species': 'name'})
df_csv_clean['email'] = None
df_csv_clean['source'] = 'csv'

# JSON (Users) — add placeholder iris columns.
# The placeholders are float NaN rather than None: None would create
# object-dtype all-NA columns, and pd.concat's handling of such columns when
# resolving result dtypes is deprecated (it emits a FutureWarning). NaN keeps
# the measurement columns float64 in the unified frame.
df_json_clean = df_json[['name', 'email']].copy()
df_json_clean['source'] = 'json'
for col in iris_cols:
    df_json_clean[col] = float('nan')

# API (Random Users) — add placeholder iris columns (same NaN convention).
df_api_clean = pd.DataFrame({
    'name': df_api['name.first'] + ' ' + df_api['name.last'],
    'email': df_api['email'],
    'source': 'api'
})
for col in iris_cols:
    df_api_clean[col] = float('nan')

# Step 5: Concatenate into unified DataFrame
# Selecting common_cols on every frame guarantees identical column order;
# ignore_index=True renumbers the rows 0..n-1 across all three sources.
unified_df = pd.concat([
    df_csv_clean[common_cols],
    df_json_clean[common_cols],
    df_api_clean[common_cols]
], ignore_index=True)

print("\nUnified Clean Dataset Sample:")
print(unified_df.head(10))


Unified Clean Dataset Sample:
     name email source  sepal_length  sepal_width  petal_length  petal_width
0  setosa  None    csv           5.1          3.5           1.4          0.2
1  setosa  None    csv           4.9          3.0           1.4          0.2
2  setosa  None    csv           4.7          3.2           1.3          0.2
3  setosa  None    csv           4.6          3.1           1.5          0.2
4  setosa  None    csv           5.0          3.6           1.4          0.2
5  setosa  None    csv           5.4          3.9           1.7          0.4
6  setosa  None    csv           4.6          3.4           1.4          0.3
7  setosa  None    csv           5.0          3.4           1.5          0.2
8  setosa  None    csv           4.4          2.9           1.4          0.2
9  setosa  None    csv           4.9          3.1           1.5          0.1


  unified_df = pd.concat([


In [None]:
# Step 4: Prepare each cleaned DataFrame with identical columns
# (this variant keeps only three of the iris measurement columns)

# Iris measurement columns that only the CSV source actually provides.
iris_cols = ['sepal_length', 'sepal_width', 'petal_length']
common_cols = ['name', 'email', 'source'] + iris_cols

# CSV (Iris) — rename species to name, add missing columns.
# The measurement columns already exist here, so only email/source are added.
df_csv_clean = df_csv.rename(columns={'species': 'name'})
df_csv_clean['email'] = None
df_csv_clean['source'] = 'csv'

# JSON (Users) — add placeholder iris columns.
# The placeholders are float NaN rather than None: None would create
# object-dtype all-NA columns, and pd.concat's handling of such columns when
# resolving result dtypes is deprecated (it emits a FutureWarning). NaN keeps
# the measurement columns float64 in the unified frame.
df_json_clean = df_json[['name', 'email']].copy()
df_json_clean['source'] = 'json'
for col in iris_cols:
    df_json_clean[col] = float('nan')

# API (Random Users) — add placeholder iris columns (same NaN convention).
df_api_clean = pd.DataFrame({
    'name': df_api['name.first'] + ' ' + df_api['name.last'],
    'email': df_api['email'],
    'source': 'api'
})
for col in iris_cols:
    df_api_clean[col] = float('nan')

# Step 5: Concatenate into unified DataFrame
# Selecting common_cols on every frame guarantees identical column order;
# ignore_index=True renumbers the rows 0..n-1 across all three sources.
unified_df = pd.concat([
    df_csv_clean[common_cols],
    df_json_clean[common_cols],
    df_api_clean[common_cols]
], ignore_index=True)

print("\nUnified Clean Dataset Sample:")
print(unified_df.head(10))


Unified Clean Dataset Sample:
     name email source  sepal_length  sepal_width  petal_length
0  setosa  None    csv           5.1          3.5           1.4
1  setosa  None    csv           4.9          3.0           1.4
2  setosa  None    csv           4.7          3.2           1.3
3  setosa  None    csv           4.6          3.1           1.5
4  setosa  None    csv           5.0          3.6           1.4
5  setosa  None    csv           5.4          3.9           1.7
6  setosa  None    csv           4.6          3.4           1.4
7  setosa  None    csv           5.0          3.4           1.5
8  setosa  None    csv           4.4          2.9           1.4
9  setosa  None    csv           4.9          3.1           1.5


  unified_df = pd.concat([


In [None]:
import pandas as pd
import requests

# Build one long-format table with columns (date, value, source) from three
# differently shaped sources.

# Date stamp shared by the two sources that have no date of their own.
today = pd.Timestamp.today().strftime('%Y-%m-%d')

# CSV: Historical weather
csv_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/us-weather-history/KCLT.csv"
df_csv = pd.read_csv(csv_url)
df_csv_clean = (
    df_csv[['date', 'actual_mean_temp']]
    .rename(columns={'actual_mean_temp': 'value'})
    .assign(source='csv')
)
# The raw dates are not zero-padded (e.g. '2014-7-1'); normalise them to the
# same '%Y-%m-%d' format the other sources use so the column is consistent.
df_csv_clean['date'] = pd.to_datetime(df_csv_clean['date']).dt.strftime('%Y-%m-%d')

# JSON: Sample API data
json_url = "https://raw.githubusercontent.com/json-iterator/test-data/master/large-file.json" # Using a different JSON URL
df_json = pd.read_json(json_url)
# Only the record count of the JSON file is used as this source's value.
df_json_clean = pd.DataFrame({
    'date': [today],
    'value': [len(df_json)],
    'source': ['json']
})

# REST API: Current weather data
api_url = "https://api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&current_weather=true"
# requests applies no timeout by default; set one so a stalled server cannot
# hang the kernel indefinitely.
response = requests.get(api_url, timeout=30)
if response.status_code == 200:
    current_temp = response.json()['current_weather']['temperature']
    df_api_clean = pd.DataFrame({
        'date': [today],
        'value': [current_temp],
        'source': ['api']
    })
else:
    # Best effort: on a non-200 response keep an empty frame with the right schema.
    df_api_clean = pd.DataFrame(columns=['date', 'value', 'source'])

# Combine all
# Empty frames are left out: concatenating empty entries relies on deprecated
# pandas dtype handling and contributes no rows anyway.
frames = [df_csv_clean, df_json_clean, df_api_clean]
unified_df = pd.concat([frame for frame in frames if not frame.empty], ignore_index=True)
print(unified_df.head(10))

        date  value source
0   2014-7-1   81.0    csv
1   2014-7-2   85.0    csv
2   2014-7-3   82.0    csv
3   2014-7-4   75.0    csv
4   2014-7-5   72.0    csv
5   2014-7-6   74.0    csv
6   2014-7-7   79.0    csv
7   2014-7-8   83.0    csv
8   2014-7-9   80.0    csv
9  2014-7-10   78.0    csv


In [None]:
import pandas as pd
import requests

# Build one product table with columns (product_name, price, source) from an
# Excel workbook, a JSON endpoint and a REST API.

# Step 1: Load the Online Retail sales data
# NOTE: despite the csv_* variable names, this source is an Excel workbook and
# is read with pd.read_excel.
csv_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
df_csv = pd.read_excel(csv_url)

# Clean the retail data: keep Description and UnitPrice, drop incomplete rows,
# and map them onto the shared schema.
df_csv_clean = (
    df_csv[['Description', 'UnitPrice']]
    .dropna()
    .rename(columns={'Description': 'product_name', 'UnitPrice': 'price'})
    .assign(source='csv')
)

# Step 2: Load JSON - Sample Product Test Results
# Using a different JSON URL with a simpler structure
json_url = "https://jsonplaceholder.typicode.com/posts"
# requests applies no timeout by default; set one so a stalled server cannot
# hang the kernel indefinitely.
response = requests.get(json_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()

# Create DataFrame from the JSON data
# Assumes the payload is a list of dictionaries with 'title' and 'id';
# items missing either key are skipped.
# 'title' stands in for product_name and 'id' for a mock price.
json_products = [
    {'product_name': item['title'], 'price': item['id']}
    for item in data
    if 'title' in item and 'id' in item
]

# Passing columns= keeps the schema intact even if no item qualified.
df_json_clean = pd.DataFrame(json_products, columns=['product_name', 'price'])
df_json_clean['source'] = 'json'


# Step 3: Load API - Fake Store Products
api_url = "https://fakestoreapi.com/products"
response = requests.get(api_url, timeout=30)

if response.status_code == 200:
    api_data = response.json()
    df_api_clean = (
        pd.DataFrame(api_data)[['title', 'price']]
        .rename(columns={'title': 'product_name'})
        .assign(source='api')
    )
else:
    # Best effort: on a non-200 response keep an empty frame with the right schema.
    df_api_clean = pd.DataFrame(columns=['product_name', 'price', 'source'])


# Step 4: Combine All Sources
# Empty frames are left out: concatenating empty entries relies on deprecated
# pandas dtype handling and contributes no rows anyway.
frames = [df_csv_clean, df_json_clean, df_api_clean]
unified_df = pd.concat([frame for frame in frames if not frame.empty], ignore_index=True)

# Step 5: Preview Output
print("Unified Product Dataset (First 10 Rows):")
print(unified_df.head(10))

Unified Product Dataset (First 10 Rows):
                          product_name  price source
0   WHITE HANGING HEART T-LIGHT HOLDER   2.55    csv
1                  WHITE METAL LANTERN   3.39    csv
2       CREAM CUPID HEARTS COAT HANGER   2.75    csv
3  KNITTED UNION FLAG HOT WATER BOTTLE   3.39    csv
4       RED WOOLLY HOTTIE WHITE HEART.   3.39    csv
5         SET 7 BABUSHKA NESTING BOXES   7.65    csv
6    GLASS STAR FROSTED T-LIGHT HOLDER   4.25    csv
7               HAND WARMER UNION JACK   1.85    csv
8            HAND WARMER RED POLKA DOT   1.85    csv
9        ASSORTED COLOUR BIRD ORNAMENT   1.69    csv
