In [4]:
# Step 2: Ingest Data from Multiple Sources
# Each source is loaded into its own raw DataFrame (df_csv, df_json, df_api)
# and the first five rows are printed as a sanity check.
import json  # not used in this cell; kept in case other cells rely on it

import pandas as pd
import requests

# a. Tabular file
# NOTE: despite the "csv" naming, this URL points at an Excel workbook (.xlsx),
# which is why it is read with pd.read_excel rather than pd.read_csv.
csv_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
df_csv = pd.read_excel(csv_url)
print("CSV Data Sample:")
print(df_csv.head())

# b. JSON File
# pandas fetches and parses the JSON document directly from the URL.
json_url = "https://jsonplaceholder.typicode.com/users"
df_json = pd.read_json(json_url)
print("\nJSON Data Sample:")
print(df_json.head())

# c. REST API
api_url = "https://fakestoreapi.com/products"
# Bound the wait so an unresponsive server cannot hang the kernel forever.
response = requests.get(api_url, timeout=30)
# Fail loudly on HTTP 4xx/5xx instead of normalizing an error payload as data.
response.raise_for_status()
data = response.json()
# Flatten nested JSON objects into dotted column names.
df_api = pd.json_normalize(data)
print("\nREST API Data Sample:")
print(df_api.head())


CSV Data Sample:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  

JSON Data Sample:
   id              name   username                      email  \
0   1     Leanne Graham       Bret          Sincere@april.biz   
1   2      Ervin Howell

Identical columns: ['source', 'identifier', 'name', 'description', 'price', 'country']


In [5]:
# Project the retail frame onto the unified schema:
# (source, identifier, name, description, price, country).
df_csv_clean = (
    df_csv[['InvoiceNo', 'Description', 'UnitPrice', 'Country']]
    .rename(columns={
        'InvoiceNo': 'identifier',
        'Description': 'name',
        'UnitPrice': 'price',
        'Country': 'country',
    })
    # Tag the origin; this source has no detailed description available.
    .assign(source='CSV', description=None)
    # Fix the column order shared by all cleaned frames.
    .loc[:, ['source', 'identifier', 'name', 'description', 'price', 'country']]
)

In [6]:
# Project the JSON user records onto the unified schema:
# (source, identifier, name, description, price, country).
df_json_clean = (
    df_json[['id', 'name', 'email']]
    # The user's email stands in for the free-text description field.
    .rename(columns={
        'id': 'identifier',
        'email': 'description',
    })
    .assign(
        source='JSON',
        # Users have no price. Use a float NaN (float64 column) rather than
        # None: an all-None object column makes pd.concat emit a FutureWarning
        # about all-NA entries and, in future pandas versions, would turn the
        # combined numeric `price` column into object dtype.
        price=float('nan'),
        # Placeholder: this cell does not extract a country for users.
        country='N/A',
    )
    # Fix the column order shared by all cleaned frames.
    .loc[:, ['source', 'identifier', 'name', 'description', 'price', 'country']]
)

In [7]:
# Project the product records from the REST API onto the unified schema:
# (source, identifier, name, description, price, country).
df_api_clean = (
    df_api[['id', 'title', 'description', 'price', 'category']]
    .rename(columns={
        'id': 'identifier',
        'title': 'name',
        # Demo-only mapping: there is no real country field, so the product
        # category is stored in the `country` column instead.
        'category': 'country',
    })
    .assign(source='API')
    # Fix the column order shared by all cleaned frames.
    .loc[:, ['source', 'identifier', 'name', 'description', 'price', 'country']]
)

In [8]:
# Stack the three cleaned frames (CSV, JSON, API order) into one table with a
# fresh 0..n-1 index, then preview the first ten rows.
cleaned_frames = [df_csv_clean, df_json_clean, df_api_clean]
df_unified = pd.concat(cleaned_frames, ignore_index=True)
print("Unified DataFrame Sample:", df_unified.head(10), sep="\n")

Unified DataFrame Sample:
  source identifier                                 name description  price  \
0    CSV     536365   WHITE HANGING HEART T-LIGHT HOLDER        None   2.55   
1    CSV     536365                  WHITE METAL LANTERN        None   3.39   
2    CSV     536365       CREAM CUPID HEARTS COAT HANGER        None   2.75   
3    CSV     536365  KNITTED UNION FLAG HOT WATER BOTTLE        None   3.39   
4    CSV     536365       RED WOOLLY HOTTIE WHITE HEART.        None   3.39   
5    CSV     536365         SET 7 BABUSHKA NESTING BOXES        None   7.65   
6    CSV     536365    GLASS STAR FROSTED T-LIGHT HOLDER        None   4.25   
7    CSV     536366               HAND WARMER UNION JACK        None   1.85   
8    CSV     536366            HAND WARMER RED POLKA DOT        None   1.85   
9    CSV     536367        ASSORTED COLOUR BIRD ORNAMENT        None   1.69   

          country  
0  United Kingdom  
1  United Kingdom  
2  United Kingdom  
3  United Kingdom  
4  U

  df_unified = pd.concat([df_csv_clean, df_json_clean, df_api_clean], ignore_index=True)
