In [4]:
!pip install pandas requests -q

In [5]:
import pandas as pd
import requests

csv_url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
df_csv = pd.read_csv(csv_url)

print("CSV Data Sample:")
print(df_csv.head())


CSV Data Sample:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [6]:
json_url = "https://jsonplaceholder.typicode.com/users"
df_json = pd.read_json(json_url)

print("\nJSON Data Sample:")
print(df_json.head())



JSON Data Sample:
   id              name   username                      email  \
0   1     Leanne Graham       Bret          Sincere@april.biz   
1   2      Ervin Howell  Antonette          Shanna@melissa.tv   
2   3  Clementine Bauch   Samantha         Nathan@yesenia.net   
3   4  Patricia Lebsack   Karianne  Julianne.OConner@kory.org   
4   5  Chelsey Dietrich     Kamren   Lucio_Hettinger@annie.ca   

                                             address                  phone  \
0  {'street': 'Kulas Light', 'suite': 'Apt. 556',...  1-770-736-8031 x56442   
1  {'street': 'Victor Plains', 'suite': 'Suite 87...    010-692-6593 x09125   
2  {'street': 'Douglas Extension', 'suite': 'Suit...         1-463-123-4447   
3  {'street': 'Hoeger Mall', 'suite': 'Apt. 692',...      493-170-9623 x156   
4  {'street': 'Skiles Walks', 'suite': 'Suite 351...          (254)954-1289   

         website                                            company  
0  hildegard.org  {'name': 'Romaguera-Crona',

In [7]:
api_url = "https://randomuser.me/api/?results=5"
response = requests.get(api_url)
data = response.json()
df_api = pd.json_normalize(data['results'])

print("\nREST API Data Sample:")
print(df_api.head())



REST API Data Sample:
  gender                           email           phone            cell nat  \
0   male          diego.mora@example.com     953-863-093     656-154-185  ES   
1   male         gautam.nand@example.com      8172361080      7681752215  IN   
2   male      alex.guillaume@example.com  05-81-05-74-67  06-95-36-16-59  FR   
3   male    nelson.patterson@example.com    04-9699-2582    0448-043-406  AU   
4   male  shivansh.prajapati@example.com      9264151850      7565834772  IN   

  name.title name.first  name.last  location.street.number  \
0         Mr      Diego       Mora                    8985   
1         Mr     Gautam       Nand                    5522   
2         Mr       Alex  Guillaume                    6038   
3         Mr     Nelson  Patterson                    7335   
4         Mr   Shivansh  Prajapati                    6034   

  location.street.name  ...  \
0   Avenida de América  ...   
1            Pali Hill  ...   
2       Rue du Village  ...   

In [8]:
# Clean CSV (Iris) data
df_csv_clean = df_csv.rename(columns={'species': 'name'})
df_csv_clean['email'] = None
df_csv_clean['source'] = 'csv'

# Clean JSON (User data)
df_json_clean = df_json[['name', 'email']].copy()
df_json_clean['source'] = 'json'
for col in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    df_json_clean[col] = None

# Clean REST API (Random users)
df_api_clean = pd.DataFrame({
    'name': df_api['name.first'] + ' ' + df_api['name.last'],
    'email': df_api['email'],
    'source': 'api'
})
for col in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']:
    df_api_clean[col] = None


In [9]:
# Define common columns
common_cols = ['name', 'email', 'source',
               'sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Concatenate all sources
unified_df = pd.concat([
    df_csv_clean[common_cols],
    df_json_clean[common_cols],
    df_api_clean[common_cols]
], ignore_index=True)

print("\nUnified Clean Dataset Sample:")
print(unified_df.head(10))



Unified Clean Dataset Sample:
     name email source  sepal_length  sepal_width  petal_length  petal_width
0  setosa  None    csv           5.1          3.5           1.4          0.2
1  setosa  None    csv           4.9          3.0           1.4          0.2
2  setosa  None    csv           4.7          3.2           1.3          0.2
3  setosa  None    csv           4.6          3.1           1.5          0.2
4  setosa  None    csv           5.0          3.6           1.4          0.2
5  setosa  None    csv           5.4          3.9           1.7          0.4
6  setosa  None    csv           4.6          3.4           1.4          0.3
7  setosa  None    csv           5.0          3.4           1.5          0.2
8  setosa  None    csv           4.4          2.9           1.4          0.2
9  setosa  None    csv           4.9          3.1           1.5          0.1


  unified_df = pd.concat([


In [10]:
# 1. Standardization challenge
print("\n1. The most standardization was needed in aligning column names and missing columns across sources.")

# 2. Common merging issues
print("\n2. Common problems include inconsistent schemas, missing values, or nested/unstructured data formats.")

# 3. Importance of central ingestion layer
print("\n3. A central ingestion and transformation layer ensures consistency, reduces duplication, and enables scalability when handling diverse datasets.")



1. The most standardization was needed in aligning column names and missing columns across sources.

2. Common problems include inconsistent schemas, missing values, or nested/unstructured data formats.

3. A central ingestion and transformation layer ensures consistency, reduces duplication, and enables scalability when handling diverse datasets.
