In [1]:
# pip install pandas mysql-connector-python

# ETL Process for Amazon Dataset

## Extract - Extracting data from MongoDB(Semi-structured)

In [2]:
#Import neccessary libraries
import pymongo
import pandas as pd
import mysql.connector

In [3]:
#connect to MongoDB server
client = pymongo.MongoClient('mongodb://dap:dap@localhost:27017/?authMechanism=DEFAULT', serverSelectionTimeoutMS=50000)
database_name = 'DAP_Project'
collection_list = client.list_database_names()
collection_list


['DAP_Project', 'admin', 'config', 'local']

In [4]:
# Check if database name exist - 
    #if yes - then it will access collection and load data in Amazon_data
    #if no - it will raise ValueError
    
if database_name in collection_list:
    db = client[database_name]
    Amazon_collection = db['Amazon']
    Amazon_data = Amazon_collection.find()
else:
    raise ValueError(f"Database {database_name} not found in the collection list")

In [5]:
#convert data into pandas DataFrame
amazon_df = pd.DataFrame(list(Amazon_data))

In [6]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, 4GB RAM, 64GB ...","₹8,499",3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...","₹79,999",4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 8GB...","₹19,999",3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, 4GB RAM, 64GB ...","₹8,499",3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...","₹21,999",3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black 2GB RAM 32GB Storage)...,"₹7,480",4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, 4GB RAM, 64GB St...","₹8,499",3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, 6GB RAM+128GB St...","₹15,499",4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...","₹18,999",4.1 out of 5 stars,914 ratings,In stock


## Transform - Cleaning of data

In [7]:
#checking record for reviews column for 0th index
amazon_df['reviews'][0]

'3,886 ratings'

In [8]:
# Dataframe contains blank spaces so replace blank with none.

amazon_df['price'] = amazon_df['price'].replace('', None)
amazon_df['reviews'] = amazon_df['reviews'].replace('', None)
amazon_df['availability'] = amazon_df['availability'].replace('', None)



In [9]:
# Check if null values exists
amazon_df.isnull().sum()

_id              0
title            0
price            1
rating           0
reviews         22
availability     2
dtype: int64

In [10]:
amazon_df['price'] = amazon_df['price'].str.replace('₹', '').str.replace(',', '').astype(float)

In [11]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, 4GB RAM, 64GB ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 8GB...",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, 4GB RAM, 64GB ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black 2GB RAM 32GB Storage)...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, 4GB RAM, 64GB St...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, 6GB RAM+128GB St...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [12]:
amazon_df['title'] = amazon_df['title'].str.replace('(\d+GB) RAM.*(\d+GB|\d+TB) Storage', '')

  amazon_df['title'] = amazon_df['title'].str.replace('(\d+GB) RAM.*(\d+GB|\d+TB) Storage', '')


In [13]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [14]:
amazon_df = amazon_df.apply(lambda x: x.str.strip() if x.dtype == 'object' and x.name != '_id' else x)


In [15]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [16]:
amazon_df = amazon_df.reset_index(drop=True)

In [17]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [18]:
amazon_df['reviews'].fillna("0 ratings", inplace=True)

In [19]:
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [20]:
# Find most frequently occurring rating as average
frequency_rating = amazon_df['rating'].mode()[0]
frequency_rating

'4.1 out of 5 stars'

In [21]:
# Identifying the garbage rating and replacing it with average ration i.e. '4.5 out of 5 starts'
amazon_df['rating'] = amazon_df['rating'].replace('Previous page', frequency_rating)
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5 out of 5 stars,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6 out of 5 stars,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6 out of 5 stars,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1 out of 5 stars,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9 out of 5 stars,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2 out of 5 stars,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1 out of 5 stars,914 ratings,In stock


In [22]:
#Unique values for rating column
amazon_df['rating'].unique()

array(['3.9 out of 5 stars', '4.5 out of 5 stars', '3.6 out of 5 stars',
       '4.1 out of 5 stars', '4.0 out of 5 stars', '4.3 out of 5 stars',
       '4.4 out of 5 stars', '4.8 out of 5 stars', '4.2 out of 5 stars'],
      dtype=object)

In [23]:
# Extracting the 1st 3 characters of rating column to get rid of repetative string 'out of 5 stars' numeric value
amazon_df['rating'] = amazon_df['rating'].apply(lambda x: float(str(x[:3]))) 
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1,914 ratings,In stock


In [24]:
#Convert reviews column in string format
amazon_df['reviews'] = amazon_df['reviews'].apply(lambda x: str(x))
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,"3,886 ratings",In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5,699 ratings,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6,18 ratings,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,"3,886 ratings",In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6,18 ratings,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1,"322,513 ratings",In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9,"3,886 ratings",In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2,"16,097 ratings",In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1,914 ratings,In stock


In [25]:
# Delete ratings word and comma from reviews
amazon_df['reviews'] = amazon_df['reviews'].apply(lambda x: x[:-7])
amazon_df['reviews'] = amazon_df['reviews'].apply(lambda x: x.replace(",", ""))
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,3886,In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5,699,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6,18,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,3886,In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6,18,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1,322513,In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9,3886,In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2,16097,In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1,914,In stock


In [26]:
#Covert reviws in int format
amazon_df['reviews'] = amazon_df['reviews'].apply(lambda x: int(x))
amazon_df

Unnamed: 0,_id,title,price,rating,reviews,availability
0,644c3ccb499890cfbaf2a400,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,3886,In stock
1,644c3ccb499890cfbaf2a401,"Samsung Galaxy S23 5G (Phantom Black, 8GB, 256...",79999.0,4.5,699,In stock
2,644c3ccb499890cfbaf2a402,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, )",19999.0,3.6,18,In stock
3,644c3ccb499890cfbaf2a403,"Samsung Galaxy M04 Light Green, | Upto 8GB RA...",8499.0,3.9,3886,In stock
4,644c3ccb499890cfbaf2a404,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, )",21999.0,3.6,18,In stock
...,...,...,...,...,...,...
529,644c3ccb499890cfbaf2a611,Redmi 9A (Midnight Black ) | 2GHz Octa-core He...,7480.0,4.1,322513,In stock
530,644c3ccb499890cfbaf2a612,"Samsung Galaxy M04 Dark Blue, | Upto 8GB RAM ...",8499.0,3.9,3886,In stock
531,644c3ccb499890cfbaf2a613,"realme narzo 50 (Speed Black, ) Helio G96 Proc...",15499.0,4.2,16097,In stock
532,644c3ccb499890cfbaf2a614,"Oppo A78 5G (Glowing Black, 8GB RAM, 128 Stora...",18999.0,4.1,914,In stock


## Load - Loading the cleaned data in mysql (Structured)

In [27]:
# Install pymysql
# pip install pymysql

In [31]:
# Import necessary libraries
import pymysql
import pandas as pd
from sqlalchemy import create_engine

# create database connection string
db_connection_str = 'mysql+pymysql://root:root@localhost:3306/dap_project'

# create sqlalchemy engine
engine = create_engine(db_connection_str)

# create database if it doesn't exist
with engine.connect() as con:
    con.execute('CREATE DATABASE IF NOT EXISTS dap_project')

# switch to database
with engine.connect() as con:
    con.execute('USE dap_project')

# load data to mysql table
amazon_df.to_sql(name='amazon_tb', con=engine, if_exists='replace', index=False)


534

In [34]:
# check the table
pd.read_sql('SHOW TABLES', engine)

Unnamed: 0,Tables_in_dap_project
0,amazon_tb


#  ETL Process for Flipkart Dataset

## Extract - Extracting data from MongoDB(Semi-structured)

In [35]:
#connect to MongoDB server

#client = pymongo.MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=50000)
#database_name = 'DAP_Project'
#collection_list = client.list_database_names()
#collection_list

In [36]:
# Check if database name exist - 
    #if yes - then it will access collection and load data in Amazon_data
    #if no - it will raise ValueError
    
if database_name in collection_list:
    db = client[database_name]
    Flipkart_collection = db['Flipkart']
    Flipkart_data = Flipkart_collection.find()
else:
    raise ValueError(f"Database {database_name} not found in the collection list")

In [37]:
# Convert data into pandas DataFrame
flipkart_df =pd.DataFrame(list(Flipkart_data))

In [38]:
flipkart_df

Unnamed: 0,_id,title,price,rating,camera_rating,rating_count
0,644c43f1499890cfbaf2a617,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB) (4...","₹9,699",4.4,,"1,39,579 Ratings &"
1,644c43f1499890cfbaf2a618,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB) (4...","₹9,699",4.4,,"1,39,579 Ratings &"
2,644c43f1499890cfbaf2a619,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB) (4...","₹9,699",4.4,,"1,39,579 Ratings &"
3,644c43f1499890cfbaf2a61a,"SAMSUNG Galaxy F13 (Waterfall Blue, 128 GB) (...","₹10,699",4.4,,"1,39,579 Ratings &"
4,644c43f1499890cfbaf2a61b,"POCO C50 (Royal Blue, 32 GB) (2 GB RAM)","₹6,499",4.3,,"18,639 Ratings &"
...,...,...,...,...,...,...
979,644c43f1499890cfbaf2a9ea,"realme C31 (Light Silver, 32 GB) (3 GB RAM)","₹9,299",4.5,,"47,541 Ratings &"
980,644c43f1499890cfbaf2a9eb,itel iPower 430 Triple sim (Orange),"₹1,499",3.9,,53 Ratings &
981,644c43f1499890cfbaf2a9ec,Kechaoda K28 (Coffee),"₹1,399",4.1,,"2,922 Ratings &"
982,644c43f1499890cfbaf2a9ed,LAVA A7 (Sea Green),"₹1,799",4.2,,"6,984 Ratings &"


## Transform - Cleaning of data

In [39]:
#Chcek null values
flipkart_df.isnull().sum()

_id              0
title            0
price            0
rating           0
camera_rating    0
rating_count     0
dtype: int64

In [40]:
# Dataframe contains blank spaces so replace blank with none.
flipkart_df['price'] = flipkart_df['price'].replace('', None)
flipkart_df['rating'] = flipkart_df['rating'].replace('', None)
flipkart_df['camera_rating'] = flipkart_df['camera_rating'].replace('', None)
flipkart_df['rating_count'] = flipkart_df['rating_count'].replace('', None)

In [41]:
flipkart_df.isnull().sum()

_id                0
title              0
price              0
rating            32
camera_rating    984
rating_count      32
dtype: int64

In [42]:
#drop column camera_rating as it does not have any record.
flipkart_df = flipkart_df.drop(["camera_rating"], axis=1)

In [43]:
flipkart_df

Unnamed: 0,_id,title,price,rating,rating_count
0,644c43f1499890cfbaf2a617,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB) (4...","₹9,699",4.4,"1,39,579 Ratings &"
1,644c43f1499890cfbaf2a618,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB) (4...","₹9,699",4.4,"1,39,579 Ratings &"
2,644c43f1499890cfbaf2a619,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB) (4...","₹9,699",4.4,"1,39,579 Ratings &"
3,644c43f1499890cfbaf2a61a,"SAMSUNG Galaxy F13 (Waterfall Blue, 128 GB) (...","₹10,699",4.4,"1,39,579 Ratings &"
4,644c43f1499890cfbaf2a61b,"POCO C50 (Royal Blue, 32 GB) (2 GB RAM)","₹6,499",4.3,"18,639 Ratings &"
...,...,...,...,...,...
979,644c43f1499890cfbaf2a9ea,"realme C31 (Light Silver, 32 GB) (3 GB RAM)","₹9,299",4.5,"47,541 Ratings &"
980,644c43f1499890cfbaf2a9eb,itel iPower 430 Triple sim (Orange),"₹1,499",3.9,53 Ratings &
981,644c43f1499890cfbaf2a9ec,Kechaoda K28 (Coffee),"₹1,399",4.1,"2,922 Ratings &"
982,644c43f1499890cfbaf2a9ed,LAVA A7 (Sea Green),"₹1,799",4.2,"6,984 Ratings &"


In [44]:
flipkart_df.isnull().sum()

_id              0
title            0
price            0
rating          32
rating_count    32
dtype: int64

In [45]:
# Fill null values with 0
flipkart_df.fillna(0, inplace=True)

In [46]:
flipkart_df.isnull().sum()

_id             0
title           0
price           0
rating          0
rating_count    0
dtype: int64

In [47]:
#Cleaning rating_count colum
flipkart_df["rating_count"] = flipkart_df["rating_count"].str.rstrip('Ratings &')

In [48]:
flipkart_df

Unnamed: 0,_id,title,price,rating,rating_count
0,644c43f1499890cfbaf2a617,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB) (4...","₹9,699",4.4,139579
1,644c43f1499890cfbaf2a618,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB) (4...","₹9,699",4.4,139579
2,644c43f1499890cfbaf2a619,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB) (4...","₹9,699",4.4,139579
3,644c43f1499890cfbaf2a61a,"SAMSUNG Galaxy F13 (Waterfall Blue, 128 GB) (...","₹10,699",4.4,139579
4,644c43f1499890cfbaf2a61b,"POCO C50 (Royal Blue, 32 GB) (2 GB RAM)","₹6,499",4.3,18639
...,...,...,...,...,...
979,644c43f1499890cfbaf2a9ea,"realme C31 (Light Silver, 32 GB) (3 GB RAM)","₹9,299",4.5,47541
980,644c43f1499890cfbaf2a9eb,itel iPower 430 Triple sim (Orange),"₹1,499",3.9,53
981,644c43f1499890cfbaf2a9ec,Kechaoda K28 (Coffee),"₹1,399",4.1,2922
982,644c43f1499890cfbaf2a9ed,LAVA A7 (Sea Green),"₹1,799",4.2,6984


In [58]:
flipkart_df['rating_count'] = flipkart_df['rating_count'].str.replace(',','').astype(float)

In [59]:
flipkart_df

Unnamed: 0,_id,title,price,rating,rating_count
0,644c43f1499890cfbaf2a617,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB) (4...",9699.0,4.4,139579.0
1,644c43f1499890cfbaf2a618,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB) (4...",9699.0,4.4,139579.0
2,644c43f1499890cfbaf2a619,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB) (4...",9699.0,4.4,139579.0
3,644c43f1499890cfbaf2a61a,"SAMSUNG Galaxy F13 (Waterfall Blue, 128 GB) (...",10699.0,4.4,139579.0
4,644c43f1499890cfbaf2a61b,"POCO C50 (Royal Blue, 32 GB) (2 GB RAM)",6499.0,4.3,18639.0
...,...,...,...,...,...
979,644c43f1499890cfbaf2a9ea,"realme C31 (Light Silver, 32 GB) (3 GB RAM)",9299.0,4.5,47541.0
980,644c43f1499890cfbaf2a9eb,itel iPower 430 Triple sim (Orange),1499.0,3.9,53.0
981,644c43f1499890cfbaf2a9ec,Kechaoda K28 (Coffee),1399.0,4.1,2922.0
982,644c43f1499890cfbaf2a9ed,LAVA A7 (Sea Green),1799.0,4.2,6984.0


In [60]:
flipkart_df['price'] = flipkart_df['price'].str.replace('₹','').str.replace(',','').astype(float)

AttributeError: Can only use .str accessor with string values!

In [61]:
flipkart_df

Unnamed: 0,_id,title,price,rating,rating_count
0,644c43f1499890cfbaf2a617,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB) (4...",9699.0,4.4,139579.0
1,644c43f1499890cfbaf2a618,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB) (4...",9699.0,4.4,139579.0
2,644c43f1499890cfbaf2a619,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB) (4...",9699.0,4.4,139579.0
3,644c43f1499890cfbaf2a61a,"SAMSUNG Galaxy F13 (Waterfall Blue, 128 GB) (...",10699.0,4.4,139579.0
4,644c43f1499890cfbaf2a61b,"POCO C50 (Royal Blue, 32 GB) (2 GB RAM)",6499.0,4.3,18639.0
...,...,...,...,...,...
979,644c43f1499890cfbaf2a9ea,"realme C31 (Light Silver, 32 GB) (3 GB RAM)",9299.0,4.5,47541.0
980,644c43f1499890cfbaf2a9eb,itel iPower 430 Triple sim (Orange),1499.0,3.9,53.0
981,644c43f1499890cfbaf2a9ec,Kechaoda K28 (Coffee),1399.0,4.1,2922.0
982,644c43f1499890cfbaf2a9ed,LAVA A7 (Sea Green),1799.0,4.2,6984.0


## Load - Loading the cleaned data in mysql (Structured)

In [51]:
import pymysql
import pandas
from sqlalchemy import create_engine 

In [62]:
# load data to mysql table
flipkart_df.to_sql(name='flipkart_tb', con=engine, if_exists='replace', index=False)

984

In [63]:
# check the table
pd.read_sql('SHOW TABLES', engine)

Unnamed: 0,Tables_in_dap_project
0,amazon_tb
1,flipkart_tb


# ETL process for other Mobile_data

## Extract - Extracting data from MongoDB(Semi-structured) 

In [64]:
##client = pymongo.MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=50000)
##database_name = 'DAP_Project'
##collection_list = client.list_database_names()
##collection_list

In [65]:
# Check if database name exist - 
    #if yes - then it will access collection and load data in Amazon_data
    #if no - it will raise ValueError
    
if database_name in collection_list:
    db = client[database_name]
    Mobile_collection = db['Mobile_data']
    Mobile_data = Mobile_collection.find()
else:
    raise ValueError(f"Database {database_name} not found in the collection list")

In [66]:
##convert data into pandas DataFrame
mobile_df = pd.DataFrame(list(Mobile_data))

In [67]:
mobile_df

Unnamed: 0,_id,asin,brand,title,url,image,rating,reviewUrl,totalReviews,price,originalPrice
0,644c43f1499890cfbaf2a9f0,B0000SX2UC,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.00,0.00
1,644c43f1499890cfbaf2a9f1,B0009N5L7K,Motorola,Motorola I265 phone,https://www.amazon.com/Motorola-i265-I265-phon...,https://m.media-amazon.com/images/I/419WBAVDAR...,3.0,https://www.amazon.com/product-reviews/B0009N5L7K,7,49.95,0.00
2,644c43f1499890cfbaf2a9f2,B000SKTZ0S,Motorola,MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE C...,https://www.amazon.com/MOTOROLA-C168i-CINGULAR...,https://m.media-amazon.com/images/I/71b+q3ydkI...,2.7,https://www.amazon.com/product-reviews/B000SKTZ0S,22,99.99,0.00
3,644c43f1499890cfbaf2a9f3,B001AO4OUC,Motorola,Motorola i335 Cell Phone Boost Mobile,https://www.amazon.com/Motorola-i335-Phone-Boo...,https://m.media-amazon.com/images/I/710UO8gdT+...,3.3,https://www.amazon.com/product-reviews/B001AO4OUC,21,0.00,0.00
4,644c43f1499890cfbaf2a9f4,B001DCJAJG,Motorola,Motorola V365 no contract cellular phone AT&T,https://www.amazon.com/Motorola-V365-contract-...,https://m.media-amazon.com/images/I/61LYNCVrrK...,3.1,https://www.amazon.com/product-reviews/B001DCJAJG,12,149.99,0.00
...,...,...,...,...,...,...,...,...,...,...,...
715,644c43f1499890cfbaf2acbb,B07ZPKZSSC,Apple,"Apple iPhone 11 Pro, 64GB, Fully Unlocked - Sp...",https://www.amazon.com/Apple-iPhone-64GB-Fully...,https://m.media-amazon.com/images/I/41wDuEW9iZ...,1.0,https://www.amazon.com/product-reviews/B07ZPKZSSC,1,949.00,0.00
716,644c43f1499890cfbaf2acbc,B07ZQSGP53,Xiaomi,"Xiaomi Redmi Note 8, 32GB/3GB RAM 6.3"" FHD+ Di...",https://www.amazon.com/Xiaomi-Display-Snapdrag...,https://m.media-amazon.com/images/I/41foh4FKHE...,4.6,https://www.amazon.com/product-reviews/B07ZQSGP53,3,150.96,0.00
717,644c43f1499890cfbaf2acbd,B081H6STQQ,Sony,Sony Xperia 1 Unlocked Smartphone and WH1000XM...,https://www.amazon.com/Sony-Smartphone-WH1000X...,https://m.media-amazon.com/images/I/51zZTAXZTP...,4.5,https://www.amazon.com/product-reviews/B081H6STQQ,70,948.00,0.00
718,644c43f1499890cfbaf2acbe,B081TJFVCJ,Apple,"Apple iPhone X, 64GB, Gray - Fully Unlocked (R...",https://www.amazon.com/Apple-iPhone-64GB-Gray-...,https://m.media-amazon.com/images/I/71yMgOenT5...,5.0,https://www.amazon.com/product-reviews/B081TJFVCJ,1,478.97,0.00


## Transform - Cleaning of data

In [68]:
#Check if null values exists
mobile_df.isna().sum()


_id              0
asin             0
brand            4
title            0
url              0
image            0
rating           0
reviewUrl        0
totalReviews     0
price            0
originalPrice    0
dtype: int64

In [69]:
# Drop records having values null
mobile_df = mobile_df.dropna()

In [70]:
mobile_df.isna().sum()

_id              0
asin             0
brand            0
title            0
url              0
image            0
rating           0
reviewUrl        0
totalReviews     0
price            0
originalPrice    0
dtype: int64

In [71]:
# Dropped columns 'reviewUrl','url','image' for cleaning data as per requirement
mobile_df = mobile_df.drop(['reviewUrl','url','image'],axis=1)

In [72]:
mobile_df

Unnamed: 0,_id,asin,brand,title,rating,totalReviews,price,originalPrice
1,644c43f1499890cfbaf2a9f1,B0009N5L7K,Motorola,Motorola I265 phone,3.0,7,49.95,0.00
2,644c43f1499890cfbaf2a9f2,B000SKTZ0S,Motorola,MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE C...,2.7,22,99.99,0.00
3,644c43f1499890cfbaf2a9f3,B001AO4OUC,Motorola,Motorola i335 Cell Phone Boost Mobile,3.3,21,0.00,0.00
4,644c43f1499890cfbaf2a9f4,B001DCJAJG,Motorola,Motorola V365 no contract cellular phone AT&T,3.1,12,149.99,0.00
5,644c43f1499890cfbaf2a9f5,B001GQ3DJM,Nokia,Nokia 1680 Black Phone (T-Mobile),2.7,3,0.00,0.00
...,...,...,...,...,...,...,...,...
715,644c43f1499890cfbaf2acbb,B07ZPKZSSC,Apple,"Apple iPhone 11 Pro, 64GB, Fully Unlocked - Sp...",1.0,1,949.00,0.00
716,644c43f1499890cfbaf2acbc,B07ZQSGP53,Xiaomi,"Xiaomi Redmi Note 8, 32GB/3GB RAM 6.3"" FHD+ Di...",4.6,3,150.96,0.00
717,644c43f1499890cfbaf2acbd,B081H6STQQ,Sony,Sony Xperia 1 Unlocked Smartphone and WH1000XM...,4.5,70,948.00,0.00
718,644c43f1499890cfbaf2acbe,B081TJFVCJ,Apple,"Apple iPhone X, 64GB, Gray - Fully Unlocked (R...",5.0,1,478.97,0.00


In [73]:
# Coverting price in dollar into rupees
exchange_rate = 80
mobile_df['Price'] = mobile_df['price'] * exchange_rate
mobile_df['original_price'] = mobile_df['originalPrice']*exchange_rate

In [74]:
mobile_df

Unnamed: 0,_id,asin,brand,title,rating,totalReviews,price,originalPrice,Price,original_price
1,644c43f1499890cfbaf2a9f1,B0009N5L7K,Motorola,Motorola I265 phone,3.0,7,49.95,0.00,3996.0,0.0
2,644c43f1499890cfbaf2a9f2,B000SKTZ0S,Motorola,MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE C...,2.7,22,99.99,0.00,7999.2,0.0
3,644c43f1499890cfbaf2a9f3,B001AO4OUC,Motorola,Motorola i335 Cell Phone Boost Mobile,3.3,21,0.00,0.00,0.0,0.0
4,644c43f1499890cfbaf2a9f4,B001DCJAJG,Motorola,Motorola V365 no contract cellular phone AT&T,3.1,12,149.99,0.00,11999.2,0.0
5,644c43f1499890cfbaf2a9f5,B001GQ3DJM,Nokia,Nokia 1680 Black Phone (T-Mobile),2.7,3,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
715,644c43f1499890cfbaf2acbb,B07ZPKZSSC,Apple,"Apple iPhone 11 Pro, 64GB, Fully Unlocked - Sp...",1.0,1,949.00,0.00,75920.0,0.0
716,644c43f1499890cfbaf2acbc,B07ZQSGP53,Xiaomi,"Xiaomi Redmi Note 8, 32GB/3GB RAM 6.3"" FHD+ Di...",4.6,3,150.96,0.00,12076.8,0.0
717,644c43f1499890cfbaf2acbd,B081H6STQQ,Sony,Sony Xperia 1 Unlocked Smartphone and WH1000XM...,4.5,70,948.00,0.00,75840.0,0.0
718,644c43f1499890cfbaf2acbe,B081TJFVCJ,Apple,"Apple iPhone X, 64GB, Gray - Fully Unlocked (R...",5.0,1,478.97,0.00,38317.6,0.0


In [75]:
# Dropping columns which was in dollar currency
mobile_df = mobile_df.drop(['price','originalPrice'],axis=1)

In [76]:
mobile_df

Unnamed: 0,_id,asin,brand,title,rating,totalReviews,Price,original_price
1,644c43f1499890cfbaf2a9f1,B0009N5L7K,Motorola,Motorola I265 phone,3.0,7,3996.0,0.0
2,644c43f1499890cfbaf2a9f2,B000SKTZ0S,Motorola,MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE C...,2.7,22,7999.2,0.0
3,644c43f1499890cfbaf2a9f3,B001AO4OUC,Motorola,Motorola i335 Cell Phone Boost Mobile,3.3,21,0.0,0.0
4,644c43f1499890cfbaf2a9f4,B001DCJAJG,Motorola,Motorola V365 no contract cellular phone AT&T,3.1,12,11999.2,0.0
5,644c43f1499890cfbaf2a9f5,B001GQ3DJM,Nokia,Nokia 1680 Black Phone (T-Mobile),2.7,3,0.0,0.0
...,...,...,...,...,...,...,...,...
715,644c43f1499890cfbaf2acbb,B07ZPKZSSC,Apple,"Apple iPhone 11 Pro, 64GB, Fully Unlocked - Sp...",1.0,1,75920.0,0.0
716,644c43f1499890cfbaf2acbc,B07ZQSGP53,Xiaomi,"Xiaomi Redmi Note 8, 32GB/3GB RAM 6.3"" FHD+ Di...",4.6,3,12076.8,0.0
717,644c43f1499890cfbaf2acbd,B081H6STQQ,Sony,Sony Xperia 1 Unlocked Smartphone and WH1000XM...,4.5,70,75840.0,0.0
718,644c43f1499890cfbaf2acbe,B081TJFVCJ,Apple,"Apple iPhone X, 64GB, Gray - Fully Unlocked (R...",5.0,1,38317.6,0.0


# Load - Loading the cleaned data in mysql (Structured)

In [85]:
# load data to mysql table
flipkart_df.to_sql(name='flipkart_tb', con=engine, if_exists='replace', index=False)

984

In [86]:
# check the table
pd.read_sql('SHOW TABLES', engine)

Unnamed: 0,Tables_in_dap_project
0,amazon_tb
1,flipkart_tb
2,mobile_tb
