# Installation and Importing libraries

In [1]:
#You need to start with installing Faker and psycopg2
!pip install Faker
!pip install psycopg2

Collecting Faker
  Downloading Faker-19.8.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Faker
Successfully installed Faker-19.8.0


In [2]:
#Standard library
import math
import os
import random
from datetime import datetime, timedelta
from urllib.parse import quote_plus

#Misc. libraries to work with pandas and others
import numpy as np
import pandas as pd

#Libraries to help with SQL connection
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine

#Faker library for synthetic data generation in Indian context
from faker import Faker
fake = Faker('en_IN')

In [3]:
#Quick smoke test of the Faker providers: one sample per line
for sample in (fake.email(), fake.country(), fake.name(), fake.text()):
    print(sample)

#Latitude and longitude are shown together on a single line
coordinates = (fake.latitude(), fake.longitude())
print(*coordinates)

print(fake.url())

indrajit17@example.org
Namibia
Divij Salvi
Dolores expedita voluptatum enim sapiente. Qui nisi ut voluptatum asperiores facilis.
Quis porro blanditiis a itaque.
Fugit labore animi.
-38.768192 13.687595
http://kalla-hora.com/


# Customer Table

In [4]:
#Column names of the customer_master table, in table order.
cust_fields = [
    'customerid',
    'name',
    'address',
    'city',
    'state',
    'pincode',
    'update_timestamp',
]

#States mapped to the cities that can be generated for them.
#The insertion order of the states is kept, since the random state pick iterates over the keys.
states = dict([
    ('Bihar', ['Patna', 'Gaya', 'Bhagalpur', 'Muzaffarpur']),
    ('Gujarat', ['Ahmedabad', 'Rajkot', 'Surat', 'Vadodara']),
    ('Karnataka', ['Bangalore', 'Belgaum', 'Mangalore', 'Mysore']),
    ('Maharashtra', ['Mumbai', 'Pune', 'Nagpur', 'Thane']),
    ('Odisha', ['Bhubaneswar', 'Cuttack', 'Rourkela', 'Berhampur']),
    ('Rajasthan', ['Jaipur', 'Jodhpur', 'Kota', 'Bikaner']),
])

In [5]:
#Generating the 1000 fake rows.

#Use a seed like this to regenerate the same data.
Faker.seed(0)

#Every customer is collected as one record and the dataframe is built in a single step.
#Writing cells through chained indexing (df[col][i] = value) triggers chained-assignment warnings
#and does not modify the dataframe at all when pandas Copy-on-Write is active.
customer_records = []
for i in range(1,1001):     #To keep it simple, the customerid runs from 1 to 1000 (1001 is excluded by range)
    #The Faker calls are made in a fixed order (name, address, state, city, pincode, timestamp)
    #so that the seed above always yields the same rows.
    name = fake.name()                                  #Generates a fake name.
    address = fake.street_address()                     #Generates a fake address.
    state = fake.random_element(states.keys())          #Pick one of the keys of the states dictionary.
    city = fake.random_element(states[state])           #Once the state is selected, pick a city from its list.
    pincode = fake.postcode()                           #Generates a fake postcode.
    update_timestamp = fake.date_time_this_year()       #Generates a fake timestamp.
    customer_records.append({
        'customerid': i,
        'name': name,
        'address': address,
        'city': city,
        'state': state,
        'pincode': pincode,
        'update_timestamp': update_timestamp,
    })

#dtype=object keeps the values as plain Python objects, exactly as in a cell-by-cell fill of an empty frame.
customer_master = pd.DataFrame(customer_records, columns = cust_fields, index = range(1,1001), dtype = object)

In [6]:
#Preview the first five generated customers as a sanity check.
customer_master.head()

Unnamed: 0,customerid,name,address,city,state,pincode,update_timestamp
1,1,Tara Walia,H.No. 487\nSrinivasan Marg,Mysore,Karnataka,938242,2023-02-06 19:53:12
2,2,Hazel Sastry,"H.No. 11, Keer Path",Bhubaneswar,Odisha,659387,2023-06-21 22:12:04
3,3,Biju Swaminathan,"H.No. 609, Thaker Chowk",Belgaum,Karnataka,139332,2023-07-30 21:01:59
4,4,Divij Baria,71/48\nSastry Road,Bhagalpur,Bihar,989471,2023-08-20 13:47:46
5,5,Samar Mani,"H.No. 23, Babu Street",Rourkela,Odisha,112201,2023-09-29 14:09:39


In [7]:
#Count the distinct values per column (customerid is expected to be unique; names may repeat).
customer_master.nunique()

customerid          1000
name                 993
address             1000
city                  24
state                  6
pincode             1000
update_timestamp    1000
dtype: int64

In [8]:
#Count the missing values per column to confirm that every field was filled.
customer_master.isna().sum()

customerid          0
name                0
address             0
city                0
state               0
pincode             0
update_timestamp    0
dtype: int64

# Product Table

In [9]:
#Column names of the product_master table, in table order.
product_fields = [
    'productid',
    'productcode',
    'productname',
    'sku',
    'rate',
    'isactive',
]

#Brands and product types; a product name is one brand combined with one type.
product_brands = [
    'Aashirwad', 'Himalaya', 'Tata',
    'Reliance', 'Balaji', 'MDH',
    'Dabur', 'Patanjali', 'Amul',
]
product_types = [
    'Aata', 'Oil', 'Shampoo',
    'Rice', 'Salt', 'Cream',
]

In [10]:
#We would create a df with 100 products.

#Two random sources are used below, so both are seeded:
#Faker drives productcode/sku/rate, the standard random module drives productname/isactive.
Faker.seed(0)
random.seed(0)

#Every product is collected as one record and the dataframe is built in a single step.
#Writing cells through chained indexing (df[col][i] = value) triggers chained-assignment warnings
#and does not modify the dataframe at all when pandas Copy-on-Write is active.
product_records = []
for i in range(1,101):      #For convenience the productid runs from 1 to 100.
    productname = random.choice(product_brands) + ' ' + random.choice(product_types)   #Product name is a combination of brand and type.
    productcode = fake.bothify('?##')               #bothify generates a code in the format of 'Letter*num*num'
    sku = str(fake.random_int(1,8)) + 'KG'          #Generating fake sku here.
    rate = fake.random_int(50,1000)                 #Generating fake rate here.
    isactive = random.choice([True, False])         #Randomly mark the product active/inactive, as a real boolean rather than text.
    product_records.append({
        'productid': i,
        'productcode': productcode,
        'productname': productname,
        'sku': sku,
        'rate': rate,
        'isactive': isactive,
    })

#dtype=object keeps the values as plain Python objects, exactly as in a cell-by-cell fill of an empty frame.
product_master = pd.DataFrame(product_records, columns = product_fields, index = range(1,101), dtype = object)

In [11]:
#Display the generated products; head(100) covers all 100 rows of the dataframe.
product_master.head(100)

Unnamed: 0,productid,productcode,productname,sku,rate,isactive
1,1,c66,Amul Aata,5KG,573,True
2,2,Y76,Balaji Oil,5KG,538,True
3,3,n59,Reliance Shampoo,3KG,338,False
4,4,N21,Reliance Cream,5KG,981,False
5,5,j89,MDH Salt,5KG,151,False
...,...,...,...,...,...,...
96,96,Y49,Himalaya Cream,3KG,912,True
97,97,s69,Aashirwad Oil,8KG,117,False
98,98,c18,Aashirwad Aata,2KG,280,False
99,99,t20,Reliance Aata,1KG,827,False


# Order Details

In [12]:
#Column names of the order_details table, in table order.
order_detail_fields = [
    'orderid',
    'customerid',
    'order_status_update_timestamp',
    'order_status',
]

In [13]:
#One empty dataframe per order status, each indexed 1..20000 and using the order_details columns.
orders_Received, orders_InProgress, orders_Delivered = (
    pd.DataFrame(columns=order_detail_fields, index = range(1,20001))
    for _ in range(3)
)

In [14]:
#For each order status we collect plain Python lists and build the dataframe in a single step.
#Writing cells through chained indexing (df[col][i] = value) triggers chained-assignment warnings
#and does not modify the dataframe at all when pandas Copy-on-Write is active.
order_ids = list(range(1,20001))
customer_pool = customer_master['customerid']

#'Received': every order gets a random existing customer and a random timestamp from this year.
received_customers = []
received_timestamps = []
for _ in order_ids:
    received_customers.append(fake.random_element(customer_pool))
    received_timestamps.append(fake.date_time_this_year())

#'InProgress' and 'Delivered': same order and customer, each status 1 second to 24 hours (86400 s)
#after the previous one. The three passes run one after another so the Faker calls happen in the
#same sequence as three separate loops would make them.
inprogress_timestamps = [ts + timedelta(seconds=fake.random_int(1, 86400)) for ts in received_timestamps]
delivered_timestamps = [ts + timedelta(seconds=fake.random_int(1, 86400)) for ts in inprogress_timestamps]


def _orders_with_status(timestamps, status):
    """Return a dataframe with one row per order for the given status.

    timestamps: list with one status-update timestamp per order, in orderid order.
    status: the text stored in the order_status column of every row.
    The result uses the order_detail_fields columns and is indexed 1..20000.
    """
    return pd.DataFrame(
        {
            'orderid': order_ids,
            'customerid': received_customers,
            'order_status_update_timestamp': timestamps,
            'order_status': status,
        },
        columns = order_detail_fields,
        index = order_ids,
        dtype = object,     #Keep plain Python objects, as in a cell-by-cell fill of an empty frame.
    )


orders_Received = _orders_with_status(received_timestamps, 'Received')
orders_InProgress = _orders_with_status(inprogress_timestamps, 'InProgress')
orders_Delivered = _orders_with_status(delivered_timestamps, 'Delivered')

In [15]:
#Stack the three status dataframes into one table with one row per (order, status).
#No ignore_index is passed, so each frame keeps its own index labels and every label appears three times in the result.
order_details = pd.concat([orders_Received, orders_InProgress, orders_Delivered])

In [16]:
#Preview the first ten rows of the combined dataframe.
#They all have the 'Received' status because orders_Received comes first in the concat.
order_details.head(10)

Unnamed: 0,orderid,customerid,order_status_update_timestamp,order_status
1,1,672,2023-06-28 22:21:44,Received
2,2,992,2023-05-25 04:58:14,Received
3,3,518,2023-05-29 10:39:56,Received
4,4,923,2023-07-25 18:09:55,Received
5,5,515,2023-01-14 01:20:33,Received
6,6,588,2023-02-05 04:53:24,Received
7,7,695,2023-07-21 04:05:40,Received
8,8,777,2023-08-22 01:43:53,Received
9,9,79,2023-06-15 14:01:31,Received
10,10,928,2023-03-22 01:02:10,Received


In [17]:
#Sorting by orderid puts the three status rows of each order next to each other, which shows that the dataframes were combined.
#The sorted copy is only displayed; order_details itself is not reassigned.
order_details.sort_values(by=["orderid"])

Unnamed: 0,orderid,customerid,order_status_update_timestamp,order_status
1,1,672,2023-06-28 22:21:44,Received
1,1,672,2023-06-29 18:46:04,InProgress
1,1,672,2023-06-30 14:06:42,Delivered
2,2,992,2023-05-25 04:58:14,Received
2,2,992,2023-05-25 14:23:56,InProgress
...,...,...,...,...
19999,19999,575,2023-06-29 15:06:08,Received
19999,19999,575,2023-06-29 23:42:06,InProgress
20000,20000,138,2023-04-06 11:51:38,InProgress
20000,20000,138,2023-04-06 04:21:16,Received


In [18]:
#Number of distinct orderid values: the 60000 rows (3 statuses x 20000 orders) should collapse to 20000.
len(order_details['orderid'].unique())

20000

# Order Items

In [19]:
#Column names of the order_items table, in table order.
order_item_fields = [
    'orderid',
    'productid',
    'quantity',
]

In [20]:
#We create a dataframe with 30000 rows.

#Every item is generated as one record and the dataframe is built in a single step.
#Writing cells through chained indexing (df[col][i] = value) triggers chained-assignment warnings
#and does not modify the dataframe at all when pandas Copy-on-Write is active.
#Note: orderid is drawn independently for every row, so some orders end up without items
#and the same (orderid, productid) pair can occur more than once.
order_item_records = [
    {
        'orderid': fake.random_int(1,20000),     #Any of the 20000 generated orders.
        'productid': fake.random_int(1,100),     #Any of the 100 generated products.
        'quantity': fake.random_int(1,5),
    }
    for _ in range(30000)
]

#dtype=object keeps the values as plain Python objects, exactly as in a cell-by-cell fill of an empty frame.
order_items = pd.DataFrame(order_item_records, columns=order_item_fields, index = range(1, 30001), dtype = object)

In [21]:
#Display the order items sorted by orderid (the sorted copy is not assigned back).
order_items.sort_values(by=['orderid'])

Unnamed: 0,orderid,productid,quantity
7125,2,16,5
9292,2,27,3
144,2,14,3
24853,3,36,5
15495,4,62,4
...,...,...,...
20404,19993,23,2
24784,19993,9,5
12436,19995,10,1
1792,19996,89,5


In [22]:
#Number of distinct orderid values that appear in order_items.
len(order_items['orderid'].unique())

15594

# Upload data in SQL instance

- Create a new PostgreSQL instance
```
gcloud sql instances create [INSTANCE_NAME] --database-version=POSTGRES_13 --region=[REGION]
```

- List all the current sql instances
```
gcloud sql instances list --filter="databaseVersion:POSTGRES*"
```

- Connect to the sql instance
```
gcloud sql connect [INSTANCE_NAME] --user=postgres
```

- List databases and connect to one. Then list the tables inside it
```
\l
\c [DATABASE_NAME]
\dt
```

In [23]:
#Set the frequently used credentials.
#The values are read from the standard PostgreSQL environment variables so that the password
#does not have to be stored in the notebook. The non-secret settings fall back to the values
#used for this project; all four stay plain strings.
USER = os.environ.get("PGUSER", "postgres")
PASSWORD = os.environ.get("PGPASSWORD", "")     #No default on purpose: set PGPASSWORD before starting the kernel.
HOST = os.environ.get("PGHOST", "34.93.118.74")
PORT = os.environ.get("PGPORT", "5432")

In [25]:
# Establish a connection to the default 'postgres' database of the instance
conn = psycopg2.connect(
    dbname = 'postgres',
    user = USER,
    password = PASSWORD,
    host = HOST,
    port = PORT
)

#CREATE DATABASE cannot run inside a transaction block, so the connection is switched to autocommit
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

try:
    # The cursor is closed automatically at the end of the with block
    with conn.cursor() as cur:
        # Create the oltp database only when it does not exist yet, so the cell can be re-run safely
        cur.execute("SELECT 1 FROM pg_database WHERE datname = %s;", ('oltp',))
        if cur.fetchone() is None:
            cur.execute("CREATE DATABASE oltp;")

        #Uncomment and run this to delete the database
        #cur.execute("DROP DATABASE oltp WITH (force);")
finally:
    #Always release the connection, even when a statement fails
    conn.close()

In [26]:
#Define the table schema
create_table_customer = """ CREATE TABLE IF NOT EXISTS customer_master(
                                customerid INT PRIMARY KEY,
                                name VARCHAR(50),
                                address VARCHAR(100),
                                city VARCHAR(30),
                                state VARCHAR(30),
                                pincode INT,
                                update_timestamp TIMESTAMP)"""

create_table_product = """ CREATE TABLE IF NOT EXISTS product_master(
                                productid INT PRIMARY KEY,
                                productcode VARCHAR(100),
                                productname VARCHAR(100),
                                sku VARCHAR(100),
                                rate INT,
                                isactive BOOLEAN)"""

#order_details holds one row per (orderid, order_status), so orderid alone is not unique here.
create_table_order_details = """ CREATE TABLE IF NOT EXISTS order_details(
                                orderid INT,
                                customerid INT,
                                order_status_update_timestamp TIMESTAMP,
                                order_status VARCHAR(20),
                                FOREIGN KEY (customerid) REFERENCES CUSTOMER_MASTER(customerid)
                                )"""

#order_items.orderid carries no foreign key: order_details.orderid is not unique and cannot be referenced.
create_table_order_items = """ CREATE TABLE IF NOT EXISTS order_items(
                                orderid INT NOT NULL,
                                productid INT,
                                quantity INT,
                                FOREIGN KEY (productid) REFERENCES PRODUCT_MASTER(productid)
                                )"""

# Establish a connection to the oltp database we just created
conn = psycopg2.connect(
    dbname = 'oltp',
    user = USER,
    password = PASSWORD,
    host = HOST,
    port = PORT
)

#Autocommit makes every CREATE TABLE take effect immediately, without an explicit commit
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

try:
    # The cursor is closed automatically at the end of the with block
    with conn.cursor() as cur:
        #Create the tables; the referenced master tables come before the tables that reference them
        for create_statement in (
            create_table_customer,
            create_table_product,
            create_table_order_details,
            create_table_order_items,
        ):
            cur.execute(create_statement)
finally:
    #Always release the connection, even when a statement fails
    conn.close()

In [27]:
# Create the sqlalchemy engine for the oltp database.
# User and password are URL-quoted so that special characters (e.g. '@' or '/') cannot break the connection URL.
# pandas uploads through this engine; no separate psycopg2 connection or cursor is needed here.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(USER)}:{quote_plus(PASSWORD)}@{HOST}:{PORT}/oltp"
)

# Tables in upload order: the master tables first, because order_details and order_items reference them.
uploads = [
    ('customer_master', customer_master),
    ('product_master', product_master),
    ('order_details', order_details),
    ('order_items', order_items),
]

try:
    # engine.begin() wraps all four uploads in one transaction:
    # it commits when the block finishes and rolls everything back if any upload fails,
    # so a failed run never leaves the tables partially filled.
    with engine.begin() as connection:
        for table_name, frame in uploads:
            # Now we use the pandas to_sql command to convert and upload the data to the respective table
            frame.to_sql(table_name, connection, if_exists='append', index=False)
finally:
    # Release the pooled database connections held by the engine
    engine.dispose()