# Installation and Importing libraries

In [None]:
#You need to start with installing Faker and psycopg2
!pip install Faker
!pip install psycopg2

Collecting Faker
  Downloading Faker-19.9.0-py3-none-any.whl (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.7 MB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m28.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Faker
Successfully installed Faker-19.9.0


In [2]:
# Faker generates the synthetic values used below; the 'en_IN' locale keeps the
# generated data in an Indian context.
# NOTE(review): no seed is set in this cell, so each run produces different data.
from faker import Faker
fake = Faker('en_IN')

#Libraries to help with SQL connection
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine

#Misc. libraries to work with pandas and others
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import math
import os

# Connect to the PostgreSQL database

In [3]:
# Connection settings shared by the psycopg2 connection and the SQLAlchemy engine.
# Each value can be overridden with the libpq-style environment variables
# PGUSER / PGPASSWORD / PGHOST / PGPORT, so credentials do not have to live in
# the notebook.
# FIXME(security): the fallbacks below are the values that used to be hard-coded
# here. Rotate this password and remove the PASSWORD fallback once the
# environment supplies it.
USER = os.environ.get("PGUSER", "postgres")
PASSWORD = os.environ.get("PGPASSWORD", "123")
HOST = os.environ.get("PGHOST", "34.93.118.74")
PORT = os.environ.get("PGPORT", "5432")

In [8]:
#Make a connection to the 'oltp' database.
#PORT is passed explicitly so this connection and the SQLAlchemy engine built
#later from the same settings always target the same server port.
conn = psycopg2.connect(
    host = HOST,
    port = PORT,
    database = 'oltp',
    user = USER,
    password = PASSWORD
)

#Autocommit: every statement is committed immediately, so no explicit
#conn.commit() is needed after running a query.
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

In [13]:
#Get the current maximum Ids to generate the further dataset.
def _fetch_max_id(cursor, query):
    """Run a single-value ``SELECT max(...)`` query and return its result.

    Returns 0 when the query yields NULL (empty table), so callers can always
    compute ``max_id + 1`` for the next id.
    """
    cursor.execute(query)
    (value,) = cursor.fetchone()
    return 0 if value is None else value


#The cursor is closed automatically when the with-block exits.
with conn.cursor() as cur:
    max_orderid = _fetch_max_id(cur, 'SELECT max(orderid) from order_details;')
    max_customerid = _fetch_max_id(cur, 'SELECT max(customerid) from customer_master;')
    max_productid = _fetch_max_id(cur, 'SELECT max(productid) from product_master;')

    #Get the customer table.
    #`conn` is a raw psycopg2 connection, so the query must be a plain string
    #(a SQLAlchemy text() object is not accepted by psycopg2). The frame is built
    #from the cursor result, taking column names from cursor.description.
    cur.execute('select * from customer_master')
    customer_master = pd.DataFrame(
        cur.fetchall(),
        columns=[column[0] for column in cur.description],
    )

# Order details

In [14]:
#Column layout shared by every order-status snapshot
order_details_fields = ['orderid','customerid','order_status_update_timestamp','order_status']

#One empty 5000-row frame (index 1..5000) per order status
orders_Received, orders_InProgress, orders_Delivered = (
    pd.DataFrame(index=range(1, 5001), columns=order_details_fields)
    for _ in range(3)
)

In [None]:
#Populate the three status snapshots (Received / InProgress / Delivered) for the
#same batch of new orders. Every snapshot carries the same order ids and
#customer ids; only the status label differs.
n_new_orders = len(orders_Received)

#New order ids continue right after the current maximum: one distinct id per row.
new_order_ids = list(range(max_orderid + 1, max_orderid + 1 + n_new_orders))

#Materialise the customer ids once so each random pick does not re-scan the Series.
customer_pool = customer_master['customerid'].tolist()
customer_ids = [fake.random_element(customer_pool) for _ in range(n_new_orders)]

#Each order is stamped at a random offset of 1..86400 seconds (up to one day)
#after the moment this cell runs. `datetime` is the class imported from the
#datetime module, so the call is datetime.today().
batch_start = datetime.today()
status_timestamps = [
    batch_start + timedelta(seconds=fake.random_int(min=1, max=86400))
    for _ in range(n_new_orders)
]

#NOTE(review): all three statuses share the same timestamp per order, as the
#original cell did. Confirm whether later statuses should be stamped later.
for status_frame, status_label in (
    (orders_Received, 'Received'),
    (orders_InProgress, 'InProgress'),
    (orders_Delivered, 'Delivered'),
):
    #Whole-column assignment replaces the chained df[col][i] = ... writes,
    #which are unreliable and do not write at all under pandas copy-on-write.
    status_frame['orderid'] = new_order_ids
    status_frame['customerid'] = customer_ids
    status_frame['order_status_update_timestamp'] = status_timestamps
    status_frame['order_status'] = status_label

In [None]:
#Stack the three status snapshots row-wise into a single order_details frame
order_details = pd.concat(objs=(orders_Received, orders_InProgress, orders_Delivered))

# Order items

In [None]:
#Empty 30000-row frame (index 1..30000) that will hold the order line items
order_items = pd.DataFrame(index=range(1, 30001), columns=['orderid', 'productid', 'quantity'])

In [None]:
#Generate one line item for each of the 5000 new orders.
#The frame is built from exactly the populated rows, so no all-NULL placeholder
#rows remain to be uploaded later (only 5000 of the 30000 pre-allocated rows
#were ever filled).
#NOTE(review): the 30000-row allocation suggests several items per order may
#have been intended - confirm before generating more than one item per order.
n_new_orders = 5000

#Draw (productid, quantity) per order: product ids in 1..100, quantities in 1..5.
#NOTE(review): max_productid is fetched earlier but the product range is fixed at
#1..100 here - confirm which upper bound is intended.
item_draws = [
    (fake.random_int(1,100), fake.random_int(1,5))
    for _ in range(n_new_orders)
]

order_items = pd.DataFrame(
    {
        'orderid': range(max_orderid + 1, max_orderid + 1 + n_new_orders),
        'productid': [productid for productid, _ in item_draws],
        'quantity': [quantity for _, quantity in item_draws],
    },
    index=range(1, n_new_orders + 1),
)

# Upload data

In [None]:
# Create the sqlalchemy engine for the 'oltp' database
engine = create_engine(f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/oltp")

# Append both frames inside ONE transaction: engine.begin() commits when the
# block succeeds and rolls back if either upload raises, so order_details rows
# are never left behind without their order_items.
# order_details is written first, matching the original upload order.
try:
    with engine.begin() as connection:
        order_details.to_sql('order_details', connection, if_exists='append', index=False)
        order_items.to_sql('order_items', connection, if_exists='append', index=False)
finally:
    # Release the engine's pooled connections whether or not the upload succeeded
    engine.dispose()