In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Load each source CSV from the relative data_csv/ folder into its own DataFrame.
# Customers.csv is the only file read with an explicit encoding ('unicode_escape');
# presumably it contains characters the default decoder rejects -- TODO confirm.
customers = pd.read_csv('data_csv/Customers.csv', encoding='unicode_escape')
# data_dictionary is loaded here but is not referenced by the preprocessing cells
# visible in this part of the notebook.
data_dictionary = pd.read_csv('data_csv/Data_Dictionary.csv')
exchange_rates = pd.read_csv('data_csv/Exchange_Rates.csv')
products = pd.read_csv('data_csv/Products.csv')
sales =  pd.read_csv('data_csv/Sales.csv')
stores = pd.read_csv('data_csv/Stores.csv')


In [4]:
#preprocessing customer table

#standardizing column_names
def standardize_col_names(df: pd.DataFrame)->pd.DataFrame:
    """Rename every column of ``df`` to lower snake_case.

    Surrounding whitespace is removed first, then the label is lower-cased and
    each remaining space becomes an underscore (``' Order Date '`` ->
    ``'order_date'``).

    Args:
        df: Frame whose column labels are strings.

    Returns:
        The same ``df`` object; the rename is applied in place, so callers that
        ignore the return value still see the new names.
    """
    # Strip before replacing: once the padding spaces have been turned into
    # underscores, strip() can no longer remove them.
    col_names = {val: val.strip().lower().replace(' ', '_') for val in df.columns}
    df.rename(columns=col_names, inplace=True)
    return df


def replace_nan_with_na(row):
    """Fill a missing ``state_code`` with the string ``'NA'`` for Napoli rows.

    Rows whose ``state`` is not ``'Napoli'``, or whose ``state_code`` is already
    present, are returned untouched. The row is modified in place and returned.
    """
    if row['state'] != 'Napoli' or not pd.isna(row['state_code']):
        return row
    row['state_code'] = 'NA'
    return row

def format_date_column(row,date_col_name):
    """Rewrite ``row[date_col_name]`` from ``M/D/YYYY`` to ``YYYY-MM-DD``.

    Month and day are zero-padded so the result is a canonical date string
    (``'1/5/2016'`` -> ``'2016-01-05'``). A value that is already year-first
    and dash-separated is only re-padded, which makes it safe to re-run a cell
    on data that has been converted before.

    Args:
        row: Mapping-like record (e.g. one row passed by ``DataFrame.apply``
            with ``axis=1``); modified in place.
        date_col_name: Key of the date field inside ``row``.

    Returns:
        The same ``row``. Missing values (``pd.isna``) are left untouched.

    Raises:
        ValueError: If the value has neither three ``/``-separated nor three
            ``-``-separated parts.
    """
    value = row[date_col_name]
    if pd.isna(value):
        return row

    text = value.strip()
    slash_parts = text.split('/')
    if len(slash_parts) == 3:
        month, day, year = slash_parts
    else:
        # Not M/D/YYYY: accept an already converted year-first value,
        # otherwise refuse instead of emitting a scrambled date.
        dash_parts = text.split('-')
        if len(dash_parts) != 3:
            raise ValueError(
                f"{date_col_name}: expected a 'M/D/YYYY' date, got {value!r}"
            )
        year, month, day = dash_parts

    row[date_col_name] = f"{year.strip()}-{month.strip().zfill(2)}-{day.strip().zfill(2)}"
    return row
        

# Customer clean-up, applied in order:
#   1. snake_case column names
#   2. fill the missing state_code of Napoli rows with 'NA'
#   3. rewrite birthday into the year-first form used for the SQL DATE column
customers = (
    standardize_col_names(customers)
    .apply(replace_nan_with_na, axis=1)
    .apply(format_date_column, axis=1, date_col_name='birthday')
)


In [5]:
#preprocessing products table

def create_product_id(row):
    """Return the ``index`` value of the category row matching ``row``.

    The module-level ``product_category`` frame is searched for the single
    entry whose ``Subcategory`` and ``Category`` both equal those of ``row``.
    ``Series.item()`` raises ``ValueError`` unless exactly one entry matches.
    """
    same_subcategory = product_category['Subcategory'] == row['Subcategory']
    same_category = product_category['Category'] == row['Category']
    matches = product_category.loc[same_subcategory & same_category, 'index']
    return matches.item()

# Build the category lookup: one row per distinct (Subcategory, Category) pair.
# The second reset_index() turns the 0..n-1 position into an 'index' column,
# which create_product_id() returns as the category id.
# The two columns are selected by name (the same names create_product_id() reads)
# instead of by position, so a reordered Products.csv cannot silently pick the
# wrong columns.
product_category = (
    products[['Subcategory', 'Category']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
)
# Tag every product with the id of its category pair; this must run before the
# descriptive columns are dropped and before product_category is renamed below.
products['category_id'] = [create_product_id(row) for _, row in products.iterrows()]
products = products.drop(['Subcategory','CategoryKey','Category','SubcategoryKey'],axis=1)
products = standardize_col_names(products)
product_category = standardize_col_names(product_category)



In [6]:
# Preprocess the sales table: snake_case headers, then both date columns are
# rewritten row by row into the year-first form (missing dates are left as-is).
sales = (
    standardize_col_names(sales)
    .apply(format_date_column, axis=1, date_col_name='order_date')
    .apply(format_date_column, axis=1, date_col_name='delivery_date')
)


In [7]:
# Preprocess the stores table: snake_case headers and a year-first open_date,
# then replace any missing square_meters value with 0.
stores = (
    standardize_col_names(stores)
    .apply(format_date_column, axis=1, date_col_name='open_date')
)
stores['square_meters'] = stores['square_meters'].fillna(0)

In [8]:
# Preprocess the exchange-rate table: snake_case headers and a year-first date.
exchange_rates = (
    standardize_col_names(exchange_rates)
    .apply(format_date_column, axis=1, date_col_name='date')
)



In [9]:
#create sql table and dumping data into it

import mysql.connector

# Connection settings are read from the environment (MYSQL_HOST, MYSQL_USER,
# MYSQL_PASSWORD) so real credentials do not have to be written into the
# notebook; the fallbacks are the original local-development values.
con = mysql.connector.connect(
        host=os.environ.get('MYSQL_HOST', 'localhost'),
        user=os.environ.get('MYSQL_USER', 'root'),
        password=os.environ.get('MYSQL_PASSWORD', 'password'))
cursor = con.cursor()

# Selecting the schema inside connect() fails on a server where it does not
# exist yet, so connect without one, create it if needed, and only then make
# it the default schema for this session.
cursor.execute('create database if not exists global_electronics')
cursor.execute('use global_electronics')

In [10]:
# Print every schema visible on the server, then make sure the project schema exists.
cursor.execute('show databases')
for database_row in cursor.fetchall():
    print(database_row)

cursor.execute('create database if not exists global_electronics')


('global_electronics',)
('information_schema',)
('mysql',)
('performance_schema',)
('red_bus_scrape',)
('sys',)


In [15]:
# Print the tables of the current schema and keep their names in `tables`.
cursor.execute('show tables')
tables = []
for table_row in cursor.fetchall():
    print(table_row)
    tables.append(table_row[0])

('customer',)
('exchange_rate',)
('product',)
('product_category',)
('sale',)
('store',)


In [13]:
# Create the customer table (no-op when it already exists). customer_key is the
# primary key; birthday is the only other column declared without NOT NULL.
cursor.execute('''
 CREATE TABLE if not exists customer(
         customer_key int,
         gender varchar(10) NOT NULL,
         name varchar(50) NOT NULL,
         city varchar(50) NOT NULL,
         state_code varchar(5) NOT NULL,
         state varchar(50) NOT NULL,
         zip_code varchar(20) NOT NULL,
         country varchar(30) NOT NULL,
         continent varchar(30) NOT NULL,
         birthday date,
        PRIMARY KEY(customer_key))
''')


# Create the product_category lookup table (no-op when it already exists).
# The comma that used to follow the PRIMARY KEY clause is removed: MySQL rejects
# a trailing comma before the closing parenthesis with a syntax error.
# NOTE(review): the preprocessed product_category frame has the columns
# index / subcategory / category, so a by-name load needs them mapped onto
# category_id / sub_category / category -- confirm in the insert code.
cursor.execute('''
                CREATE TABLE if not exists product_category(
                        category_id int,
                        sub_category varchar(50) NOT NULL,
                        category varchar(40) NOT NULL,
                        PRIMARY KEY(category_id)
               )
               ''')

# Create the product table (no-op when it already exists); it references
# product_category, so that table has to be created first.
# Two fixes compared with the earlier statement:
#   * the trailing comma after the FOREIGN KEY clause is removed (MySQL syntax error);
#   * unit_price_usd is widened from decimal(5,2), which tops out at 999.99, to
#     decimal(7,2) so the price column is no narrower than unit_cost.
cursor.execute(
    '''
        CREATE TABLE IF NOT EXISTS product(
                product_key int,
                name varchar(100) NOT NULL,
                brand varchar(30) NOT NULL,
                color varchar(20) NOT NULL,
                unit_cost decimal(7,2) NOT NULL,
                unit_price_usd decimal(7,2) NOT NULL,
                category_id int NOT NULL,
                PRIMARY KEY(product_key),
                FOREIGN KEY(category_id) REFERENCES product_category(category_id)
        )
    '''
)

# Create the store table (no-op when it already exists). The trailing comma after
# the PRIMARY KEY clause is removed because MySQL rejects it as a syntax error.
# square_meters is NOT NULL; the stores preprocessing cell fills missing values with 0.
cursor.execute(
    '''
    CREATE TABLE IF NOT EXISTS store(
        store_key int,
        country varchar(20) NOT NULL,
        state varchar(50) NOT NULL,
        square_meters Decimal(10,2) NOT NULL,
        open_date date NOT NULL,
        PRIMARY KEY(store_key)
    )
    ''')



# Create the sale table (no-op when it already exists). Rows are identified by the
# composite key (order_name, line_item). delivery_date is the only nullable column.
# The three foreign keys reference customer, store and product, so those tables
# must already exist when this statement runs.
cursor.execute(
    '''
    CREATE TABLE IF NOT EXISTS sale(
        order_name int,
        line_item int NOT NULL,
        order_date date NOT NULL,
        delivery_date date, 
        customer_key int NOT NULL,
        store_key int NOT NULL,
        product_key int NOT NULL,
        quantity int NOT NULL,
        currency_code varchar(6) NOT NULL,
        PRIMARY KEY(order_name, line_item),
        FOREIGN KEY(customer_key) REFERENCES customer(customer_key),
        FOREIGN KEY(store_key) REFERENCES store(store_key),
        FOREIGN KEY(product_key) REFERENCES product(product_key)
    )
    '''
)

# Create the exchange_rate table (no-op when it already exists). It declares no
# primary key and no NOT NULL constraints, so duplicate or partial rows are accepted.
cursor.execute(
    '''
    CREATE TABLE IF NOT EXISTS exchange_rate(
    date date,
    currency varchar(7),
    exchange decimal(7,4))
    
    '''
)


#### customers table
- Napoli's state code is missing. Napoli is also called Naples, whose state code is `NA`, so the missing value is filled with the string `NA`.
- convert birthday into Date format for sql.
- normalize the column names.
- try separating state and country into separate table

#### products table
- split the product table further ie subcategory and category.
- normalize the column names.

#### sales table
- delivery date has missing values but we could not fill that.
- normalize the column names.
- create a primary key for sql
- format date columns


#### stores table
- convert the open date to dateformat for sql
- normalize the column names
- handle the one NaN value for the Online store (the missing `square_meters` is filled with 0)

#### exchange rates table
- normalize the column names
- convert the 'date' column to date format
