In [3]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sn
import csv

In [6]:
# Load the five source tables from their CSV exports (paths are relative to
# the notebook's working directory). The file order below lines up one-to-one
# with the variable names on the left-hand side.
brands, finance, info, reviews, traffic = (
    pd.read_csv(csv_name)
    for csv_name in (
        'brands_v2.csv',
        'finance.csv',
        'info_v2.csv',
        'reviews_v2.csv',
        'traffic_v3.csv',
    )
)

## Data Layout:

- **Brands**
    - product_id*
    - brand
- **Finance**
    - product_id*
    - listing_price
    - sale_price
    - discount
    - revenue
- **Info**
    - product_name
    - product_id**
    - description
- **Reviews**
    - product_id*
    - rating
    - reviews
- **Traffic**
    - product_id*
    - last_visited

In [7]:
# Brands table: schema / non-null summary first, then a preview of the first rows.
print("Brands Dataset Info")
brands.info()

x = brands.head()
# A single print call emits the blank spacer lines, the heading, and the preview in order.
print('\n', "Sample Data", x, sep='\n')

Brands Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  3179 non-null   object
 1   brand       3120 non-null   object
dtypes: object(2)
memory usage: 49.8+ KB


Sample Data
  product_id   brand
0     AH2430     NaN
1     G27341  Adidas
2     CM0081  Adidas
3     B44832  Adidas
4     D98205  Adidas


In [8]:
# Finance table: schema / non-null summary first, then a preview of the first rows.
print("Finance Dataset Info")
finance.info()

x = finance.head()
# A single print call emits the blank spacer lines, the heading, and the preview in order.
print('\n', "Sample Data", x, sep='\n')

Finance Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     3179 non-null   object 
 1   listing_price  3120 non-null   float64
 2   sale_price     3120 non-null   float64
 3   discount       3120 non-null   float64
 4   revenue        3120 non-null   float64
dtypes: float64(4), object(1)
memory usage: 124.3+ KB


Sample Data
  product_id  listing_price  sale_price  discount  revenue
0     AH2430            NaN         NaN       NaN      NaN
1     G27341          75.99       37.99       0.5  1641.17
2     CM0081           9.99        5.99       0.4   398.93
3     B44832          69.99       34.99       0.5  2204.37
4     D98205          79.99       39.99       0.5  5182.70


In [9]:
# Info table: schema / non-null summary first, then a preview of the first rows.
print("Info Dataset Info")
info.info()

x = info.head()
# A single print call emits the blank spacer lines, the heading, and the preview in order.
print('\n', "Sample Data", x, sep='\n')

Info Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  3120 non-null   object
 1   product_id    3179 non-null   object
 2   description   3117 non-null   object
dtypes: object(3)
memory usage: 74.6+ KB


Sample Data
                                       product_name product_id  \
0                                               NaN     AH2430   
1              Women's adidas Originals Sleek Shoes     G27341   
2                 Women's adidas Swim Puka Slippers     CM0081   
3  Women's adidas Sport Inspired Questar Ride Shoes     B44832   
4          Women's adidas Originals Taekwondo Shoes     D98205   

                                         description  
0                                                NaN  
1  A modern take on adidas sport heritage, tailor...  
2  These adidas Puka slippers for women's come wi

In [10]:
# Reviews table: schema / non-null summary first, then a preview of the first rows.
print("Reviews Dataset Info")
reviews.info()

x = reviews.head()
# A single print call emits the blank spacer lines, the heading, and the preview in order.
print('\n', "Sample Data", x, sep='\n')

Reviews Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product_id  3179 non-null   object 
 1   rating      3120 non-null   float64
 2   reviews     3120 non-null   float64
dtypes: float64(2), object(1)
memory usage: 74.6+ KB


Sample Data
  product_id  rating  reviews
0     AH2430     NaN      NaN
1     G27341     3.3     24.0
2     CM0081     2.6     37.0
3     B44832     4.1     35.0
4     D98205     3.5     72.0


In [11]:
# Traffic table: schema / non-null summary first, then a preview of the first rows.
print("Traffic Dataset Info")
traffic.info()

x = traffic.head()
# A single print call emits the blank spacer lines, the heading, and the preview in order.
print('\n', "Sample Data", x, sep='\n')

Traffic Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    3179 non-null   object
 1   last_visited  2928 non-null   object
dtypes: object(2)
memory usage: 49.8+ KB


Sample Data
  product_id         last_visited
0     AH2430  2018-05-19 15:13:00
1     G27341  2018-11-29 16:16:00
2     CM0081  2018-02-01 10:27:00
3     B44832  2018-09-07 20:06:00
4     D98205  2019-07-18 15:26:00


There are 3179 total entries in each table, and *product_id* has no nulls in any of them. *description* in the *info* table has 3117 non-null values (62 nulls) and *last_visited* in the *traffic* table has 2928 non-null values (251 nulls). The rest of the columns in all tables have 3120 non-null values (59 nulls).

*product_id* is the primary key for each table except the *info* table, where *product_name* is the primary key and *product_id* is the foreign key.

***Brands***:
- Top brands?
- Top products?

***Finance***:
- Most expensive vs. least expensive products?
- Highest sales/discount?
- Highest revenue?

***Info***:
- Most vs. least amount of products?

***Reviews***:
- Best vs. worst rating?
- Most vs. least amount of ratings?

***Traffic***:
- Which hours and/or days is the store the busiest?
- Which products are most vs. least popular at these times?

***Joins***:
1. Join the finance and traffic table to find out **when** the store makes the most money and/or is most busy.
2. Join the finance, brands, and info table to find out **what** products bring in the most money and/or are bought the most.
3. Join the info and reviews table to find out **which** products get the **best** reviews.
    - **Worst** reviews and why?

In [40]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine
import psycopg2  # not referenced directly; importing it surfaces a missing PostgreSQL driver right here

# Never keep credentials in the notebook: take the password from the
# PGPASSWORD environment variable, falling back to an interactive prompt.
# NOTE(review): any password that was previously saved in this notebook
# should be treated as exposed and rotated.
password = quote_plus(  # Encodes special characters so they are safe inside the URL
    os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")
)
DATABASE_URL = f"postgresql://postgres:{password}@localhost:5432/sports_retail"

# Smoke-test the connection. Failures are reported rather than raised so the
# reader sees the reason inline; note that `engine` may be undefined afterwards
# if create_engine itself failed.
try:
    engine = create_engine(DATABASE_URL)
    # The context manager closes the connection even if something inside raises.
    with engine.connect() as conn:
        print("Connection successful!")
except Exception as e:
    print("Connection failed:", e)

Connection successful!


In [20]:
# (Re)load the `sql` IPython extension that provides the %sql magic
# (presumably ipython-sql or jupysql — confirm which package is installed).
%reload_ext sql
# Point the %sql magic at the database; `$DATABASE_URL` is expanded from the
# Python variable of that name, so the connection cell must have run first.
%sql $DATABASE_URL

In [21]:
# Set the table style SqlMagic uses when rendering query results.
# NOTE(review): '_DEPRECATED_DEFAULT' looks like the usual workaround for the
# sql extension failing with newer prettytable releases — confirm it is still needed.
%config SqlMagic.style = '_DEPRECATED_DEFAULT'  # Change to html style or try others

In [42]:
# Reference: writing a DataFrame to a SQL table
# df.to_sql("new_covid19treatments", engine, if_exists='replace', index=False)

# Brands:

Top brands?

# Finance

- Most expensive vs. least expensive products?
- Highest sales/discount?
- Highest revenue?


# Info
- Most vs. least amount of products?


# Reviews

- Best vs. worst rating?
- Most vs. least amount of ratings?