# Exploratory Data Analysis - AGM's Customers


In [1]:
# Import necessary packages
import math
import numpy as np
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def _is_whole_number_column(series):
    """Return True when a float Series can be stored losslessly as 'Int64'.

    Null values are ignored. A Series with no non-null values counts as
    whole-valued, so empty and all-null float columns are still converted.
    """
    present = series.dropna().to_numpy()

    # Infinite values have no integer representation (math.floor() raises
    # OverflowError on them), so such a column has to stay float.
    if not np.isfinite(present).all():
        return False

    # Whole numbers outside the int64 range cannot be cast to 'Int64'.
    if (np.abs(present) >= 2.0 ** 63).any():
        return False

    return bool((present == np.floor(present)).all())


def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query against the module-level ``connection``.

    Args:
        query: SQL text passed unchanged to ``pd.read_sql_query``.
        rollback_before_flag: when truthy, call ``connection.rollback()``
            before running the query.
        rollback_after_flag: when truthy, call ``connection.rollback()``
            after the query, whether it succeeded or raised.

    Returns:
        A pandas DataFrame with the query result. Every ``float64`` column
        whose non-null values are all whole numbers is converted to the
        nullable ``Int64`` dtype; all other columns are left untouched.

    Raises:
        Whatever ``pd.read_sql_query`` or ``connection.rollback()`` raises.
    """
    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Roll back even when the query fails, so that a failed statement
        # does not leave the shared connection inside an open transaction.
        if rollback_after_flag:
            connection.rollback()

    # Integer columns that contain nulls come back from pandas as float64;
    # convert those to the nullable integer dtype.
    for column in df:
        if df[column].dtype == "float64" and _is_whole_number_column(df[column]):
            df[column] = df[column].astype("Int64")

    return df

# Settings for the Postgres connection shared by every query cell.
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from the environment before sharing this file.
pg_params = {
    "user": "postgres",
    "password": "ucb",
    "host": "postgres",
    "port": "5432",
    "database": "postgres",
}

connection = psycopg2.connect(**pg_params)

# A cursor is opened alongside the connection; the query cells shown here go
# through pandas and the connection object instead.
cursor = connection.cursor()

### Total Number of Customers for all of AGM

In [3]:
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# count(customer_id) counts the rows of `sales` whose customer_id is not null,
# i.e. one per sale.
# NOTE(review): if "total customers" is meant as unique customers, this would
# need count(distinct customer_id) -- confirm the intended definition.
query = """

select count(customer_id) as total_customers 
from sales
"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,total_customers
0,1537617


### Total Number of Customers, by Store

In [4]:
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Joins customers -> sales -> stores and counts the joined rows per store
# city, sorted alphabetically by city.
# NOTE(review): the count is per sales row, so a customer with several sales
# is counted several times -- confirm this is the intended definition.
query = """

select stores.city, count(sales.customer_id) as total_customers
from customers
join sales
     on sales.customer_id = customers.customer_id
join stores
     on stores.store_id = sales.store_id
group by stores.city 
order by stores.city asc

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,city,total_customers
0,Berkeley,390375
1,Dallas,302120
2,Miami,275074
3,Nashville,227721
4,Seattle,342327


### Total Number of Customers, by Distance from Store

In [5]:
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Joins customers to their sales and counts the joined rows for each value of
# customers.distance, in ascending distance order.
# NOTE(review): as with the other counts, this is one per sales row rather
# than one per unique customer -- confirm.
query = """

select customers.distance, count(sales.customer_id) as total_customers
from customers
join sales
     on sales.customer_id = customers.customer_id
group by customers.distance
order by customers.distance asc

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,distance,total_customers
0,1,199348
1,2,124285
2,3,174213
3,4,188139
4,5,132048
5,6,109472
6,7,93550
7,8,102018
8,9,74303
9,10,64060


### List of Registered Customers Who Have Not Made a Purchase

In [6]:
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Anti-join: the left join keeps every customer, and the IS NULL filter then
# keeps only those customers for whom no matching sales row exists.
query = """

select customers.last_name, customers.first_name
from customers
    left join sales
              on customers.customer_id = sales.customer_id
where sales.customer_id IS NULL

"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,last_name,first_name
0,Goodband,Marleah
1,Matterface,Petrina
2,Borman,Felice
3,Scheu,Candra
4,Ellaway,Lorianna
5,Butterick,Jacenta
6,Agott,Tracy
7,Gerty,Jae
8,Moggie,Walden
9,Keddie,Marsh


### Percentage of Customers Per Population, by Zip Code

In [10]:
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# For each customer zip code: the number of joined rows in that zip, as a
# percentage of the joined rows across all zips (the window sum over the
# grouped counts), largest share first. Because of the left join, a customer
# without sales still contributes one row.
# NOTE(review): no population data is referenced here, so the result is each
# zip's share of joined customer/sales rows -- confirm against the heading.
query = """
select customers.zip, 
count(*) * 100/ sum(count(*)) over () as zip_percent_customers
from customers
     left join sales
               on customers.customer_id = sales.customer_id
group by customers.zip
order by zip_percent_customers desc
"""

my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

Unnamed: 0,zip,zip_percent_customers
0,33134,1.424380
1,37206,1.231878
2,37212,1.225830
3,98119,1.196304
4,37210,1.078918
...,...,...
545,94565,0.001561
546,75019,0.001561
547,75125,0.001496
548,76018,0.001431


### Percentage of Customers Per Population, by City

In [12]:
# Assumptions: "city" is the city of the store, and only customers who
# actually bought something are considered.
# Roll the connection back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# For each store city: the number of joined store/sales rows, as a percentage
# of the joined rows across all cities (the window sum over the grouped
# counts), largest share first.
# NOTE(review): no population data is referenced here, so the result is each
# city's share of sales rows -- confirm against the heading.
query = """
select stores.city, 
count(*) * 100/ sum(count(*)) over () as city_percent
from stores
     left join sales
               on stores.store_id = sales.store_id
group by stores.city
order by city_percent desc
"""

df_2 = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)
df_2

Unnamed: 0,city,city_percent
0,Berkeley,25.388312
1,Seattle,22.263477
2,Dallas,19.648586
3,Miami,17.889631
4,Nashville,14.809995
