# 4.9 Intro to Data Visualization with Python - Part 1

### This script contains the following points:
1. Importing libraries and dataframes
2. Data wrangling of 'Customers' dataframe
3. Combining 'customers' and 'ords_prods' dataframes
4. Exporting Dataframe

### The following dataframes were manipulated/created:
   a) 'customers' dataframe: columns 'First Name' and 'Surnam' deleted; several columns renamed for simplicity;
    b). 'ords_prods_all' created by merge of 'customers' and 'ords_prods_analysis_2'



#


## 1. Importing libraries and dataframes

In [1]:
# Importing libraries and dataframe
import pandas as pd
import numpy as np
import os

In [2]:
path = r'C:\Users\bruna\Career Foundry\08-2023 Instacart Basket Analysis'

customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))
ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_analysis_2.pkl'))

## 2. Data wrangling of 'Customers' dataframe

In [3]:

# 1. Printing head/tail to have an overview of the dataframe
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:

# 2. Runing descriptive analysis of dataframe
customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [5]:
# results: all values seems to make sense - no indication of missing values or unexpected min, max, etc

In [6]:

# 3. Confirming there are no duplicated values in user_id (expected nr of unique values: 206209)

unique_values = customers['user_id'].unique()
print ('Number of unique user_id:', len(unique_values))

Number of unique user_id: 206209


In [7]:

# 4. Dropping 'First Name' and 'Surnam' columns (not required for analysis)
customers = customers.drop(columns = ['First Name', 'Surnam'])

In [8]:

# 5. Renaming columns
customers.rename (columns = {'Gender':'gender', 'STATE': 'state', 'Age': 'age', 'n_dependants': 'nr_dependants'}, inplace = True)

In [9]:

# 5. Checking count and data types
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        206209 non-null  int64 
 1   gender         206209 non-null  object
 2   state          206209 non-null  object
 3   age            206209 non-null  int64 
 4   date_joined    206209 non-null  object
 5   nr_dependants  206209 non-null  int64 
 6   fam_status     206209 non-null  object
 7   income         206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


In [10]:
# results: there are no missing values apart from first_name column
# results: data joined will need to be changed to date. All other data types are correct

In [11]:

# 6. Changing column 'date_joined' data type
customers['date_joined'] = customers['date_joined'].astype('datetime64[D]')

In [12]:
customers['date_joined'].dtype

dtype('<M8[ns]')

In [13]:

# 7. Checking unique values for gender (expected Male/Female)
unique_values = customers['gender'].unique()
print (unique_values, 'Number of unique genders:', len(unique_values))

['Female' 'Male'] Number of unique genders: 2


In [14]:

# 8. Checking unique values for state (expected 51)
unique_values = sorted(customers['state'].unique())
print (unique_values, 'Number of unique states:', len(unique_values))

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'] Number of unique states: 51


In [15]:

# 9. Checking unique values for fam_status
unique_values = customers['fam_status'].unique()
print (unique_values, 'Number of unique fam_status:', len(unique_values))

['married' 'single' 'living with parents and siblings' 'divorced/widowed'] Number of unique fam_status: 4


In [16]:

# 10. Identifying columns with missing values
customers.isnull().sum()

user_id          0
gender           0
state            0
age              0
date_joined      0
nr_dependants    0
fam_status       0
income           0
dtype: int64

In [17]:
# results: there are no missing values in the dataframe

In [18]:

# 11. Checking for mixed-type data
for col in customers.columns.tolist():
    weird = (customers[[col]].applymap(type) != customers[[col]].iloc[0].apply(type)).any(axis=1)
    if len (customers[weird]) > 0:
        print(col)

In [19]:
# results: there are no columns with mixed-type data

## 3. Combining 'customers' and 'ords_prods' dataframes

In [20]:

# 1. Double checking compatibility of columns 'user_id' (expected data type: int64)
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 26 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   order_number                   int64   
 3   orders_day_of_week             int64   
 4   time_of_order                  int64   
 5   days_since_prior_order         float64 
 6   first_order_flag               bool    
 7   product_id                     int64   
 8   add_to_cart_order              int64   
 9   reordered                      int64   
 10  _merge                         category
 11  product_name                   object  
 12  aisle_id                       int64   
 13  department_id                  int64   
 14  prices                         float64 
 15  exists                         category
 16  price_range_loc                object  
 17  busiest_day              

In [21]:
#  results: ords_prods and customers have column 'user_id' in common (both int64) --> This can be used to merge the two dataframes

In [32]:
# 2. Dropping column '_merge' and 'exists' (indicators from previous merges, not needed)
ords_prods = ords_prods.drop (columns = ['_merge'],['exists'])


In [31]:
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 25 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   order_number                   int64   
 3   orders_day_of_week             int64   
 4   time_of_order                  int64   
 5   days_since_prior_order         float64 
 6   first_order_flag               bool    
 7   product_id                     int64   
 8   add_to_cart_order              int64   
 9   reordered                      int64   
 10  product_name                   object  
 11  aisle_id                       int64   
 12  department_id                  int64   
 13  prices                         float64 
 14  exists                         category
 15  price_range_loc                object  
 16  busiest_day                    object  
 17  busiest_days_grouped     

In [24]:

# 2. Merging ords_prods with customers dataframe

ords_prods_all = ords_prods.merge(customers, on = 'user_id', indicator = True)

In [34]:
#running frequency check of _merge column
ords_prods_all['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [39]:
#allowing for all columns to be seen
pd.set_option('display.max_columns', None) 

In [40]:
#checking output of new dataframe
ords_prods_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,time_of_order,days_since_prior_order,first_order_flag,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,exists,price_range_loc,busiest_day,busiest_days_grouped,busiest_period_of_day,max_order,loyalty_flag,average_price,spending_flag,median_days_since_prior_order,order_frequency_flag,gender,state,age,date_joined,nr_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both,Mid range product,Regularly busy,Regularly busy days,Average orders,10,New customer,4.7,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days,Average orders,10,New customer,4.7,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days,Most orders,10,New customer,4.7,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days,Average orders,10,New customer,4.7,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days,Most orders,10,New customer,4.7,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [43]:
#reseting settings to default
pd.reset_option('display.max_columns')

## 4. Exporting Dataframe

In [44]:
# Exporting 'ords_prods_all

ords_prods_all.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_all.pkl'))