In [7]:
import sqlite3
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from scipy.stats import skewnorm

from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

In [8]:
# Location of the customer CSV. Set the DM_AIAI_CUSTOMER_CSV environment
# variable to point at your own copy of the file; when it is unset, the path
# this notebook was originally written against is used as the fallback.
DATA_PATH = os.environ.get(
    "DM_AIAI_CUSTOMER_CSV",
    "/Users/catarinasantos/Documents/DM_AIAI_CustomerDB.csv",
)

# Load the raw customer table and preview its first rows.
dados = pd.read_csv(DATA_PATH, sep=",")
dados.head()

Unnamed: 0.1,Unnamed: 0,Loyalty#,First Name,Last Name,Customer Name,Country,Province or State,City,Latitude,Longitude,...,Gender,Education,Location Code,Income,Marital Status,LoyaltyStatus,EnrollmentDateOpening,CancellationDate,Customer Lifetime Value,EnrollmentType
0,0,480934,Cecilia,Householder,Cecilia Householder,Canada,Ontario,Toronto,43.653225,-79.383186,...,female,Bachelor,Urban,70146.0,Married,Star,2/15/2019,,3839.14,Standard
1,1,549612,Dayle,Menez,Dayle Menez,Canada,Alberta,Edmonton,53.544388,-113.49093,...,male,College,Rural,0.0,Divorced,Star,3/9/2019,,3839.61,Standard
2,2,429460,Necole,Hannon,Necole Hannon,Canada,British Columbia,Vancouver,49.28273,-123.12074,...,male,College,Urban,0.0,Single,Star,7/14/2017,1/8/2021,3839.75,Standard
3,3,608370,Queen,Hagee,Queen Hagee,Canada,Ontario,Toronto,43.653225,-79.383186,...,male,College,Suburban,0.0,Single,Star,2/17/2016,,3839.75,Standard
4,4,530508,Claire,Latting,Claire Latting,Canada,Quebec,Hull,45.42873,-75.713364,...,male,Bachelor,Suburban,97832.0,Married,Star,10/25/2017,,3842.79,2021 Promotion


In [9]:
# Show which class the loaded object is, then list the dtype of every column.
frame_type = type(dados)
print(frame_type)
dados.dtypes

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0                   int64
Loyalty#                     int64
First Name                  object
Last Name                   object
Customer Name               object
Country                     object
Province or State           object
City                        object
Latitude                   float64
Longitude                  float64
Postal code                 object
Gender                      object
Education                   object
Location Code               object
Income                     float64
Marital Status              object
LoyaltyStatus               object
EnrollmentDateOpening       object
CancellationDate            object
Customer Lifetime Value    float64
EnrollmentType              object
dtype: object

In [10]:
# Summary statistics of the numeric columns, rounded to two decimals for display.
summary_stats = dados.describe()
summary_stats.round(2)

Unnamed: 0.1,Unnamed: 0,Loyalty#,Latitude,Longitude,Income,Customer Lifetime Value
count,16921.0,16921.0,16921.0,16921.0,16901.0,16901.0
mean,8440.02,550197.39,47.17,-91.81,37758.04,7990.46
std,4884.78,259251.5,3.31,22.24,30368.99,6863.17
min,0.0,100011.0,42.98,-135.06,0.0,1898.01
25%,4210.0,326823.0,44.23,-120.24,0.0,3979.72
50%,8440.0,550896.0,46.09,-79.38,34161.0,5780.18
75%,12670.0,772438.0,49.28,-74.6,62396.0,8945.69
max,16900.0,999999.0,60.72,-52.71,99981.0,83325.38


In [11]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" to the output).
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16921 entries, 0 to 16920
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               16921 non-null  int64  
 1   Loyalty#                 16921 non-null  int64  
 2   First Name               16921 non-null  object 
 3   Last Name                16921 non-null  object 
 4   Customer Name            16921 non-null  object 
 5   Country                  16921 non-null  object 
 6   Province or State        16921 non-null  object 
 7   City                     16921 non-null  object 
 8   Latitude                 16921 non-null  float64
 9   Longitude                16921 non-null  float64
 10  Postal code              16921 non-null  object 
 11  Gender                   16921 non-null  object 
 12  Education                16921 non-null  object 
 13  Location Code            16921 non-null  object 
 14  Income                

In [12]:
# Number of missing values in each column.
null_counts = dados.isnull().sum()
print(null_counts)

# Single flag for the whole frame: True when at least one cell is missing.
has_any_null = dados.isnull().any().any()
print(has_any_null)

Unnamed: 0                     0
Loyalty#                       0
First Name                     0
Last Name                      0
Customer Name                  0
Country                        0
Province or State              0
City                           0
Latitude                       0
Longitude                      0
Postal code                    0
Gender                         0
Education                      0
Location Code                  0
Income                        20
Marital Status                 0
LoyaltyStatus                  0
EnrollmentDateOpening          0
CancellationDate           14611
Customer Lifetime Value       20
EnrollmentType                 0
dtype: int64
True


In [13]:
# Columns whose missing values make a row a candidate for removal.
columns_to_check = ['Income', 'Customer Lifetime Value']

# Largest share of rows (in percent) that may be dropped; at or above this
# share the rows are kept so they can be handled another way.
MAX_NULL_ROW_PCT = 5

# Total number of rows currently in the frame.
total_rows = len(dados)

# Rows with a NaN in at least one of the checked columns.
rows_with_nulls = dados[columns_to_check].isnull().any(axis=1).sum()

# Percentage of rows that would be removed. An empty frame has nothing to
# remove, so report 0% instead of dividing by zero (which would yield NaN).
percent_rows = rows_with_nulls / total_rows * 100 if total_rows else 0.0
print(f"Rows to remove: {rows_with_nulls} ({percent_rows:.2f}%)")

# NOTE: this rebinds `dados`; cells below this one see the filtered frame.
if percent_rows < MAX_NULL_ROW_PCT:
    dados = dados.dropna(subset=columns_to_check)
    print("Rows removed.")
else:
    print(f"Not removing rows, they represent more than {MAX_NULL_ROW_PCT}% of total.")

Rows to remove: 20 (0.12%)
Rows removed.


In [14]:
numeric_columns = ['Latitude', 'Longitude', 'Income', 'Customer Lifetime Value', 'Loyalty#']

# Print mean, median, standard deviation and variance in that order,
# one Series per statistic.
numeric_view = dados[numeric_columns]
for stat_name in ("mean", "median", "std", "var"):
    print(getattr(numeric_view, stat_name)())

Latitude                       47.173742
Longitude                     -91.824638
Income                      37758.038400
Customer Lifetime Value      7990.460188
Loyalty#                   550037.873084
dtype: float64
Latitude                       46.087818
Longitude                     -79.383186
Income                      34161.000000
Customer Lifetime Value      5780.180000
Loyalty#                   550834.000000
dtype: float64
Latitude                        3.306686
Longitude                      22.242789
Income                      30368.992499
Customer Lifetime Value      6863.173093
Loyalty#                   258942.628471
dtype: float64
Latitude                   1.093417e+01
Longitude                  4.947417e+02
Income                     9.222757e+08
Customer Lifetime Value    4.710314e+07
Loyalty#                   6.705128e+10
dtype: float64


In [15]:
numeric = ['Loyalty#', 'Latitude', 'Longitude', 'Income', 'Customer Lifetime Value']

# Pairwise correlation between the selected columns, using DataFrame.corr()
# with its default arguments.
numeric_subset = dados.loc[:, numeric]
correlation_matrix = numeric_subset.corr()
print(correlation_matrix)

                         Loyalty#  Latitude  Longitude    Income  \
Loyalty#                 1.000000  0.011855  -0.004904 -0.007866   
Latitude                 0.011855  1.000000  -0.764874 -0.002009   
Longitude               -0.004904 -0.764874   1.000000  0.002928   
Income                  -0.007866 -0.002009   0.002928  1.000000   
Customer Lifetime Value -0.002533 -0.002207   0.006218  0.024026   

                         Customer Lifetime Value  
Loyalty#                               -0.002533  
Latitude                               -0.002207  
Longitude                               0.006218  
Income                                  0.024026  
Customer Lifetime Value                 1.000000  


In [16]:
categorical_cols = ['Gender', 'Marital Status', 'Education', 'LoyaltyStatus', 'Country']

# Frequency table for each categorical column, printed one after another.
for col in categorical_cols:
    level_counts = dados[col].value_counts()
    print(level_counts)

Gender
female    8486
male      8415
Name: count, dtype: int64
Marital Status
Married     9830
Single      4531
Divorced    2540
Name: count, dtype: int64
Education
Bachelor                10578
College                  4273
High School or Below      792
Doctor                    742
Master                    516
Name: count, dtype: int64
LoyaltyStatus
Star      7741
Nova      5722
Aurora    3438
Name: count, dtype: int64
Country
Canada    16901
Name: count, dtype: int64


In [17]:
# Report the most frequent value(s) of each categorical column.
# Series.mode() returns a Series holding every tied value, so join those values
# into one string; printing the Series itself drags its index, Name and dtype
# lines into the output.
for col in categorical_cols:
    modes = dados[col].mode()
    print('mode for', col, ':', ', '.join(modes.astype(str)))

mode for Gender : 0    female
Name: Gender, dtype: object
mode for Marital Status : 0    Married
Name: Marital Status, dtype: object
mode for Education : 0    Bachelor
Name: Education, dtype: object
mode for LoyaltyStatus : 0    Star
Name: LoyaltyStatus, dtype: object
mode for Country : 0    Canada
Name: Country, dtype: object


In [18]:
# Display only the categorical columns (the notebook truncates long frames).
dados.loc[:, categorical_cols]

Unnamed: 0,Gender,Marital Status,Education,LoyaltyStatus,Country
0,female,Married,Bachelor,Star,Canada
1,male,Divorced,College,Star,Canada
2,male,Single,College,Star,Canada
3,male,Single,College,Star,Canada
4,male,Married,Bachelor,Star,Canada
...,...,...,...,...,...
16896,female,Married,College,Star,Canada
16897,female,Married,Bachelor,Star,Canada
16898,male,Single,College,Star,Canada
16899,male,Married,Bachelor,Star,Canada


In [19]:
# Count rows that repeat an earlier row across all columns; 0 means the
# dataset contains no fully duplicated rows.
n_duplicate_rows = dados.duplicated().sum()
print(n_duplicate_rows)

0
