In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [173]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [152]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [153]:
key_path = './service_account/gentle-keyword-423715-j0-03be08ad6412.json'

credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

In [154]:
from google.cloud import bigquery

client = bigquery.Client(
    credentials = credentials,
    project=credentials.project_id
)

In [155]:
query = """
SELECT
  u.id user_id,
  u.country,
  DATE(EXTRACT(YEAR FROM u.created_at), EXTRACT(MONTH FROM u.created_at), 1) created_account_date,
  o.order_id,
  DATE(EXTRACT(YEAR FROM o.created_at), EXTRACT(MONTH FROM o.created_at), 1) order_date,
  oi.sale_price * o.num_of_item revenue,
  p.department,
  p.cost,
  e.session_id,
  e.traffic_source traffic_session,
FROM `bigquery-public-data.thelook_ecommerce.users` u
LEFT JOIN `bigquery-public-data.thelook_ecommerce.order_items` oi
  ON oi.user_id = u.id
LEFT JOIN `bigquery-public-data.thelook_ecommerce.orders` o
  ON u.id = o.user_id AND oi.order_id = o.order_id
INNER JOIN `bigquery-public-data.thelook_ecommerce.products` p
  ON oi.product_id = p.id
LEFT JOIN `bigquery-public-data.thelook_ecommerce.events` e
  ON u.id = e.id
WHERE 
  o.status ='Complete'
"""

df = client.query(query).to_dataframe()
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44943 entries, 0 to 44942
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               44943 non-null  Int64  
 1   country               44943 non-null  object 
 2   created_account_date  44943 non-null  dbdate 
 3   order_id              44943 non-null  Int64  
 4   order_date            44943 non-null  dbdate 
 5   revenue               44943 non-null  float64
 6   department            44943 non-null  object 
 7   cost                  44943 non-null  float64
 8   session_id            44943 non-null  object 
 9   traffic_session       44943 non-null  object 
dtypes: Int64(2), dbdate(2), float64(2), object(4)
memory usage: 3.5+ MB
None


Unnamed: 0,user_id,country,created_account_date,order_id,order_date,revenue,department,cost,session_id,traffic_session
0,1531,United States,2022-01-01,1907,2024-04-01,10.69,Men,6.54228,cc8bd484-c346-44fe-a8f5-ac477f0457be,Email
1,56227,United States,2020-12-01,70169,2023-10-01,12.99,Women,5.29992,457b5587-330c-4984-8892-776dc4d5770d,Email
2,1062,Brasil,2024-06-01,1325,2024-06-01,3.99,Men,1.53615,fedafd5a-6b57-4f1a-a194-0f74c9830fbd,Email
3,48167,Brasil,2022-05-01,60270,2023-04-01,198.0,Men,56.034,1a4a96e2-402a-4633-ab86-d15188ba181a,Email
4,48167,Brasil,2022-05-01,60270,2023-04-01,199.979996,Men,56.794319,1a4a96e2-402a-4633-ab86-d15188ba181a,Email


In [164]:
df['order_id'] = df['order_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)
df['order_date'] = pd.to_datetime(df['order_date'])
df['created_account_date'] = pd.to_datetime(df['created_account_date'])

print("Number of unique user_id: ", df['user_id'].nunique())

print(df.info())

Number of unique user_id:  27450
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44943 entries, 0 to 44942
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               44943 non-null  object        
 1   country               44943 non-null  object        
 2   created_account_date  44943 non-null  datetime64[ns]
 3   order_id              44943 non-null  object        
 4   order_date            44943 non-null  datetime64[ns]
 5   revenue               44943 non-null  float64       
 6   department            44943 non-null  object        
 7   cost                  44943 non-null  float64       
 8   session_id            44943 non-null  object        
 9   traffic_session       44943 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(6)
memory usage: 3.4+ MB
None


In [157]:
df_wrangled = df.copy()

# Create helper function
def categorized_country(x):
    if x in ['China', 'United States', 'Brasil', 'South Korea', 'France', 'United Kingdom', 'Spain', 'Germany']:
        return x
    else:
        return 'Others'
    
df_wrangled['country'] = df_wrangled['country'].apply(categorized_country)

# Create new field 'age account'
currentDate = df_wrangled['order_date'].max()
df_wrangled['age_account'] = (df_wrangled['order_date'] - df_wrangled['created_account_date']) / pd.to_timedelta(1, 'day')

# Bin the 'age account'
def binned(x):
    if x <= 30:
        return 'less_1month'
    elif x <= 120:
        return 'less_3month'
    elif x <= 180:
        return 'less_6month'
    elif x <= 360:
        return 'less_12month'
    else:
        return 'more_12month'

df_wrangled['age_account'] = df_wrangled['age_account'].apply(binned)

# Encoding
enc_columns = ['country', 'department', 'traffic_session', 'age_account']

for c in enc_columns:
    temp = pd.get_dummies(df_wrangled[c]).astype(int)
    temp.columns = [c + '_' + i.replace(" ", "") for i in temp.columns]
    df_wrangled = pd.concat([df_wrangled, temp], axis=1)

df_wrangled = df_wrangled.drop(['country', 'department', 'traffic_session', 'age_account', 'created_account_date'], axis=1)

print(df_wrangled.info())
df_wrangled.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44943 entries, 0 to 44942
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   user_id                   44943 non-null  object        
 1   order_id                  44943 non-null  object        
 2   order_date                44943 non-null  datetime64[ns]
 3   revenue                   44943 non-null  float64       
 4   cost                      44943 non-null  float64       
 5   session_id                44943 non-null  object        
 6   country_Brasil            44943 non-null  int32         
 7   country_China             44943 non-null  int32         
 8   country_France            44943 non-null  int32         
 9   country_Germany           44943 non-null  int32         
 10  country_Others            44943 non-null  int32         
 11  country_SouthKorea        44943 non-null  int32         
 12  country_Spain     

Unnamed: 0,user_id,order_id,order_date,revenue,cost,session_id,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_12month,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_more_12month
0,1531,1907,2024-04-01,10.69,6.54228,cc8bd484-c346-44fe-a8f5-ac477f0457be,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1
1,56227,70169,2023-10-01,12.99,5.29992,457b5587-330c-4984-8892-776dc4d5770d,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1
2,1062,1325,2024-06-01,3.99,1.53615,fedafd5a-6b57-4f1a-a194-0f74c9830fbd,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
3,48167,60270,2023-04-01,198.0,56.034,1a4a96e2-402a-4633-ab86-d15188ba181a,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0
4,48167,60270,2023-04-01,199.979996,56.794319,1a4a96e2-402a-4633-ab86-d15188ba181a,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0


In [158]:
map_aggregate = {
    'cost': 'sum',
    'session_id': 'nunique',
    'revenue': 'sum',
    'country_Brasil': 'sum',
    'country_China': 'sum',
    'country_France': 'sum',
    'country_Germany': 'sum',
    'country_Others': 'sum',
    'country_SouthKorea': 'sum',
    'country_Spain': 'sum',
    'country_UnitedKingdom': 'sum',
    'country_UnitedStates': 'sum',
    'department_Men': 'sum',
    'department_Women': 'sum',
    'traffic_session_Adwords': 'sum',
    'traffic_session_Email': 'sum',
    'traffic_session_Facebook': 'sum',
    'traffic_session_Organic': 'sum',
    'traffic_session_YouTube': 'sum',
    'age_account_less_1month': 'sum',
    'age_account_less_3month': 'sum',
    'age_account_less_6month': 'sum',
    'age_account_less_12month': 'sum',
    'age_account_more_12month': 'sum',
}

df_wrangled = df_wrangled.groupby(['order_date']).agg(map_aggregate)

print(df_wrangled.info())
df_wrangled.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 66 entries, 2019-01-01 to 2024-06-01
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   cost                      66 non-null     float64
 1   session_id                66 non-null     int64  
 2   revenue                   66 non-null     float64
 3   country_Brasil            66 non-null     int32  
 4   country_China             66 non-null     int32  
 5   country_France            66 non-null     int32  
 6   country_Germany           66 non-null     int32  
 7   country_Others            66 non-null     int32  
 8   country_SouthKorea        66 non-null     int32  
 9   country_Spain             66 non-null     int32  
 10  country_UnitedKingdom     66 non-null     int32  
 11  country_UnitedStates      66 non-null     int32  
 12  department_Men            66 non-null     int32  
 13  department_Women          66 non-null     int32

Unnamed: 0_level_0,cost,session_id,revenue,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2019-01-01,244.3952,4,726.900002,1,3,0,0,0,1,0,0,0,0,5,3,0,2,0,0,5,0,0,0,0
2019-02-01,285.354421,7,858.780005,1,4,1,0,0,0,0,0,3,6,3,5,1,3,0,0,0,9,0,0,0
2019-03-01,423.426619,18,1063.149997,2,7,1,0,0,0,0,4,7,13,8,4,4,6,5,2,13,8,0,0,0
2019-04-01,1339.302591,27,5296.579993,8,18,2,0,1,1,0,1,8,21,18,15,15,8,1,0,3,36,0,0,0
2019-05-01,1463.102206,39,6796.480039,19,21,6,1,2,0,0,4,5,27,31,12,42,1,0,3,23,35,0,0,0


In [159]:
df_wrangled.corr()

Unnamed: 0,cost,session_id,revenue,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month
cost,1.0,0.997982,0.999308,0.994445,0.996562,0.963792,0.974528,0.962536,0.975716,0.975219,0.978713,0.995259,0.998785,0.997926,0.99655,0.997589,0.982982,0.961764,0.990317,0.643455,0.930815,0.925361,0.945104,0.91504
session_id,0.997982,1.0,0.995907,0.993991,0.996251,0.960897,0.974183,0.962239,0.97543,0.978492,0.978415,0.992709,0.997827,0.99723,0.995906,0.996619,0.980974,0.959475,0.991463,0.616594,0.920326,0.920706,0.95013,0.931747
revenue,0.999308,0.995907,1.0,0.993777,0.995933,0.96344,0.974408,0.962463,0.973448,0.974574,0.976931,0.994883,0.998096,0.997282,0.996012,0.996994,0.982288,0.961615,0.988757,0.654193,0.932326,0.923908,0.941608,0.908296
country_Brasil,0.994445,0.993991,0.993777,1.0,0.990355,0.955827,0.971821,0.957612,0.968717,0.971757,0.976323,0.987773,0.994035,0.993731,0.992447,0.993374,0.97876,0.954128,0.985331,0.629946,0.924405,0.925858,0.944992,0.915755
country_China,0.996562,0.996251,0.995933,0.990355,1.0,0.960371,0.97218,0.959014,0.969661,0.975607,0.971844,0.992309,0.997001,0.997191,0.992918,0.99792,0.977504,0.967336,0.989242,0.653958,0.926686,0.914497,0.940938,0.911071
country_France,0.963792,0.960897,0.96344,0.955827,0.960371,1.0,0.916366,0.908044,0.953906,0.9281,0.947451,0.96122,0.960722,0.967346,0.96605,0.959168,0.943774,0.949286,0.957603,0.675566,0.899528,0.870728,0.899986,0.860136
country_Germany,0.974528,0.974183,0.974408,0.971821,0.97218,0.916366,1.0,0.956504,0.932153,0.954304,0.953959,0.965911,0.974193,0.973914,0.975471,0.972582,0.96401,0.932543,0.958825,0.604142,0.905997,0.916645,0.923641,0.904258
country_Others,0.962536,0.962239,0.962463,0.957612,0.959014,0.908044,0.956504,1.0,0.917423,0.958263,0.936634,0.950031,0.960569,0.964473,0.962526,0.964991,0.934031,0.916677,0.953709,0.601122,0.903515,0.906662,0.909768,0.888985
country_SouthKorea,0.975716,0.97543,0.973448,0.968717,0.969661,0.953906,0.932153,0.917423,1.0,0.941691,0.949663,0.971759,0.973801,0.973788,0.976577,0.9669,0.966767,0.931392,0.976029,0.617819,0.878538,0.901809,0.938397,0.903723
country_Spain,0.975219,0.978492,0.974574,0.971757,0.975607,0.9281,0.954304,0.958263,0.941691,1.0,0.956202,0.962498,0.976552,0.974554,0.972899,0.976989,0.960327,0.927583,0.96733,0.558181,0.906747,0.901275,0.943926,0.929727


In [160]:
df_wrangled.corr()['revenue']

cost                        0.999308
session_id                  0.995907
revenue                     1.000000
country_Brasil              0.993777
country_China               0.995933
country_France              0.963440
country_Germany             0.974408
country_Others              0.962463
country_SouthKorea          0.973448
country_Spain               0.974574
country_UnitedKingdom       0.976931
country_UnitedStates        0.994883
department_Men              0.998096
department_Women            0.997282
traffic_session_Adwords     0.996012
traffic_session_Email       0.996994
traffic_session_Facebook    0.982288
traffic_session_Organic     0.961615
traffic_session_YouTube     0.988757
age_account_less_1month     0.654193
age_account_less_3month     0.932326
age_account_less_6month     0.923908
age_account_less_12month    0.941608
age_account_more_12month    0.908296
Name: revenue, dtype: float64

In [161]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.metrics import mean_squared_error, r2_score


category_dict = {'department': ['department_Men', 'department_Women'],
                 'event_traffic_source': ['traffic_session_Adwords', 'traffic_session_Email', 'traffic_session_Facebook',
                                          'traffic_session_Organic', 'traffic_session_YouTube'],
                 'age_account_category': ['age_account_less_1month', 'age_account_less_3month', 
                                          'age_account_less_6month', 'age_account_less_12month', 
                                          'age_account_more_12month']
                }


y = df_wrangled['revenue']
result = pd.DataFrame()
for c in category_dict.keys():
    X = df_wrangled[category_dict[c]]
    model = LinearRegression()
    model.fit(X, y)
    temp = pd.DataFrame(model.coef_, index=X.columns, columns=[c]).T
    result = pd.concat([temp, result])
    
    print(f"RMSE {c}", mean_squared_error(model.predict(X), y)**0.5)
    print(f"R2 Score {c}", r2_score(model.predict(X), y))

result

RMSE department 3965.0514003645226
R2 Score department 0.9972682074495577
RMSE event_traffic_source 3814.53659132423
R2 Score event_traffic_source 0.9974721849818041
RMSE age_account_category 3886.2130246444517
R2 Score age_account_category 0.9973760438599347


Unnamed: 0,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,department_Men,department_Women
age_account_category,111.369732,124.807616,125.235368,129.934953,106.022428,,,,,,,
event_traffic_source,,,,,,126.975929,126.663415,146.188453,8.773832,54.900217,,
department,,,,,,,,,,,141.277878,88.701125


In [162]:
# result.to_csv('./coefficient-result.csv')