In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [23]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [24]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [25]:
key_path = './service_account/gentle-keyword-423715-j0-03be08ad6412.json'

credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

In [26]:
from google.cloud import bigquery

client = bigquery.Client(
    credentials = credentials,
    project=credentials.project_id
)

In [27]:
query = """
SELECT
  u.id user_id,
  u.country,
  DATE(EXTRACT(YEAR FROM u.created_at), EXTRACT(MONTH FROM u.created_at), 1) created_account_date,
  o.order_id,
  DATE(EXTRACT(YEAR FROM o.created_at), EXTRACT(MONTH FROM o.created_at), 1) order_date,
  oi.sale_price * o.num_of_item revenue,
  p.department,
  p.cost,
  e.session_id,
  e.traffic_source traffic_session,
FROM `bigquery-public-data.thelook_ecommerce.users` u
LEFT JOIN `bigquery-public-data.thelook_ecommerce.order_items` oi
  ON oi.user_id = u.id
LEFT JOIN `bigquery-public-data.thelook_ecommerce.orders` o
  ON u.id = o.user_id AND oi.order_id = o.order_id
INNER JOIN `bigquery-public-data.thelook_ecommerce.products` p
  ON oi.product_id = p.id
LEFT JOIN `bigquery-public-data.thelook_ecommerce.events` e
  ON u.id = e.id
WHERE 
  o.status ='Complete'
"""

df = client.query(query).to_dataframe()
print(df.info())
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45421 entries, 0 to 45420
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               45421 non-null  Int64  
 1   country               45421 non-null  object 
 2   created_account_date  45421 non-null  dbdate 
 3   order_id              45421 non-null  Int64  
 4   order_date            45421 non-null  dbdate 
 5   revenue               45421 non-null  float64
 6   department            45421 non-null  object 
 7   cost                  45421 non-null  float64
 8   session_id            45421 non-null  object 
 9   traffic_session       45421 non-null  object 
dtypes: Int64(2), dbdate(2), float64(2), object(4)
memory usage: 3.6+ MB
None


Unnamed: 0,user_id,country,created_account_date,order_id,order_date,revenue,department,cost,session_id,traffic_session
0,16543,Spain,2022-08-01,20880,2023-12-01,24.0,Men,11.712,79670672-1534-4ecd-9e3f-f7162f32ca65,Adwords
1,41364,United Kingdom,2022-08-01,52001,2024-05-01,28.5,Men,18.126,599cbcfe-d349-4965-8050-bab6843fca9f,Adwords
2,99112,Brasil,2022-08-01,123931,2023-10-01,32.779999,Men,14.062619,204893ac-01af-41e4-8f7c-607d852358ff,Email
3,78837,France,2022-08-01,98707,2023-03-01,39.0,Men,16.185,1d9e8ef4-c968-449d-9568-a4c73ac1e240,Adwords
4,99112,Brasil,2022-08-01,123930,2023-12-01,150.0,Men,23.3,204893ac-01af-41e4-8f7c-607d852358ff,Email


In [28]:
df['order_id'] = df['order_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)
df['order_date'] = pd.to_datetime(df['order_date'])
df['created_account_date'] = pd.to_datetime(df['created_account_date'])

print("Number of unique user_id: ", df['user_id'].nunique())

print(df.info())

Number of unique user_id:  27551
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45421 entries, 0 to 45420
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               45421 non-null  object        
 1   country               45421 non-null  object        
 2   created_account_date  45421 non-null  datetime64[ns]
 3   order_id              45421 non-null  object        
 4   order_date            45421 non-null  datetime64[ns]
 5   revenue               45421 non-null  float64       
 6   department            45421 non-null  object        
 7   cost                  45421 non-null  float64       
 8   session_id            45421 non-null  object        
 9   traffic_session       45421 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(6)
memory usage: 3.5+ MB
None


In [29]:
df_wrangled = df.copy()

# Create helper function
def categorized_country(x):
    if x in ['China', 'United States', 'Brasil', 'South Korea', 'France', 'United Kingdom', 'Spain', 'Germany']:
        return x
    else:
        return 'Others'
    
df_wrangled['country'] = df_wrangled['country'].apply(categorized_country)

# Create new field 'age account'
currentDate = df_wrangled['order_date'].max()
df_wrangled['age_account'] = (df_wrangled['order_date'] - df_wrangled['created_account_date']) / pd.to_timedelta(1, 'day')

# Bin the 'age account'
def binned(x):
    if x <= 30:
        return 'less_1month'
    elif x <= 120:
        return 'less_3month'
    elif x <= 180:
        return 'less_6month'
    elif x <= 360:
        return 'less_12month'
    else:
        return 'more_12month'

df_wrangled['age_account'] = df_wrangled['age_account'].apply(binned)

# Encoding
df_enc = df_wrangled.copy()
enc_columns = ['country', 'department', 'traffic_session', 'age_account']

for c in enc_columns:
    temp = pd.get_dummies(df_enc[c]).astype(int)
    temp.columns = [c + '_' + i.replace(" ", "") for i in temp.columns]
    df_enc = pd.concat([df_enc, temp], axis=1)

df_enc = df_enc.drop(['country', 'department', 'traffic_session', 'age_account', 'created_account_date'], axis=1)

print(df_enc.info())
df_enc.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45421 entries, 0 to 45420
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   user_id                   45421 non-null  object        
 1   order_id                  45421 non-null  object        
 2   order_date                45421 non-null  datetime64[ns]
 3   revenue                   45421 non-null  float64       
 4   cost                      45421 non-null  float64       
 5   session_id                45421 non-null  object        
 6   country_Brasil            45421 non-null  int32         
 7   country_China             45421 non-null  int32         
 8   country_France            45421 non-null  int32         
 9   country_Germany           45421 non-null  int32         
 10  country_Others            45421 non-null  int32         
 11  country_SouthKorea        45421 non-null  int32         
 12  country_Spain     

Unnamed: 0,user_id,order_id,order_date,revenue,cost,session_id,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_12month,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_more_12month
0,16543,20880,2023-12-01,24.0,11.712,79670672-1534-4ecd-9e3f-f7162f32ca65,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1
1,41364,52001,2024-05-01,28.5,18.126,599cbcfe-d349-4965-8050-bab6843fca9f,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1
2,99112,123931,2023-10-01,32.779999,14.062619,204893ac-01af-41e4-8f7c-607d852358ff,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
3,78837,98707,2023-03-01,39.0,16.185,1d9e8ef4-c968-449d-9568-a4c73ac1e240,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0
4,99112,123930,2023-12-01,150.0,23.3,204893ac-01af-41e4-8f7c-607d852358ff,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1


In [30]:
map_aggregate = {
    'cost': 'sum',
    'session_id': 'nunique',
    'revenue': 'sum',
    'country_Brasil': 'sum',
    'country_China': 'sum',
    'country_France': 'sum',
    'country_Germany': 'sum',
    'country_Others': 'sum',
    'country_SouthKorea': 'sum',
    'country_Spain': 'sum',
    'country_UnitedKingdom': 'sum',
    'country_UnitedStates': 'sum',
    'department_Men': 'sum',
    'department_Women': 'sum',
    'traffic_session_Adwords': 'sum',
    'traffic_session_Email': 'sum',
    'traffic_session_Facebook': 'sum',
    'traffic_session_Organic': 'sum',
    'traffic_session_YouTube': 'sum',
    'age_account_less_1month': 'sum',
    'age_account_less_3month': 'sum',
    'age_account_less_6month': 'sum',
    'age_account_less_12month': 'sum',
    'age_account_more_12month': 'sum',
}

df_enc = df_enc.groupby(['order_date']).agg(map_aggregate)

print(df_enc.info())
df_enc.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 66 entries, 2019-01-01 to 2024-06-01
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   cost                      66 non-null     float64
 1   session_id                66 non-null     int64  
 2   revenue                   66 non-null     float64
 3   country_Brasil            66 non-null     int32  
 4   country_China             66 non-null     int32  
 5   country_France            66 non-null     int32  
 6   country_Germany           66 non-null     int32  
 7   country_Others            66 non-null     int32  
 8   country_SouthKorea        66 non-null     int32  
 9   country_Spain             66 non-null     int32  
 10  country_UnitedKingdom     66 non-null     int32  
 11  country_UnitedStates      66 non-null     int32  
 12  department_Men            66 non-null     int32  
 13  department_Women          66 non-null     int32

Unnamed: 0_level_0,cost,session_id,revenue,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2019-01-01,102.23512,4,313.39,2,2,0,0,0,0,0,0,1,3,2,1,4,0,0,0,5,0,0,0,0
2019-02-01,411.855242,12,1233.690004,2,8,0,0,1,1,0,0,3,6,9,10,3,0,0,2,6,9,0,0,0
2019-03-01,789.539874,17,3100.350023,1,5,4,1,1,4,1,0,6,18,5,5,11,4,0,3,18,5,0,0,0
2019-04-01,1203.526666,35,4731.99999,8,21,4,1,1,1,1,2,9,17,31,12,22,5,1,8,7,41,0,0,0
2019-05-01,1547.610635,39,4933.880027,4,27,1,0,3,1,1,1,13,27,24,13,27,3,5,3,19,32,0,0,0


In [31]:
df_enc.corr()

Unnamed: 0,cost,session_id,revenue,country_Brasil,country_China,country_France,country_Germany,country_Others,country_SouthKorea,country_Spain,country_UnitedKingdom,country_UnitedStates,department_Men,department_Women,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month
cost,1.0,0.996786,0.999467,0.993758,0.996762,0.977825,0.978339,0.985213,0.985902,0.98062,0.979081,0.993041,0.998173,0.997929,0.994539,0.997886,0.989889,0.972592,0.987712,0.776089,0.928139,0.905774,0.941043,0.902109
session_id,0.996786,1.0,0.99552,0.991587,0.99608,0.979115,0.974141,0.986705,0.984292,0.97784,0.976023,0.989477,0.99658,0.996084,0.993795,0.996091,0.98744,0.976437,0.982101,0.73554,0.904378,0.911551,0.957908,0.929028
revenue,0.999467,0.99552,1.0,0.992525,0.996168,0.977984,0.976871,0.98554,0.985614,0.980797,0.978201,0.991695,0.997346,0.997222,0.993223,0.997732,0.988279,0.972412,0.986409,0.77411,0.930476,0.906689,0.941308,0.900475
country_Brasil,0.993758,0.991587,0.992525,1.0,0.98922,0.96558,0.972493,0.974186,0.974794,0.972262,0.974222,0.984477,0.993718,0.989835,0.989448,0.991491,0.983767,0.961082,0.981192,0.762953,0.91949,0.884853,0.940285,0.903971
country_China,0.996762,0.99608,0.996168,0.98922,1.0,0.973312,0.972934,0.984776,0.986022,0.975582,0.975574,0.986878,0.996522,0.995951,0.992778,0.99606,0.987904,0.974866,0.98426,0.760324,0.920075,0.912008,0.946761,0.909123
country_France,0.977825,0.979115,0.977984,0.96558,0.973312,1.0,0.949851,0.971708,0.958206,0.965525,0.948761,0.968187,0.975984,0.975816,0.969964,0.97779,0.963574,0.955029,0.966109,0.729047,0.88758,0.9133,0.940149,0.899783
country_Germany,0.978339,0.974141,0.976871,0.972493,0.972934,0.949851,1.0,0.948859,0.969472,0.951358,0.949894,0.980869,0.980816,0.975009,0.978251,0.9761,0.975144,0.948399,0.961903,0.788062,0.920161,0.867083,0.905655,0.869333
country_Others,0.985213,0.986705,0.98554,0.974186,0.984776,0.971708,0.948859,1.0,0.970549,0.970316,0.964141,0.973687,0.98194,0.986438,0.979624,0.983776,0.976949,0.976834,0.969518,0.731281,0.903775,0.923167,0.943485,0.907353
country_SouthKorea,0.985902,0.984292,0.985614,0.974794,0.986022,0.958206,0.969472,0.970549,1.0,0.96193,0.966229,0.979423,0.985006,0.986322,0.979308,0.98509,0.983031,0.962712,0.978762,0.768999,0.90849,0.903573,0.925638,0.892337
country_Spain,0.98062,0.97784,0.980797,0.972262,0.975582,0.965525,0.951358,0.970316,0.96193,1.0,0.961973,0.973113,0.978342,0.980297,0.97878,0.976816,0.972595,0.949376,0.972179,0.766042,0.908263,0.890324,0.923359,0.883033


In [32]:
df_enc.corr()['revenue']

cost                        0.999467
session_id                  0.995520
revenue                     1.000000
country_Brasil              0.992525
country_China               0.996168
country_France              0.977984
country_Germany             0.976871
country_Others              0.985540
country_SouthKorea          0.985614
country_Spain               0.980797
country_UnitedKingdom       0.978201
country_UnitedStates        0.991695
department_Men              0.997346
department_Women            0.997222
traffic_session_Adwords     0.993223
traffic_session_Email       0.997732
traffic_session_Facebook    0.988279
traffic_session_Organic     0.972412
traffic_session_YouTube     0.986409
age_account_less_1month     0.774110
age_account_less_3month     0.930476
age_account_less_6month     0.906689
age_account_less_12month    0.941308
age_account_more_12month    0.900475
Name: revenue, dtype: float64

In [33]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.metrics import mean_squared_error, r2_score


category_dict = {'department': ['department_Men', 'department_Women'],
                 'event_traffic_source': ['traffic_session_Adwords', 'traffic_session_Email', 'traffic_session_Facebook',
                                          'traffic_session_Organic', 'traffic_session_YouTube'],
                 'age_account_category': ['age_account_less_1month', 'age_account_less_3month', 
                                          'age_account_less_6month', 'age_account_less_12month', 
                                          'age_account_more_12month']
                }


y = df_enc['revenue']
result = pd.DataFrame()
for c in category_dict.keys():
    X = df_enc[category_dict[c]]
    model = LinearRegression()
    model.fit(X, y)
    temp = pd.DataFrame(model.coef_, index=X.columns, columns=[c]).T
    result = pd.concat([temp, result])
    
    print(f"RMSE {c}", mean_squared_error(model.predict(X), y)**0.5)
    print(f"R2 Score {c}", r2_score(model.predict(X), y))

result

RMSE department 4346.323033531606
R2 Score department 0.9972622239738125
RMSE event_traffic_source 4087.6256658115203
R2 Score event_traffic_source 0.997579200387568
RMSE age_account_category 3964.4370504683084
R2 Score age_account_category 0.9977232400150035


Unnamed: 0,age_account_less_1month,age_account_less_3month,age_account_less_6month,age_account_less_12month,age_account_more_12month,traffic_session_Adwords,traffic_session_Email,traffic_session_Facebook,traffic_session_Organic,traffic_session_YouTube,department_Men,department_Women
age_account_category,99.766858,145.575739,134.435033,115.127323,105.238631,,,,,,,
event_traffic_source,,,,,,73.489048,148.521241,63.451923,90.231745,136.082993,,
department,,,,,,,,,,,117.130185,111.403626


In [None]:
# result.to_csv('./result/coefficient-result.csv')