### Relax Data Challenge

In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [330]:
df_users = pd.read_csv("takehome_users.csv", encoding = "ISO-8859-1")

In [331]:
df_users

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1.363735e+09,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1.369210e+09,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1.358850e+09,0,0,193,5240.0
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1.378448e+09,0,0,89,8263.0
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1.398603e+09,1,1,83,8074.0
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1.338638e+09,0,0,6,


In [332]:
df_engage = pd.read_csv("takehome_user_engagement.csv")

In [333]:
df_engage

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
...,...,...,...
207912,2013-09-06 06:14:15,11996,1
207913,2013-01-15 18:28:37,11997,1
207914,2014-04-27 12:45:16,11998,1
207915,2012-06-02 11:55:59,11999,1


In [334]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [335]:
df_engage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


No missing values in the engagement dataset. However in the users dataframe looks like the last_session creation time has a wrong format and has some missing values along with invited by user id column.\
First let's convert the time_stamp column in engagement dataset to datatime format for ease of use

In [336]:
df_engage['time_stamp'] = pd.to_datetime(df_engage['time_stamp'])
df_users['last_session_creation_time'] = pd.to_datetime(df_users['last_session_creation_time'])

In [337]:
# let's make the time stamp as the index
df_engage.index=df_engage['time_stamp']
df_engage.drop(labels='time_stamp',axis=1,inplace=True)

In [338]:
#df_users = df_users.rename({"object_id":"user_id"}, axis=1)
df_users.index = df_users['object_id']
df_users.drop(labels='object_id',axis=1,inplace=True)

In [339]:
#Group by user_id and resample to 1 week period, sum over period
df_agg = df_engage.groupby([pd.Grouper(freq='W'),'user_id']).sum() # Grouper will group and resample data as well for each week

In [340]:
# defining an 'adopted user'
#find all user id's with a sum of 3 or more indicating an adopted user
df_adopt = df_agg[df_agg.visited>=3].unstack().melt()
adopted_users = pd.DataFrame(df_adopt.user_id.unique(),index=range(df_adopt.user_id.unique().shape[0]),columns=['user_id'])
adopted = list(adopted_users.user_id)

In [341]:
df_users['adopted_user'] = 0
for user_id in df_users.index:
    if user_id in adopted:
        df_users.loc[user_id, 'adopted_user'] = 1

In [342]:
df_users

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.398138810,1,0,11,10803.0,0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,316.0,1
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1970-01-01 00:00:01.363734892,0,0,94,1525.0,0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.369210168,0,0,1,5151.0,0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.358849660,0,0,193,5240.0,0
...,...,...,...,...,...,...,...,...,...,...
11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1970-01-01 00:00:01.378448055,0,0,89,8263.0,0
11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1970-01-01 00:00:01.358274517,0,0,200,,0
11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1970-01-01 00:00:01.398602716,1,1,83,8074.0,0
11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1970-01-01 00:00:01.338638159,0,0,6,,0


In [343]:
df_users['adopted_user'].sum()

1445

So we have a total of 1445 adopted user who logged in at least 3 times in a week.

In [344]:
# Let's map the invited_by_user_id column, so can take into account the null values
invite = lambda row: 0 if np.isnan(row) else 1
df_users["invited_by_user"] = df_users["invited_by_user_id"].apply(invite)

In [345]:
# let's drop some irrelevant columns
df_users = df_users.drop(['creation_time', 'name', 'email', 'last_session_creation_time', 'invited_by_user_id', 'org_id'], axis=1)

In [346]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 5 columns):
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
adopted_user                  12000 non-null int64
invited_by_user               12000 non-null int64
dtypes: int64(4), object(1)
memory usage: 882.5+ KB


In [362]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# machine learning pipeline #
X = df_users.loc[:, df_users.columns != 'adopted_user']
y = df_users['adopted_user']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

pipeline = Pipeline(steps=[("encoder", OneHotEncoder()), \
                           ("rf", RandomForestClassifier(random_state = 42))])

params = {"rf__n_estimators" : [50, 75, 100],
          "rf__max_depth" : [5, 10, 15]}

cv = GridSearchCV(pipeline, param_grid=params, cv=5)
cv.fit(X_train, y_train)

print(f"Best parameters: {cv.best_params_}")
print(f"Training accuracy score from tuned model: \
       {cv.best_score_*100:.1f}%")

Best parameters: {'rf__max_depth': 5, 'rf__n_estimators': 50}
Training accuracy score from tuned model:        88.0%


In [363]:
#test set score
y_pred = cv.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {test_accuracy*100:.2f}%")

Model accuracy: 87.97%


In [369]:
# Created a dummy to replicate the pipeline to get important features

X_dum = pd.get_dummies(X_test)
pipeline.fit(X_dum, y_test)

fe = pipeline.named_steps["rf"].feature_importances_

feature_importance = zip(X_dum.columns, fe)
feature_importance = sorted(feature_importance, key=lambda x:x[1], reverse=True)

for i, j in feature_importance:
    print(f"Weight: {j:.3f} | Feature: {i}")

Weight: 0.103 | Feature: invited_by_user
Weight: 0.086 | Feature: opted_in_to_mailing_list
Weight: 0.084 | Feature: creation_source_GUEST_INVITE
Weight: 0.080 | Feature: enabled_for_marketing_drip
Weight: 0.066 | Feature: creation_source_SIGNUP
Weight: 0.049 | Feature: creation_source_SIGNUP_GOOGLE_AUTH
Weight: 0.020 | Feature: creation_source_PERSONAL_PROJECTS
Weight: 0.015 | Feature: creation_source_ORG_INVITE


From the raw dataset, we've utilized the following as our features:

invited_by_user - if a user was referred by another user \
opted_in_to_mailing_list - whether user has opted into receiving marketing emails \
creation_source_GUEST_INVITE - how the account was created (by a guest invitation) \
enabled_for_marketing_drip - whether they are on the regular marketing email drip 

The model proved itself well having a final accuracy metric comparable to the cross-validation training score (both at ~88%). Which would mean that our pipeline's feature ranking is likewise reliable in determining what a good predictor for user adoption is. Thanks to one-hot encoding we can clearly see quite specifically what the business could do to potentially boost the likelihood user engagement.