In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import the files

Import the two files, starting with the users files. Let's create a column of boolean corresponding to either being an adopted user or not

In [56]:
users = pd.read_csv("takehome_users.csv",encoding = "ISO-8859-1")
users['adopted_user'] = np.nan
users.head(5)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,


We then import the logins file

In [5]:
userlogins = pd.read_csv("takehome_user_engagement.csv")
userlogins.head(10)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


## Generating the data

We create a table grouped by user_id and which ranks the logins chronologically. We then add a column corresponding to the logins in seconds (with respect to the absolute time) and the day of the logins

In [51]:
# index = [tuple([df1.user_id[i], df1.index[i]]) for i in range(len(userlogins))]
# index = pd.MultiIndex.from_tuples(index, names=['user_id', 'old_index'])

In [23]:
df1 = userlogins.drop(['visited'], axis = 1)
df1.time_stamp = pd.to_datetime(df1.time_stamp)
df1 = df1.sort_values(['user_id','time_stamp'])

In [25]:
df2 = df1.groupby(by=['user_id','time_stamp']).count()
df2['time_seconds']=df2.index.get_level_values(1).strftime('%s').astype(int)
df2['day'] = df2.index.get_level_values(1).date

In [38]:
df2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,time_seconds,day
user_id,time_stamp,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2014-04-22 03:53:30,1398153210,2014-04-22
2,2013-11-15 03:45:04,1384505104,2013-11-15
2,2013-11-29 03:45:04,1385714704,2013-11-29
2,2013-12-09 03:45:04,1386578704,2013-12-09
2,2013-12-25 03:45:04,1387961104,2013-12-25
2,2013-12-31 03:45:04,1388479504,2013-12-31
2,2014-01-08 03:45:04,1389170704,2014-01-08
2,2014-02-03 03:45:04,1391417104,2014-02-03
2,2014-02-08 03:45:04,1391849104,2014-02-08
2,2014-02-09 03:45:04,1391935504,2014-02-09


For each user, we count the number of logins. If that number is less than 3, automatically, the user isn't adopted. If the number is greater or equal than 3, we count the time difference between a login and the two consecutive ones. If both time differences are less than a week (in seconds) and if the three logins are on different days, the user is adopted. By definition, an adopted user needs to satisfy this condition at least once

In [52]:
secondsperweek = 7*24*60*60

for i in range(len(users)+1):
    s1 = df2.query('user_id == '+ str(i))['time_seconds'].tolist()
    s2 = df2.query('user_id == '+ str(i))['day'].tolist()
    nlogins = len(s1)
    if nlogins <= 2:
        users.loc[users.object_id == i,'adopted_user'] = False
    else:
        conds = [ ((s1[j+1] - s1[j]) < secondsperweek) & ((s1[j+2] - s1[j]) < secondsperweek) 
                 & (s2[j+1] != s2[j]) & (s2[j+2] != s2[j]) for j in range(nlogins-2) ]
        if sum(conds) >= 1:
            users.loc[users.object_id == i,'adopted_user'] = True
        else:
            users.loc[users.object_id == i,'adopted_user'] = False

In [53]:
users.adopted_user.value_counts(dropna = False, normalize = True)

False    0.866417
True     0.133583
Name: adopted_user, dtype: float64

Only about 13% of the users are adopted users

In [54]:
users.to_csv('takehome_adopted_users.csv', index = False)

## Cleaning data and Basic analysis

Now that we have our column of adopted users, let's clean the data and generate some features that we think may have an impact on our target: was the user invited? did the user invite others and how many? on what day of the week did the user created their account?

In [6]:
adopted_users = pd.read_csv("takehome_adopted_users.csv",encoding = "ISO-8859-1")

In [7]:
adopted_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [8]:
# df['last_session_creation_time'] = pd.to_datetime(df['last_session_creation_time'], unit='s')
df = adopted_users
df = df.set_index('object_id')
df.creation_time = pd.to_datetime(df.creation_time)
df['creation_dayofweek']= df.creation_time.dt.dayofweek
df = df.drop(['name','email','creation_time','last_session_creation_time'], axis = 1)

In [10]:
df['invited_others'] = 0
for i in range(1, len(df)+1):
    try:
        df.loc[ int(df.loc[i]['invited_by_user_id']) , 'invited_others'] += 1
    except:
        pass
    
df['was_invited'] = (df.invited_by_user_id > 1)

#how many times did a user invite another one: not used in our model after all

In [11]:
df.adopted_user = df.adopted_user.astype(int)
df.was_invited = df.was_invited.astype(int)
df = df.drop(['invited_by_user_id'], axis = 1)

In [15]:
df = df[['creation_source','opted_in_to_mailing_list','enabled_for_marketing_drip', 'was_invited',
 'creation_dayofweek','adopted_user']]
df.head()

Unnamed: 0_level_0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,was_invited,creation_dayofweek,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,GUEST_INVITE,1,0,1,1,0
2,ORG_INVITE,0,0,1,4,1
3,ORG_INVITE,0,0,1,1,0
4,GUEST_INVITE,0,0,1,1,0
5,GUEST_INVITE,0,0,1,3,0


Let's quickly if some of the categorical features, considered individually, have an impact on the ratio of adopted users.

In [30]:
dfca = pd.crosstab(df.creation_source,df.adopted_user)
dfca = dfca.div(dfca.sum(axis=1), axis=0)
display(dfca)

dfra = pd.crosstab(df.was_invited,df.adopted_user)
dfra = dfra.div(dfra.sum(axis=1), axis=0)
display(dfra)

pd.crosstab(df.was_invited,df.creation_source)

adopted_user,0,1
creation_source,Unnamed: 1_level_1,Unnamed: 2_level_1
GUEST_INVITE,0.833564,0.166436
ORG_INVITE,0.870005,0.129995
PERSONAL_PROJECTS,0.922312,0.077688
SIGNUP,0.859607,0.140393
SIGNUP_GOOGLE_AUTH,0.831769,0.168231


adopted_user,0,1
was_invited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.876411,0.123589
1,0.857722,0.142278


creation_source,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
was_invited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,2111,2087,1385
1,2163,4254,0,0,0


In [115]:
df.to_csv('takehome_adopted_users_clean.csv', index = False)

## Predictive Model

Let's now build a classifier using ... Through that model, can we find the factors that predict futur user adoption? Are there any that negativaly correlate with adoption?

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV,  cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score 
from sklearn.metrics import log_loss, precision_recall_curve, precision_score

#df = pd.read_csv("takehome_adopted_users_clean.csv")

In [37]:
df2 = df.copy()
df2['creation_monday'] = (df2.creation_dayofweek == 0).astype(int)
df2['creation_tuesday'] = (df2.creation_dayofweek == 1).astype(int)
df2['creation_wednesday'] = (df2.creation_dayofweek == 2).astype(int)
df2['creation_thursday'] = (df2.creation_dayofweek == 3).astype(int)
df2['creation_friday'] = (df2.creation_dayofweek == 4).astype(int)
df2['creation_saturday'] = (df2.creation_dayofweek == 5).astype(int)

df2 = df2.join(pd.get_dummies(df2.creation_source, drop_first=True))
df2 = df2.drop(['creation_source','creation_dayofweek','was_invited'], axis = 1)
df2 = df2[['adopted_user','opted_in_to_mailing_list', 'enabled_for_marketing_drip', #'was_invited',
         'creation_monday', 'creation_tuesday','creation_wednesday', 
           'creation_thursday', 'creation_friday','creation_saturday',
           'ORG_INVITE', 'PERSONAL_PROJECTS', 'SIGNUP','SIGNUP_GOOGLE_AUTH']]

In [38]:
display(df2.head())
df2.adopted_user.value_counts()

Unnamed: 0_level_0,adopted_user,opted_in_to_mailing_list,enabled_for_marketing_drip,creation_monday,creation_tuesday,creation_wednesday,creation_thursday,creation_friday,creation_saturday,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,1,0,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,0,0,0


0    10397
1     1603
Name: adopted_user, dtype: int64

In [39]:
y = df2.adopted_user
X = df2.drop('adopted_user', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# pipeline = Pipeline([('classifier', RandomForestClassifier())])
# search_space = [{'classifier': [LogisticRegression()],
#                  'classifier__C': np.logspace(0, 4, 10)},
#                 {'classifier': [KNeighborsClassifier()],
#                  'classifier__n_neighbors': [3,4]},
#                 {'classifier': [RandomForestClassifier()],
#                  'classifier__n_estimators': [10, 100, 1000],
#                  'classifier__max_features': [1, 2, 3]}]
# gm_cv = GridSearchCV(pipeline, search_space, cv=5, scoring = 'recall')
# best_model = gm_cv.fit(X_train, y_train)
# best_model.best_estimator_.get_params()['classifier']

pipeline = Pipeline([('classifier', LogisticRegression(class_weight='balanced', random_state = 0))])
search_space = {'classifier__C': np.logspace(-5, 4, 10)}
gm_cv = GridSearchCV(pipeline, search_space, cv=5, scoring = 'recall')
gm_cv.fit(X_train, y_train)

y_pred = gm_cv.predict(X_test)
print(classification_report(y_pred, y_test))
print(gm_cv.score(X_test, y_test))

              precision    recall  f1-score   support

           0       0.41      0.90      0.56       959
           1       0.68      0.14      0.23      1441

    accuracy                           0.44      2400
   macro avg       0.54      0.52      0.40      2400
weighted avg       0.57      0.44      0.36      2400

0.6767676767676768


In [40]:
final = gm_cv.best_estimator_.get_params()['classifier']
coefs = final.coef_[0]
b = final.intercept_[0]

In [46]:
t1 = X.columns.tolist()
t2 = coefs
t = pd.DataFrame.from_dict({'features': t1, 'beta_coefs':t2, 'ratio odds': np.exp(t2)})
t.sort_values(by = 'beta_coefs', ascending = False)[['features','ratio odds']].head(15)

Unnamed: 0,features,ratio odds
11,SIGNUP_GOOGLE_AUTH,1.007098
0,opted_in_to_mailing_list,1.002682
1,enabled_for_marketing_drip,1.002551
7,creation_saturday,1.002515
4,creation_wednesday,1.001635
10,SIGNUP,1.001572
3,creation_tuesday,0.99954
5,creation_thursday,0.999206
6,creation_friday,0.998641
8,ORG_INVITE,0.998444


$ \text{logit}(p) = \log \left( \frac{p}{1 - p} \right) = \beta_0 + \sum_i \beta_i x_i$

Ratio of the odds for feature $i$ is $ \exp(\beta_i)$

Check writeup found in this directory for comments on results