# Relax Inc Challenge

Defining  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate days  in  at  least  one  seven day  period, identify  which  factors  predict  future  user adoption.


In [106]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from datetime import timedelta
import datetime as dt

In [107]:
eng = pd.read_csv('takehome_user_engagement.csv')
user = pd.read_csv('takehome_users.csv', encoding='latin-1')

## Identifying Adopted Users

In [108]:
eng.head(10)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [109]:
eng.dtypes

time_stamp    object
user_id        int64
visited        int64
dtype: object

In [110]:
eng['time_stamp'] = pd.to_datetime(eng['time_stamp'], format='%Y-%m-%d %H:%M:%S')

In [111]:
eng.sort_values(by='user_id', inplace=True)

In [112]:
eng.reset_index(inplace=True, drop=True)

In [113]:
eng

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2014-03-31 03:45:04,2,1
2,2014-03-13 03:45:04,2,1
3,2014-03-09 03:45:04,2,1
4,2014-02-16 03:45:04,2,1
...,...,...,...
207912,2013-09-06 06:14:15,11996,1
207913,2013-01-15 18:28:37,11997,1
207914,2014-04-27 12:45:16,11998,1
207915,2012-06-02 11:55:59,11999,1


In [114]:
# group by user id and each week they visited
weekly_sum = eng.groupby(['user_id', eng['time_stamp'].dt.strftime('%W')])['visited'].sum()
weekly_sum

user_id  time_stamp
1        16            1
2        01            1
         05            3
         06            2
         09            1
                      ..
11996    35            1
11997    02            1
11998    16            1
11999    22            1
12000    03            1
Name: visited, Length: 57910, dtype: int64

In [115]:
weekly_sum = weekly_sum.reset_index()

In [116]:
weekly_sum

Unnamed: 0,user_id,time_stamp,visited
0,1,16,1
1,2,01,1
2,2,05,3
3,2,06,2
4,2,09,1
...,...,...,...
57905,11996,35,1
57906,11997,02,1
57907,11998,16,1
57908,11999,22,1


In [117]:
# only keep max number of times visited because it's enough to tell whether someone is active
max_visits = weekly_sum.groupby('user_id').agg({'visited':'max'})
max_visits

Unnamed: 0_level_0,visited
user_id,Unnamed: 1_level_1
1,1
2,3
3,1
4,1
5,1
...,...
11996,1
11997,1
11998,1
11999,1


In [118]:
max_visits.loc[max_visits['visited'] >= 3, 'active'] = 1
max_visits.loc[max_visits['visited'] < 3, 'active'] = 0

In [119]:
max_visits = max_visits.reset_index()
max_visits

Unnamed: 0,user_id,visited,active
0,1,1,0.0
1,2,3,1.0
2,3,1,0.0
3,4,1,0.0
4,5,1,0.0
...,...,...,...
8818,11996,1,0.0
8819,11997,1,0.0
8820,11998,1,0.0
8821,11999,1,0.0


In [120]:
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [121]:
user = user.merge(max_visits, how='inner', left_on='object_id', right_on='user_id')
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,visited,active
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1,1,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2,3,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3,1,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4,1,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5,1,0.0


## Preparing the Data

In [122]:
user.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,visited,active
count,8823.0,8823.0,8823.0,8823.0,8823.0,4776.0,8823.0,8823.0,8823.0
mean,6019.821716,1379279000.0,0.252295,0.151989,142.572254,5980.495394,6019.821716,1.974838,0.163663
std,3464.251001,19531160.0,0.434354,0.359031,124.176422,3394.211361,3464.251001,2.442177,0.369991
min,1.0,1338452000.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0
25%,3017.5,1363195000.0,0.0,0.0,30.0,3071.0,3017.5,1.0,0.0
50%,6034.0,1382888000.0,0.0,0.0,109.0,5947.0,6034.0,1.0,0.0
75%,9029.5,1398443000.0,1.0,0.0,239.0,8857.25,9029.5,1.0,0.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0,12000.0,14.0,1.0


In [123]:
user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 8822
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   8823 non-null   int64  
 1   creation_time               8823 non-null   object 
 2   name                        8823 non-null   object 
 3   email                       8823 non-null   object 
 4   creation_source             8823 non-null   object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    8823 non-null   int64  
 7   enabled_for_marketing_drip  8823 non-null   int64  
 8   org_id                      8823 non-null   int64  
 9   invited_by_user_id          4776 non-null   float64
 10  user_id                     8823 non-null   int64  
 11  visited                     8823 non-null   int64  
 12  active                      8823 non-null   float64
dtypes: float64(3), int64(6), object(4

In [125]:
# change invited by user_id to 1 for yes and 0 for no 
user.loc[~user['invited_by_user_id'].isnull(), 'invited_by_user_id'] = 1
user.loc[user['invited_by_user_id'].isnull(), 'invited_by_user_id'] = 0

In [126]:
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,visited,active
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,1.0,1,1,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,1.0,2,3,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1.0,3,1,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,1.0,4,1,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,1.0,5,1,0.0


In [104]:
user['creation_source'].value_counts()

ORG_INVITE            3188
SIGNUP                1898
GUEST_INVITE          1588
SIGNUP_GOOGLE_AUTH    1385
PERSONAL_PROJECTS      764
Name: creation_source, dtype: int64

In [128]:
df = pd.get_dummies(user, columns=['creation_source'])
df

Unnamed: 0,object_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,visited,active,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,1.398139e+09,1,0,11,1.0,1,1,0.0,1,0,0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,1.396238e+09,0,0,1,1.0,2,3,1.0,0,1,0,0,0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,1.363735e+09,0,0,94,1.0,3,1,0.0,0,1,0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,1.369210e+09,0,0,1,1.0,4,1,0.0,1,0,0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,1.358850e+09,0,0,193,1.0,5,1,0.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,1.378448e+09,0,0,89,1.0,11996,1,0.0,0,1,0,0,0
8819,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,1.358275e+09,0,0,200,0.0,11997,1,0.0,0,0,0,0,1
8820,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,1.398603e+09,1,1,83,1.0,11998,1,0.0,1,0,0,0,0
8821,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,1.338638e+09,0,0,6,0.0,11999,1,0.0,0,0,1,0,0


## Building the Model

In [155]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [129]:
df.columns

Index(['object_id', 'creation_time', 'name', 'email',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id', 'user_id',
       'visited', 'active', 'creation_source_GUEST_INVITE',
       'creation_source_ORG_INVITE', 'creation_source_PERSONAL_PROJECTS',
       'creation_source_SIGNUP', 'creation_source_SIGNUP_GOOGLE_AUTH'],
      dtype='object')

In [147]:
X = df[['opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'invited_by_user_id', 
     'creation_source_GUEST_INVITE',
       'creation_source_ORG_INVITE', 'creation_source_PERSONAL_PROJECTS',
       'creation_source_SIGNUP', 'creation_source_SIGNUP_GOOGLE_AUTH']]
y = df['active']

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

In [149]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr.score(X_test, y_test)

0.827742520398912

In [156]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1826    0]
 [ 380    0]]


In [157]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91      1826
         1.0       0.00      0.00      0.00       380

    accuracy                           0.83      2206
   macro avg       0.41      0.50      0.45      2206
weighted avg       0.69      0.83      0.75      2206



  _warn_prf(average, modifier, msg_start, len(result))


In [150]:
print(lr.coef_)

[[ 0.02368217  0.08212832  0.04388469  0.20810285 -0.16421816  0.14474192
  -0.12371024 -0.06550849]]


In [158]:
coefficients = [0.02368217,  0.08212832,  0.04388469,  0.20810285, -0.16421816,  0.14474192,
  -0.12371024, -0.06550849]
features = ['opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'invited_by_user_id', 
     'creation_source_GUEST_INVITE',
       'creation_source_ORG_INVITE', 'creation_source_PERSONAL_PROJECTS',
       'creation_source_SIGNUP', 'creation_source_SIGNUP_GOOGLE_AUTH']

In [159]:
tuples = list(zip(features, coefficients))
coef_df = pd.DataFrame(tuples, columns=['Features', 'Coefficients'])
coef_df

Unnamed: 0,Features,Coefficients
0,opted_in_to_mailing_list,0.023682
1,enabled_for_marketing_drip,0.082128
2,invited_by_user_id,0.043885
3,creation_source_GUEST_INVITE,0.208103
4,creation_source_ORG_INVITE,-0.164218
5,creation_source_PERSONAL_PROJECTS,0.144742
6,creation_source_SIGNUP,-0.12371
7,creation_source_SIGNUP_GOOGLE_AUTH,-0.065508


I've created a logistic regression model with about 82.7% accuracy in predicting future user adoption. By examining the coefficients, we can show that the features that have the highest impact in increasing future user adoption are guest invites and personal projects as creation sources.   