In [1]:
# install packages 
import pandas as pd
import chardet
from collections import Counter 

In [2]:
with open('takehome_users.csv', 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'ISO-8859-1', 'confidence': 0.7294372453287324, 'language': ''}

In [3]:
users = pd.read_csv('takehome_users.csv',encoding='ISO-8859-1')

In [4]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
engagement = pd.read_csv('takehome_user_engagement.csv')

In [6]:
engagement = engagement.sort_values(by='time_stamp')

In [7]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
178140,2012-05-31 08:20:06,10012,1
59486,2012-05-31 15:47:36,3428,1
175638,2012-05-31 17:19:37,9899,1
26821,2012-05-31 21:58:33,1693,1
109716,2012-06-01 00:17:30,6102,1


In [8]:
engagement['user_id'].value_counts()

3623     606
906      600
1811     593
7590     590
8068     585
        ... 
6763       1
3773       1
5822       1
10040      1
2047       1
Name: user_id, Length: 8823, dtype: int64

In [9]:
engagement['visited'].value_counts()

1    207917
Name: visited, dtype: int64

In [10]:
engagement.sort_values(by='time_stamp', ascending=False, inplace=True)

In [11]:
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])
engagement.set_index('time_stamp', inplace=True)
engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-06-06 14:58:50,4051,1
2014-06-04 23:56:26,341,1
2014-06-04 23:46:31,9558,1
2014-06-04 23:34:04,9325,1
2014-06-04 23:32:13,4625,1


In [12]:
## Create response variable. 
labels = dict.fromkeys(engagement.user_id)
for x in labels:
    logins = engagement.loc[engagement.user_id == x]
    weekly_window = logins.rolling('7d').count()
    if weekly_window.loc[weekly_window.visited >= 3].shape[0] > 0:
        adopted = 1
    else:
        adopted = 0
    labels[x] = adopted

In [13]:
labels = pd.DataFrame.from_dict(labels, orient='index')

In [14]:
labels.reset_index(inplace=True)

In [15]:
labels.rename(columns={'index':'object_id',0:'adopted_user'}, inplace=True)

In [16]:
labels['adopted_user'].value_counts()

0    7221
1    1602
Name: adopted_user, dtype: int64

In [17]:
labels.head()

Unnamed: 0,object_id,adopted_user
0,4051,0
1,341,1
2,9558,1
3,9325,1
4,4625,1


In [18]:
final_table = pd.merge(users,labels,on='object_id')

In [19]:
final_table

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1.363735e+09,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1.369210e+09,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1.358850e+09,0,0,193,5240.0,0
...,...,...,...,...,...,...,...,...,...,...,...
8818,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1.378448e+09,0,0,89,8263.0,0
8819,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,,0
8820,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1.398603e+09,1,1,83,8074.0,0
8821,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1.338638e+09,0,0,6,,0


## Data Cleaning and Feature Engineering

In [20]:
## the user's id number, the date and time they created a profile, thier name, thier email provide no useful help in our 
## model. It is best to remove them before we build our model.  

In [21]:
final_table.drop(columns=['creation_time','name','email','last_session_creation_time','object_id', 'org_id'],inplace=True)

In [22]:
final_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 8822
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   creation_source             8823 non-null   object 
 1   opted_in_to_mailing_list    8823 non-null   int64  
 2   enabled_for_marketing_drip  8823 non-null   int64  
 3   invited_by_user_id          4776 non-null   float64
 4   adopted_user                8823 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 413.6+ KB


In [23]:
## convert the creation_source columns to one hot encoding.
final_table = pd.get_dummies(final_table)

In [24]:
## lets convert the feature 'incited_by_user_id' to a binary feature: 1 if the user was refred by someone else, and 0 
## otherwise. 

In [25]:
store = []
for x in final_table['invited_by_user_id']:
    if pd.isna(x) == True:
        store.append(0)
    else:
        store.append(1)

In [26]:
final_table['invited_by_user_id'] = store

In [27]:
final_table

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_user_id,adopted_user,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,0,1,0,1,0,0,0,0
1,0,0,1,1,0,1,0,0,0
2,0,0,1,0,0,1,0,0,0
3,0,0,1,0,1,0,0,0,0
4,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8818,0,0,1,0,0,1,0,0,0
8819,0,0,0,0,0,0,0,0,1
8820,1,1,1,0,1,0,0,0,0
8821,0,0,0,0,0,0,1,0,0


## Model Building 

In [28]:
final_table['adopted_user'].value_counts() ## Highly imbalanced 

0    7221
1    1602
Name: adopted_user, dtype: int64

In [29]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier 
X = final_table.drop(columns=['adopted_user'])
y = final_table['adopted_user']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=42)
grid_params = {
    'n_estimators':[100, 200, 300, 400],
    'warm_start':[True, False],
    'bootstrap':[True, False],
    'class_weight': ['balanced','balanced_subsample'],
    'max_features': ['auto','log2',None],
    'criterion': ['gini','entropy'],
}

In [31]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                          param_grid = grid_params,
                          scoring='recall',
                          cv=10,
                          n_jobs=-1)

In [32]:
grid_search.fit(X_train, y_train)

  warn('class_weight presets "balanced" or '


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'log2', None],
                         'n_estimators': [100, 200, 300, 400],
                         'warm_start': [True, False]},
             scoring='recall')

In [33]:
best_params = grid_search.best_params_

In [34]:
best_params

{'bootstrap': False,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 100,
 'warm_start': True}

In [35]:
best_recall_score = grid_search.best_score_

In [36]:
best_recall_score

0.4120859892513436

In [37]:
## Now we have the parmaters of the best model. Let's predict the score for the test set. 
y_predicted = grid_search.predict(X_test)

In [38]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72      1427
           1       0.22      0.41      0.28       338

    accuracy                           0.60      1765
   macro avg       0.52      0.53      0.50      1765
weighted avg       0.71      0.60      0.64      1765



In [39]:
## we get poor predictiosn of the minority class: lets re-run using a balanced dataset.Utilizing ADASYN analalysis will 
## oversample the minorty class to produce a balanced datatset. This technique tries to predict the hearder to learn 
## instances of the minority class, rather than just ggenrating smaples based on the sample space. 
from imblearn.over_sampling import ADASYN
smt = ADASYN()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

In [40]:
grid_search.fit(X_train_sm, y_train_sm)

  warn('class_weight presets "balanced" or '


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'log2', None],
                         'n_estimators': [100, 200, 300, 400],
                         'warm_start': [True, False]},
             scoring='recall')

In [41]:
best_params = grid_search.best_params_

In [42]:
best_params

{'bootstrap': True,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 200,
 'warm_start': True}

In [43]:
best_recall_score = grid_search.best_score_

In [44]:
best_recall_score

0.6251859924981991

In [45]:
y_predicted = grid_search.predict(X_test)

In [46]:
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.44      0.57      1427
           1       0.21      0.63      0.31       338

    accuracy                           0.47      1765
   macro avg       0.52      0.53      0.44      1765
weighted avg       0.71      0.47      0.52      1765



In [47]:
## Utilising the over-sampling technique, the model is able to better predict the minority class. the model has been 
## optimized to best recognize the minority class. 

## Feature Importance

In [48]:
model = RandomForestClassifier(n_estimators=200,warm_start=True, class_weight='balanced', random_state=42)

In [49]:
model.fit(X_train_sm, y_train_sm)

  warn('class_weight presets "balanced" or '


RandomForestClassifier(class_weight='balanced', n_estimators=200,
                       random_state=42, warm_start=True)

In [50]:
importance_coefficents = model.feature_importances_*100

In [51]:
feature_names = X_train_sm.columns

In [52]:
important_coeffecient_list = list(zip(feature_names, importance_coefficents))

In [53]:
important_coefficent_list = pd.DataFrame(important_coeffecient_list)

In [54]:
important_coefficent_list.rename(columns={0:'features',1:'importance (%)'},inplace=True)

In [55]:
important_coefficent_list

Unnamed: 0,features,importance (%)
0,opted_in_to_mailing_list,22.400993
1,enabled_for_marketing_drip,17.81798
2,invited_by_user_id,14.824273
3,creation_source_GUEST_INVITE,12.251505
4,creation_source_ORG_INVITE,5.65052
5,creation_source_PERSONAL_PROJECTS,5.429132
6,creation_source_SIGNUP,6.239601
7,creation_source_SIGNUP_GOOGLE_AUTH,15.385995


#### Using the feature importance method, it seems that the features that are most important for determining whether a user
#### will be an adopted user will depend on:
####  - Whether or not they opted into the mailing list 
####  - Whether or not they subscipted to the marketing email list 

#### The creation source that was most helpful in determining the adoptability rate of user was when a user used a google 
#### email to set up thier user id. 