In [263]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime, date, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [251]:
# Load the user table and the login (engagement) log from the working directory.
# Both files are read with the same options, so they are loaded in one pass.
df_user, engagement = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ('takehome_users.csv', 'takehome_user_engagement.csv')
)

In [253]:
# Count the recorded logins of every user and keep only users with at least
# three of them; the result has the columns 'user_id' and 'count'.
eng_group = (
    engagement
    .groupby('user_id', as_index=False)['time_stamp']
    .count()
    .rename(columns={'time_stamp': 'count'})
    .loc[lambda login_counts: login_counts['count'] >= 3]
)

In [254]:
# replace missing value
# Rows without a recorded session get the column mean. fillna() is used
# instead of replace(np.NaN, ...) because the np.NaN alias was removed in
# NumPy 2.0 and would raise AttributeError there.
df_user['last_session_creation_time'] = df_user['last_session_creation_time'].fillna(
    df_user['last_session_creation_time'].mean())
# calculate days between today and creation_time
# The subtraction is vectorised against a single "today" value, so every row
# is measured against the same instant instead of re-reading the clock per row.
# NOTE: this feature depends on the day the notebook is executed.
df_user['since_creation'] = (
    pd.to_datetime('today')
    - pd.to_datetime(df_user['creation_time'], format="%Y-%m-%d %H:%M:%S")
).dt.days

In [255]:
# find adopted_user
# A user is "adopted" when logins on three distinct calendar days fall inside
# one seven-day window that starts at the first of those three logins.
login_log = engagement.loc[engagement['user_id'].isin(eng_group['user_id']),
                           ['user_id', 'time_stamp']].copy()
login_log['time_stamp'] = pd.to_datetime(login_log['time_stamp'],
                                         format="%Y-%m-%d %H:%M:%S")
login_log['date'] = login_log['time_stamp'].dt.normalize()
# Sort explicitly so the window test does not rely on the row order of the
# CSV, then keep the earliest login of each calendar day for every user.
login_log = (login_log
             .sort_values(['user_id', 'time_stamp'])
             .drop_duplicates(subset=['user_id', 'date']))
# With one chronologically ordered row per user and day, "at least two later
# logins within seven days" is the same as "the login two rows further down
# (same user) is at most seven days later". Rows without such a login compare
# against NaT, which evaluates to False.
third_login = login_log.groupby('user_id')['time_stamp'].shift(-2)
within_window = third_login <= login_log['time_stamp'] + timedelta(days=7)
# Each adopted user appears once, in ascending user_id order.
adopted_user = login_log.loc[within_window, 'user_id'].unique().tolist()

In [257]:
# mark adopted_user in df_user dataset
# Series.isin does one hashed membership test for the whole column; the
# previous per-row `in` check scanned the adopted_user list for every user.
# The result is a list of 1 (adopted) / 0 (not adopted), one entry per row.
df_user_adopted_user = df_user['object_id'].isin(adopted_user).astype(int).tolist()
df_user['adopted_user'] = df_user_adopted_user
df_user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,since_creation,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1621,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1779,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,2020,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,1957,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,2081,0


In [259]:
# preprocessing data before ML
# Target and features are selected by column name rather than by position,
# and the identifier / free-text columns are removed with a non-mutating
# drop. The earlier in-place drop on an iloc slice raised
# SettingWithCopyWarning and relied on 'adopted_user' being the last column.
y = df_user['adopted_user']
X = df_user.drop(columns=['adopted_user', 'object_id', 'creation_time', 'name',
                          'email', 'invited_by_user_id'])
# One-hot encode the remaining categorical column(s); dummy_na=True adds an
# explicit indicator column for missing categories.
X = pd.get_dummies(X, dummy_na=True)
# Keep the feature names: scale() returns a plain array without labels.
columns = X.columns

X = scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [266]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
# Fit on the training split only. A preceding fit on the full data set was
# discarded by this call anyway (warm_start is off), so it only cost time and
# wrongly suggested that the model had seen the test rows.
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# Hold-out accuracy and F1 score for the positive (adopted) class.
print(accuracy_score(y_test, rf_pred))
print(f1_score(y_test, rf_pred))
feature_list = list(columns)
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance
                       in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:16} Importance: {}'.format(*pair))

0.963611111111
0.855567805954
Variable: last_session_creation_time Importance: 0.69
Variable: since_creation   Importance: 0.21
Variable: org_id           Importance: 0.08
Variable: opted_in_to_mailing_list Importance: 0.01
Variable: enabled_for_marketing_drip Importance: 0.0
Variable: creation_source_GUEST_INVITE Importance: 0.0
Variable: creation_source_ORG_INVITE Importance: 0.0
Variable: creation_source_PERSONAL_PROJECTS Importance: 0.0
Variable: creation_source_SIGNUP Importance: 0.0
Variable: creation_source_SIGNUP_GOOGLE_AUTH Importance: 0.0
Variable: creation_source_nan Importance: 0.0


According to the feature importances shown above, last_session_creation_time is the most important feature for predicting whether a user becomes an adopted user. The second most important feature is since_creation, the number of days elapsed since the account's creation_time.