In [1]:
import pandas as pd
import numpy as np
import chardet
import seaborn as sns

In [2]:
# Detect encoding type with chardet
chardet.detect(open('takehome_users.csv', 'rb').read())

{'encoding': 'ISO-8859-1', 'confidence': 0.7298523315812625, 'language': ''}

In [3]:
users = pd.read_csv('takehome_users.csv', encoding='ISO-8859-1')
users.head(3)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0


In [4]:
engagement = pd.read_csv('takehome_user_engagement.csv')
engagement.head(3)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [6]:
# Convert creation time to datetime
users['creation_time'] = pd.to_datetime(users['creation_time'])

In [7]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [8]:
# Convert 'timestamp' to datetime
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [9]:
# get a count of the number of visits by user
visit_counts = engagement.groupby(by=['user_id']).count()[['visited']]
visit_counts.head()

Unnamed: 0_level_0,visited
user_id,Unnamed: 1_level_1
1,1
2,14
3,1
4,1
5,1


In [10]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [11]:
user_id = users[['object_id']].copy(deep=True)
user_id.set_index('object_id', inplace=True)

In [12]:
user_id.head(3)

1
2
3


In [13]:
# users['object_id'] aka user_id gives the complete list of users, so join visit_counts to that.
# if a user id doesn't exist in visit_counts, that row will be "0" for "visited"
visit_counts = user_id.join(visit_counts, how='left').fillna(0)

In [14]:
visit_counts.head(3)

Unnamed: 0_level_0,visited
object_id,Unnamed: 1_level_1
1,1.0
2,14.0
3,1.0


Now we will start building the target variable. We will start with a column of NaNs, and put "0" in the 'adopted' rows where the 'visited' count is less than 3. We know these users cannot be 'adopted'. The remaining nulls can then be collected as potentials depending on whether their 3+ visits occurred during a period of at least 7 days.

In [15]:
visit_counts['adopted'] = np.nan
visit_counts.loc[visit_counts['visited'] < 3, 'adopted'] = 0 

In [16]:
visit_counts.head(3)

Unnamed: 0_level_0,visited,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.0
2,14.0,
3,1.0,0.0


In [17]:
engagement.head(3)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1


In [18]:
# add all indices from visit_counts who had at least three visits to a list
three_or_more_vc = visit_counts[visit_counts['adopted'].isnull()].index.values.tolist()

# find all the rows from engagement where 'user_id' matches the newly created list
three_or_more_eng = engagement[engagement['user_id'].isin(three_or_more_vc)]

In [19]:
three_or_more_eng.head(3)

Unnamed: 0,time_stamp,user_id,visited
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1


In [20]:
def get_grouped_date_range(date):
    return (max(date)-min(date))

In [21]:
transform = three_or_more_eng.groupby('user_id')['time_stamp'].transform(get_grouped_date_range)
transform.head(3)

1   136 days
2   136 days
3   136 days
Name: time_stamp, dtype: timedelta64[ns]

In [22]:
time_period_gb = three_or_more_eng.groupby('user_id')['time_stamp'].apply(get_grouped_date_range)
time_period_gb.head(3)

user_id
2    136 days
10   503 days
20    79 days
Name: time_stamp, dtype: timedelta64[ns]

In [23]:
time_period_df = time_period_gb.to_frame()
time_period_df.head(3)

Unnamed: 0_level_0,time_stamp
user_id,Unnamed: 1_level_1
2,136 days
10,503 days
20,79 days


In [24]:
# Convert 'time_stamp' column from time_delta to int
days = []

for index, row in time_period_df.iterrows():
    days.append(row['time_stamp'].days)

time_period_df['time_stamp'] = days

In [25]:
# How many users who logged in at least three times did so during a 7-day period or greater
sum(time_period_df['time_stamp'] >= 7)/len(time_period_df)

0.99644128113879

In [26]:
users_7_or_greater = time_period_df[time_period_df['time_stamp'] >= 7].index.values.tolist()

In [27]:
len(users_7_or_greater)

2240

In [28]:
visit_counts.loc[visit_counts.index.isin(users_7_or_greater), 'adopted'] = 1
visit_counts.head(3)

Unnamed: 0_level_0,visited,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.0
2,14.0,1.0
3,1.0,0.0


In [29]:
# There are a few users whose logins occurred in less than 7 days. These need to be "0" for 'adopted'.
users_under_7 = time_period_df[time_period_df['time_stamp'] < 7].index.values.tolist()

visit_counts.loc[visit_counts.index.isin(users_under_7)] = 0

In [30]:
# Make sure there are no nulls in 'adopted'
visit_counts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 2 columns):
visited    12000 non-null float64
adopted    12000 non-null float64
dtypes: float64(2)
memory usage: 601.2 KB


In [31]:
# join users and visit_counts dataframes
users = users.join(visit_counts)

In [32]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 12 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
visited                       11999 non-null float64
adopted                       11999 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(4), object(3)
memory usage: 1.1+ MB


In [33]:
# Remove columns that cannot be converted to numeric type
users.drop(['name', 'email'], axis=1, inplace=True)

In [34]:
# Remove 'object_id' because we don't want to train a model based off of specific users
users.drop(['object_id'], axis=1, inplace=True)

In [35]:
# Remove 'visited' column because it was used to create the target
users.drop(['visited'], axis=1, inplace=True)

In [36]:
# Convert creation time from datetime to integer
users['creation_time'] = users['creation_time'].dt.strftime('%Y%m%d').astype(int)

In [37]:
# one-hot-encode 'creation_source' column
creation_source_encoded = pd.get_dummies(users['creation_source'])

In [38]:
users = users.join(creation_source_encoded)

In [39]:
users.drop(['creation_source'], axis=1, inplace=True)

In [40]:
# move target variable to the end
last_col = ['adopted']
users = users[[col for col in users if col not in last_col]
           + [col for col in last_col]]

We have two columns with significant nulls. We will replace nulls with "0" in both cases, representing an absence of the event. We can check how the model performs with these features, and remove them later if needed. The remaning nulls are few enough that they will not impact the model. We will replace these with a "0" as well.

In [41]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 12 columns):
creation_time                 12000 non-null int64
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
GUEST_INVITE                  12000 non-null uint8
ORG_INVITE                    12000 non-null uint8
PERSONAL_PROJECTS             12000 non-null uint8
SIGNUP                        12000 non-null uint8
SIGNUP_GOOGLE_AUTH            12000 non-null uint8
adopted                       11999 non-null float64
dtypes: float64(3), int64(4), uint8(5)
memory usage: 714.9 KB


In [42]:
users.fillna(0, inplace=True)
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 12 columns):
creation_time                 12000 non-null int64
last_session_creation_time    12000 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null float64
GUEST_INVITE                  12000 non-null uint8
ORG_INVITE                    12000 non-null uint8
PERSONAL_PROJECTS             12000 non-null uint8
SIGNUP                        12000 non-null uint8
SIGNUP_GOOGLE_AUTH            12000 non-null uint8
adopted                       12000 non-null float64
dtypes: float64(3), int64(4), uint8(5)
memory usage: 714.9 KB


## Feature Selection

In [43]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [44]:
# Dvide the data into features and target
X = users.drop('adopted', axis=1)
y = users.iloc[:, -1]

In [45]:
# Scale the features
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


array([[0.99459973, 0.99719848, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.52922646, 0.9958424 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.48942447, 0.97266048, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.99484974, 0.99752935, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.95476066, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.97979899, 0.99191187, 0.        , ..., 0.        , 1.        ,
        0.        ]])

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [47]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)

In [48]:
tree.feature_importances_

array([0.23987987, 0.22971881, 0.02589543, 0.00062629, 0.31118808,
       0.14844949, 0.00449125, 0.0063063 , 0.00976115, 0.01280315,
       0.01088019])

In [49]:
for index, value in enumerate(tree.feature_importances_):
    print("{:30}{:f}".format(X.columns[index], value))

creation_time                 0.239880
last_session_creation_time    0.229719
opted_in_to_mailing_list      0.025895
enabled_for_marketing_drip    0.000626
org_id                        0.311188
invited_by_user_id            0.148449
GUEST_INVITE                  0.004491
ORG_INVITE                    0.006306
PERSONAL_PROJECTS             0.009761
SIGNUP                        0.012803
SIGNUP_GOOGLE_AUTH            0.010880


Try creating a new feature for simply whether or not a user was invited by another user, rather than the id of the user who gave the invite.

In [50]:
users['invited_by_user'] = users['invited_by_user_id'] > 0

In [51]:
# move target variable to the end
last_col = ['adopted']
users = users[[col for col in users if col not in last_col]
         +[col for col in last_col]]

Try running the model again.

In [52]:
# Dvide the data into features and target
X = users.drop('adopted', axis=1)
y = users.iloc[:, -1]

In [53]:
# Scale the features
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


array([[0.99459973, 0.99719848, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.52922646, 0.9958424 , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.48942447, 0.97266048, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.99484974, 0.99752935, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.95476066, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.97979899, 0.99191187, 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [55]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)

In [56]:

print(confusion_matrix)

<function confusion_matrix at 0x1a1cbf2d90>


In [57]:
tree.feature_importances_

array([0.22847149, 0.2340314 , 0.03025691, 0.02082603, 0.28807441,
       0.14830779, 0.00044354, 0.00390906, 0.0136264 , 0.01621019,
       0.01158449, 0.00425829])

In [58]:
for index, value in enumerate(tree.feature_importances_):
    print("{:30}{:f}".format(X.columns[index], value))

creation_time                 0.228471
last_session_creation_time    0.234031
opted_in_to_mailing_list      0.030257
enabled_for_marketing_drip    0.020826
org_id                        0.288074
invited_by_user_id            0.148308
GUEST_INVITE                  0.000444
ORG_INVITE                    0.003909
PERSONAL_PROJECTS             0.013626
SIGNUP                        0.016210
SIGNUP_GOOGLE_AUTH            0.011584
invited_by_user               0.004258


In [59]:
# Place feature importances in a sorted DataFrame
l_index = []
v_index = []
for index, value in enumerate(tree.feature_importances_):
    l_index.append(X.columns[index])
    v_index.append(value)
fi = pd.DataFrame(
    {'feature': l_index,
     'importance' : v_index
    })
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
4,org_id,0.288074
1,last_session_creation_time,0.234031
0,creation_time,0.228471
5,invited_by_user_id,0.148308
2,opted_in_to_mailing_list,0.030257
3,enabled_for_marketing_drip,0.020826
9,SIGNUP,0.01621
8,PERSONAL_PROJECTS,0.013626
10,SIGNUP_GOOGLE_AUTH,0.011584
11,invited_by_user,0.004258


Try a random forest classifier.

In [60]:
# Dvide the data into features and target
X = users.drop('adopted', axis=1)
y = users.iloc[:, -1]

In [61]:
# Scale the features
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


array([[0.99459973, 0.99719848, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.52922646, 0.9958424 , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.48942447, 0.97266048, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.99484974, 0.99752935, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.95476066, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.97979899, 0.99191187, 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [62]:
# Perform Grid-search
gsc = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'max_depth': range(3,9)
    },
    cv=5, verbose=0, n_jobs=-1
)


grid_result = gsc.fit(X, y)
best_params = grid_result.best_params_

rf = RandomForestClassifier(max_depth=best_params['max_depth'],
                           n_estimators=100,
                           random_state=False,
                           verbose=False)



In [63]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [64]:
rf.feature_importances_

array([0.20016954, 0.25325362, 0.02179251, 0.02743731, 0.20606151,
       0.16559441, 0.03673504, 0.03130727, 0.02218567, 0.00928735,
       0.01286293, 0.01331283])

In [65]:
for index, value in enumerate(rf.feature_importances_):
    print("{:30}{:f}".format(X.columns[index], value))

creation_time                 0.200170
last_session_creation_time    0.253254
opted_in_to_mailing_list      0.021793
enabled_for_marketing_drip    0.027437
org_id                        0.206062
invited_by_user_id            0.165594
GUEST_INVITE                  0.036735
ORG_INVITE                    0.031307
PERSONAL_PROJECTS             0.022186
SIGNUP                        0.009287
SIGNUP_GOOGLE_AUTH            0.012863
invited_by_user               0.013313


In [66]:
# Place feature importances in a sorted DataFrame
l_index = []
v_index = []
for index, value in enumerate(rf.feature_importances_):
    l_index.append(X.columns[index])
    v_index.append(value)
fi = pd.DataFrame(
    {'feature': l_index,
     'importance' : v_index
    })
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,last_session_creation_time,0.253254
4,org_id,0.206062
0,creation_time,0.20017
5,invited_by_user_id,0.165594
6,GUEST_INVITE,0.036735
7,ORG_INVITE,0.031307
3,enabled_for_marketing_drip,0.027437
8,PERSONAL_PROJECTS,0.022186
2,opted_in_to_mailing_list,0.021793
11,invited_by_user,0.013313
