# Relax Challenge

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from datetime import datetime

In [35]:
#Set maximum number of rows,columns to be shown. Maximum column width made to 500.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.notebook_repr_html', True)

In [2]:
user_engagement=pd.read_csv("takehome_user_engagement.csv",encoding = 'utf8')

In [5]:
users=pd.read_csv("takehome_users.csv",encoding = 'latin')

In [7]:
# Preview the fiels
user_engagement.head(5)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
users.head(5)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [10]:
display(users.info())
display(user_engagement.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


None

In [11]:
# We will convert tatetime values appropriately
users['creation_time'] = pd.to_datetime(users['creation_time'])
display(users.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 937.6+ KB


None

In [13]:
user_engagement.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


As the visited field is always 1 it can be deleted. The time_Stamp field shows the visited timestamp.

In [15]:
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])
user_engagement['date']= user_engagement['time_stamp'].apply(lambda x: x.date())
user_engagement.drop(['visited','time_stamp'], axis=1, inplace=True)

In [16]:
display(user_engagement.info())
display(user_engagement.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 2 columns):
user_id    207917 non-null int64
date       207917 non-null object
dtypes: int64(1), object(1)
memory usage: 3.2+ MB


None

Unnamed: 0,user_id,date
0,1,2014-04-22
1,2,2013-11-15
2,2,2013-11-29
3,2,2013-12-09
4,2,2013-12-25


In [17]:
# Identify the Adopted Users
from datetime import datetime, timedelta

def adopted_user(x):
    """
    Takes property object x
    converts x to list
    returns 1 if a user logged into the product on three separate
    days in at least one seven day period 
    """
    if len(x) >= 3:  # condition to eliminate if user has less than 3 time stamps
        x = [i for i in x]  # convert property object to list
        x.sort()  # sort the dates in increasing order
        x = [x[i+1] - x[i] for i in range(len(x)-2)]  # compute cumulative difference of current and next day
        # sum i, i+1, i+2 terms and check if it`s less than 7 which gives if it`s in a seven day period
        x = [1 for i in range(len(x)-2) if x[i] + x[i+1] + x[i+2] <= timedelta(days=7)]
        # condition to check if there`s a 1 in x, return 1 if true
        if 1 in x:
            return 1

adopted_df = user_engagement.groupby('user_id').agg(adopted_user)  # group by user_id and aggregate using custom function
adopted_df.fillna(0, inplace=True)  # fill null values with 0
adopted_df.columns = ['adopted_user']
adopted_df.head()

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0


In [20]:
# Find the number of user counts and adopted user counts

print('Total number of users:', len(users))
print('Number of adopted users:', len(adopted_df[adopted_df['adopted_user'] == 1]))

Total number of users: 12000
Number of adopted users: 1322


In [21]:
# now, lets build one dataset involving users dataset and the class variable.

users_adopted = users.join(adopted_df, how='left')
users_adopted.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,0.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


# Feature Engineering
1. The personalized fields won't have any predictive values. They can be removed. <br>
2. The Null values in adopterd_user field can be made 0. <br>
3. Create a field usage as the difference between last_Session_creation_time and creation_time. Remove original fields. <br>


In [29]:
users_adopted = users_adopted.drop(['name', 'email'], axis=1)
users_adopted['adopted_user'] = users_adopted['adopted_user'].fillna(0)

users_adopted.dropna(axis=0, inplace=True)

users_adopted['last_session_creation_time'] = users_adopted['last_session_creation_time'].map(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
# string to datetime
users_adopted['last_session_creation_time'] = pd.to_datetime(users_adopted['last_session_creation_time'])
# creation time string to datetime
users_adopted['creation_time'] = pd.to_datetime(users_adopted['creation_time'])

# create a column usage
users_adopted['usage'] = users_adopted['last_session_creation_time'] - users_adopted['creation_time']
# drop the time columns
users_adopted.drop(['creation_time', 'last_session_creation_time'], axis=1, inplace=True)



In [31]:
# One hot vectorization for creation source
source_ohe = pd.get_dummies(users_adopted['creation_source'])
users_adopted = users_adopted.join(source_ohe)
users_adopted.drop(['creation_source'], axis=1, inplace=True)

In [32]:
# Drop all usage less than zero and convert to unix timestamp.
users_adopted = users_adopted[users_adopted['usage'] >= timedelta(days=0)]
users_adopted['usage'] = users_adopted['usage'].map(lambda x: x.total_seconds())

In [37]:
display(users_adopted.head())

Unnamed: 0,object_id,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,usage,GUEST_INVITE,ORG_INVITE
1,2,0,0,1,316.0,0.0,11736000.0,0,1
3,4,0,0,1,5151.0,0.0,72000.0,1,0
4,5,0,0,193,5240.0,0.0,414000.0,1,0
5,6,0,0,197,11241.0,0.0,154800.0,1,0
9,10,1,1,318,4143.0,0.0,43444800.0,0,1


In [60]:
y = users_adopted['adopted_user']
X = users_adopted.drop(['adopted_user'], axis=1)
X = X.drop(['object_id'], axis=1) # Remove the User ID field

In [61]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


array([[0.00000000e+00, 0.00000000e+00, 2.40963855e-03, ...,
        1.85486179e-01, 0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.40963855e-03, ...,
        5.72311566e-05, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 4.65060241e-01, ...,
        5.49419104e-03, 1.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 9.78313253e-01, ...,
        1.01699765e-01, 0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.65060241e-01, ...,
        3.71430207e-02, 1.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 6.02409639e-02, ...,
        5.72311566e-05, 0.00000000e+00, 1.00000000e+00]])

In [42]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn import decomposition

In [62]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [63]:
features=xtrain.columns

In [64]:
# Decision Tree
clf = DecisionTreeClassifier()
clf.fit(xtrain, ytrain)
ypred=clf.predict(xtest)

print("{:20}{:3f}".format('F1 score', f1_score(ytest, ypred)))
print("{:20}{:3f}".format('Test accuracy', accuracy_score(ytest, ypred)))
print()

print('====Confusion Matrix=====')
print(confusion_matrix(ytest, ypred))

print('=====Feature importance=======')
for idx, val in enumerate(clf.feature_importances_):
    print("{:20}{:3f}".format(features[idx], val))
print('-----------------------------------------------')

F1 score            0.130435
Test accuracy       0.791883

====Confusion Matrix=====
[[746 110]
 [ 90  15]]
opted_in_to_mailing_list0.048068
enabled_for_marketing_drip0.016904
org_id              0.296027
invited_by_user_id  0.358351
usage               0.244526
GUEST_INVITE        0.009854
ORG_INVITE          0.026270
-----------------------------------------------


In [65]:
# Logistics Regression
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
ypred=lr.predict(xtest)

print("{:20}{:3f}".format('F1 score', f1_score(ytest, ypred)))
print("{:20}{:3f}".format('Test accuracy', accuracy_score(ytest, ypred)))
print()

print('====Confusion Matrix=====')
print(confusion_matrix(ytest, ypred))

F1 score            0.000000
Test accuracy       0.890739

====Confusion Matrix=====
[[856   0]
 [105   0]]


  'precision', 'predicted', average, warn_for)


In [67]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(xtrain, ytrain)
ypred=rf.predict(xtest)

print("{:20}{:3f}".format('F1 score', f1_score(ytest, ypred)))
print("{:20}{:3f}".format('Test accuracy', accuracy_score(ytest, ypred)))
print()

print('====Confusion Matrix=====')
print(confusion_matrix(ytest, ypred))

print('=====Feature importance=======')
for idx, val in enumerate(rf.feature_importances_):
    print("{:20}{:3f}".format(features[idx], val))
print('-----------------------------------------------')

F1 score            0.034188
Test accuracy       0.882414

====Confusion Matrix=====
[[846  10]
 [103   2]]
opted_in_to_mailing_list0.016484
enabled_for_marketing_drip0.013117
org_id              0.324581
invited_by_user_id  0.365810
usage               0.259302
GUEST_INVITE        0.010803
ORG_INVITE          0.009903
-----------------------------------------------




<b> Conclusion </b> <br>
From my analysis, I found that Logistics Regression and Random Forest had comparable performance. However, I would choose 
Random Forest as the best model for classifying the adopted users because it is an ensembles algorithm and it would generally work well.
<br>
The important factors classifying the adopted users are as follows: <br>
(1) Their Organization: Group of Users they belong to
(2) Usage

Hence, I would suggest Relax Inc to encourage formation of active groups and try to retain users for long time. 
Both of these factos will help in retention of adopted users.