# Relax Challenge

Relax Inc. makes productivity and project management software that's popular with both individuals and teams. Founded by several former Facebook employees, it's considered a great company to work for.


Define  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate
days  in  at  least  one  seven­day  period ,  identify  which  factors  predict  future  user
adoption .

In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np

# Plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Analysing datetime
from datetime import datetime as dt
from datetime import timedelta

# File system manangement
import os,sys
from datetime import datetime
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

#Interactive Shell
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"

#Pandas profiling
from pandas_profiling import ProfileReport

import missingno as msno
import re 

%matplotlib inline

### Load Data

In [2]:
path = 'users_logged.csv'
df_logged = pd.read_csv(path) #parse_dates=True
df_logged.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
path = 'users_engagement.csv'
df_info = pd.read_csv(path,encoding='latin-1') #parse_dates=True
df_info.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0


___

## df_logged

In [4]:
df_logged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [5]:
df_logged['time_stamp'] = pd.to_datetime(df_logged['time_stamp']).astype("datetime64[ns]")

In [6]:
df_logged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [7]:
df_logged.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


___

## df_info

In [8]:
df_info.dtypes

object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
dtype: object

In [9]:
df_info['creation_time'] = pd.to_datetime(df_info['creation_time']).astype("datetime64[ns]")

In [10]:
df_info['domain'] = [x.split('@')[1] for x in df_info['email']]

In [11]:
df_info['domain'] = [x.replace('.com','') for x in df_info['domain']]

In [12]:
df_info.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,domain
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,yahoo
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,gustr
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,gustr
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,yahoo
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,yahoo


## Create adopted_user() Function

In [13]:
adopted_user = []

for user in df_info.object_id:
    
    keep = df_logged[df_logged['user_id'] == user]
    keep = keep.set_index('time_stamp')
    keep = keep.sort_index()
    
    
    keep = keep.rolling(window='7d').visited.sum().max()
    
    if keep >= 3:
        adopted_user.append(1)
    else:
        adopted_user.append(0)

df_info['adopted_user'] = adopted_user

In [14]:
df_info.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,domain,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,yahoo,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,gustr,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,gustr,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,yahoo,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,yahoo,0


# Preprocessing

In [15]:
df_info.select_dtypes(include=[object])

Unnamed: 0,name,email,creation_source,domain
0,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,yahoo
1,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,gustr
2,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,gustr
3,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,yahoo
4,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,yahoo
...,...,...,...,...
11995,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,gustr
11996,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,gmail
11997,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,cuvox.de
11998,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,iuxiw


In [16]:
df_info = df_info.drop(['name','creation_time','email','domain','last_session_creation_time','invited_by_user_id'],axis=1)

In [17]:
categorical_features = ['creation_source']

In [18]:
df_le1 = df_info.loc[:, categorical_features]

In [19]:
# PREPROCESSING
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder

le = LabelEncoder()

df_le1['creation_source'] = le.fit_transform(df_le1['creation_source'].astype(str))


dropped_cols = df_info.drop(columns= df_le1)

df_le = pd.concat([dropped_cols,df_le1],axis=1)

In [20]:
df_le.head()

Unnamed: 0,object_id,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,adopted_user,creation_source
0,1,1,0,11,0,0
1,2,0,0,1,1,1
2,3,0,0,94,0,1
3,4,0,0,1,0,0
4,5,0,0,193,0,0


In [21]:
ohe = OneHotEncoder(sparse=True, drop='first', handle_unknown='error')


encoded = pd.DataFrame(ohe.fit_transform(df_le[categorical_features]).toarray(), \
                                columns=ohe.get_feature_names(categorical_features))

dropped_cols = df_info.drop(columns=categorical_features)


#df_ohe = data_hot_encoded_drop.join(data_hot_encoded)
df_ohe = pd.concat([dropped_cols,encoded],axis=1)

df_ohe

Unnamed: 0,object_id,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,adopted_user,creation_source_1,creation_source_2,creation_source_3,creation_source_4
0,1,1,0,11,0,0.0,0.0,0.0,0.0
1,2,0,0,1,1,1.0,0.0,0.0,0.0
2,3,0,0,94,0,1.0,0.0,0.0,0.0
3,4,0,0,1,0,0.0,0.0,0.0,0.0
4,5,0,0,193,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
11995,11996,0,0,89,0,1.0,0.0,0.0,0.0
11996,11997,0,0,200,0,0.0,0.0,0.0,1.0
11997,11998,1,1,83,0,0.0,0.0,0.0,0.0
11998,11999,0,0,6,0,0.0,1.0,0.0,0.0


# Modeling

In [22]:
from sklearn.model_selection import train_test_split 

X = df_ohe.drop(['adopted_user'], axis = 1)
y = df_ohe['adopted_user']

SEED = 42
TS = 0.25

# Create training and test sets
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = TS, random_state=SEED, stratify=y)

print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(9000, 8)
(9000,)
(3000, 8)
(3000,)


## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
MODEL_PARAMS = {
    "n_estimators": 100,
    "criterion": 'gini',
    "max_features":'sqrt',
    "random_state":SEED
}

rf = RandomForestClassifier(**MODEL_PARAMS)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:,1]



rf_score = round(rf.score(X_train, y_train) * 100, 2)
rf_test_score = round(rf.score(X_test, y_test) * 100, 2)

print('Random Forest Training Score:', rf_score)
print('Random Forest Test Score:', rf_test_score)


#Checking performance on our model with ROC Score.
rf_roc_score = round(roc_auc_score(y_test,y_pred_proba)*100,3)
print("ROC Score:", rf_roc_score)

RandomForestClassifier(max_features='sqrt', random_state=42)

Random Forest Training Score: 99.99
Random Forest Test Score: 84.57
ROC Score: 52.181
