In [23]:
import pandas as pd
pd.set_option("display.max_columns", 500)
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ggplot import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split



In [2]:
# Directory holding the two challenge CSVs; change this one constant to run
# the notebook against a different copy of the data.
DATA_DIR = '~/Desktop/relax_challenge'

# User table, decoded as latin-1.
users = pd.read_csv(DATA_DIR + '/takehome_users.csv', encoding='latin-1')
# Engagement log with time_stamp, user_id and visited columns.
usage = pd.read_csv(DATA_DIR + '/takehome_user_engagement.csv')

### Data Wrangling/Exploratory Data Analysis

In [3]:
# Preview the first rows of the raw users table.
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
# Column dtypes and non-null counts, to see which columns have missing values.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [5]:
# creation_time is read from the CSV as strings (object dtype); convert it to
# datetime64 values.
users['creation_time'] = pd.to_datetime(users['creation_time'])

In [6]:
# Confirm creation_time is now datetime64.
users.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time           float64
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id                   float64
dtype: object

In [7]:
# Preview the first rows of the engagement data.
usage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
# Column dtypes and non-null counts for the engagement data.
usage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [9]:
# time_stamp is read from the CSV as strings (object dtype); convert it to
# datetime64 so timestamps can be compared and offset by timedeltas.
usage['time_stamp'] = pd.to_datetime(usage['time_stamp'])

In [10]:
# Confirm time_stamp is now datetime64.
usage.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [11]:
# Earliest timestamp in the engagement data.
usage['time_stamp'].min()

Timestamp('2012-05-31 08:20:06')

In [12]:
# Latest timestamp in the engagement data.
usage['time_stamp'].max()

Timestamp('2014-06-06 14:58:50')

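#### Create adopted users

A user is flagged as adopted when at least three of their engagement timestamps fall within a single seven-day window.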

In [13]:
# A user is flagged as adopted when at least MIN_LOGINS of their timestamps
# fall inside one window of length `seven` (both ends inclusive).
# NOTE(review): this counts engagement rows, not distinct calendar days; if
# the intended definition needs separate days, de-duplicate by date first.
seven = datetime.timedelta(7)
MIN_LOGINS = 3
adopted = {}

# groupby hands over each user's timestamps in one pass (ascending user_id),
# instead of re-filtering the whole frame once per user.
for user_id, stamps in usage.groupby('user_id')['time_stamp']:
    ordered = stamps.sort_values()
    # After sorting, diff(k) is the gap between a timestamp and the one k rows
    # earlier. A gap <= `seven` over MIN_LOGINS - 1 steps means MIN_LOGINS
    # timestamps share one window. The first k rows have no earlier partner
    # (NaT), which compares False.
    spans = ordered.diff(MIN_LOGINS - 1)
    adopted[user_id] = bool((spans <= seven).any())

  # This is added back by InteractiveShellApp.init_path()


In [14]:
# Rows with a missing last_session_creation_time stay False; every other row
# takes the flag stored for its object_id in `adopted`.
has_logged_in = users['last_session_creation_time'].notnull()
# dict.get keeps the column strictly boolean and treats an id that is absent
# from `adopted` as not adopted instead of raising KeyError.
adopted_flag = users['object_id'].map(lambda uid: adopted.get(uid, False))
users['adopted'] = has_logged_in & adopted_flag

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [15]:
# Check the newly added 'adopted' column.
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [16]:
# Columns that are not used as model features.
unnecessary_features = ['creation_time', 'last_session_creation_time', 'name', 'email']
# Rebind instead of inplace=True and tolerate already-missing columns, so
# re-running this cell is a no-op rather than a KeyError.
users = users.drop(unnecessary_features, axis=1, errors='ignore')

In [17]:
# Confirm the dropped columns are gone.
users.head()

Unnamed: 0,object_id,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,GUEST_INVITE,1,0,11,10803.0,False
1,2,ORG_INVITE,0,0,1,316.0,True
2,3,ORG_INVITE,0,0,94,1525.0,False
3,4,GUEST_INVITE,0,0,1,5151.0,False
4,5,GUEST_INVITE,0,0,193,5240.0,False


In [18]:
# One-hot encode creation_source: the column is replaced by one
# creation_source_<value> indicator column per distinct value.
users_encoded = pd.get_dummies(users, columns = ['creation_source'])

In [19]:
# Inspect the indicator columns produced by the encoding.
users_encoded.head()

Unnamed: 0,object_id,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,1,0,11,10803.0,False,1,0,0,0,0
1,2,0,0,1,316.0,True,0,1,0,0,0
2,3,0,0,94,1525.0,False,0,1,0,0,0
3,4,0,0,1,5151.0,False,1,0,0,0,0
4,5,0,0,193,5240.0,False,1,0,0,0,0


### Classification of Adopted Users

In [24]:
# Single source of truth for the model inputs: the same list selects the
# columns of X and labels the feature importances, so the two cannot drift.
feat_labels = [
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
    'org_id',
    'invited_by_user_id',
    'creation_source_GUEST_INVITE',
    'creation_source_ORG_INVITE',
    'creation_source_PERSONAL_PROJECTS',
    'creation_source_SIGNUP',
    'creation_source_SIGNUP_GOOGLE_AUTH',
]
X = users_encoded[feat_labels]
y = users_encoded['adopted']

# Preprocess with Imputer: replace NaNs with the column mean.
# NOTE(review): invited_by_user_id is an identifier, so its mean is not a
# meaningful fill value; a "was invited" indicator may be a better encoding.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imp = imp.fit_transform(X)

# Create train and test sets (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.2, random_state=42)


In [25]:
# Random forest with 10 estimators; random_state is fixed so repeated fits
# give the same model.
clf_rf = RandomForestClassifier(n_estimators=10, random_state=12)

In [26]:
# fit() returns the fitted estimator, so training and scoring chain into one
# expression; its value, the score on the held-out split, is the cell output.
clf_rf.fit(X_train, y_train).score(X_test, y_test)

0.80791666666666662

In [27]:
# Pair each feature name with its importance from the fitted forest and
# print one (name, importance) tuple per line.
for name, importance in zip(feat_labels, clf_rf.feature_importances_):
    print((name, importance))

('opted_in_to_mailing_list', 0.011684331111265819)
('enabled_for_marketing_drip', 0.014751961216795265)
('org_id', 0.61224782385980536)
('invited_by_user_id', 0.34147547589487082)
('creation_source_GUEST_INVITE', 0.003504348412163493)
('creation_source_ORG_INVITE', 0.0031102936055754293)
('creation_source_PERSONAL_PROJECTS', 0.0077434125674461229)
('creation_source_SIGNUP', 0.0022406872691691894)
('creation_source_SIGNUP_GOOGLE_AUTH', 0.0032416660629084339)
