# I. Exploratory Data Analysis
----------------------------------------
User table ("takehome_users") - data on 12,000 users who signed up for the product in the last two years.

In [168]:
#import packages
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Print Users Dataset
users_df = pd.read_csv('takehome_users.csv', engine='python') 
users_df.head(12)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/14 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,11/15/13 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,3/19/13 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,5/21/13 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,1/17/13 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0
5,6,12/17/13 3:37,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0
6,7,12/16/12 13:24,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,
7,8,7/31/13 5:34,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
8,9,11/5/13 4:04,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,
9,10,1/16/13 22:08,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0


Usage summary table ("takehome_user_engagement") - row for each day that a user logged into the product.

In [169]:
# Print User Engagement Dataset
engagement_df = pd.read_csv('takehome_user_engagement.csv')
engagement_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [170]:
engagement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [171]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


There are no missing values for the engagement dataframe. In the users dataframe, there are 3177 missing values in the column 'last_session_creation_time' and 5583 missing entries in 'invited_by_user_id'.  

An "adopted" user has logged into the product on three separate days in at least one seven-day period.

In [172]:
# Convert object to datetime
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])
# print(engagement_df.info())

# Set index to 'time_stamp'
engagement_df = engagement_df.set_index('time_stamp')
engagement_df

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1
2013-12-31 03:45:04,2,1
2014-01-08 03:45:04,2,1
2014-02-03 03:45:04,2,1
2014-02-08 03:45:04,2,1
2014-02-09 03:45:04,2,1


In [173]:
#List unique values in the engagement_df['user_id'] column
unique_users = engagement_df['user_id'].unique()
print(unique_users)

[    1     2     3 ..., 11998 11999 12000]


There are 12,000 unique user id's

In [174]:
#  Select out users in the engagement data who have logged in on three seperate days

# 1) Max and min dates (engagement_df)
# 2) Create 1 week bins
# 3) Loop through dates and select users ('user_id') with 3 or more visits on separate days('visited' ) that falls within a one week range

In [175]:
# Print range of dates of when users were observed
print('min:', engagement_df.index.min())
print('max:', engagement_df.index.max())

min: 2012-05-31 08:20:06
max: 2014-06-06 14:58:50


#### Create new dataframe of users who have logged in on three seperate days in at least one seven-day period:

In [176]:
# Group the datapoints by periods of 1 week and by user_id
adopted_user_df = engagement_df.groupby(['user_id', pd.TimeGrouper(freq='7D')]).sum()

# Filter users that have 3 or more visits in one week period
adopted_user_df = adopted_user_df.loc[adopted_user_df.visited >= 3, :]

# Reset index
adopted_user_df = adopted_user_df.reset_index()

# Print Dataframe
adopted_user_df.head(10)

Unnamed: 0,user_id,time_stamp,visited
0,2,2014-02-06 08:20:06,3
1,10,2013-02-14 08:20:06,3
2,10,2013-02-28 08:20:06,3
3,10,2013-03-14 08:20:06,3
4,10,2013-04-11 08:20:06,4
5,10,2013-04-25 08:20:06,4
6,10,2013-05-02 08:20:06,5
7,10,2013-05-09 08:20:06,3
8,10,2013-05-23 08:20:06,4
9,10,2013-06-06 08:20:06,4


In [177]:
# Create separate Dataframe for all unique "adopted" users
adoption_df = pd.DataFrame(adopted_user_df['user_id'].unique(), columns={"object_id":"0"})
adoption_df['user_status'] = 'Active'
adoption_df.head()

Unnamed: 0,object_id,user_status
0,2,Active
1,10,Active
2,42,Active
3,43,Active
4,53,Active


In [178]:
# Merge adoption_df and users_df
active_user_df = pd.merge(left=users_df, right=adoption_df, on=None, how='outer')
active_user_df['user_status'] = active_user_df['user_status'].fillna(value='Inactive')
active_user_df

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_status
0,1,4/22/14 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0,Inactive
1,2,11/15/13 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,Active
2,3,3/19/13 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1.363735e+09,0,0,94,1525.0,Inactive
3,4,5/21/13 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1.369210e+09,0,0,1,5151.0,Inactive
4,5,1/17/13 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1.358850e+09,0,0,193,5240.0,Inactive
5,6,12/17/13 3:37,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1.387424e+09,0,0,197,11241.0,Inactive
6,7,12/16/12 13:24,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1.356010e+09,0,1,37,,Inactive
7,8,7/31/13 5:34,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,,Inactive
8,9,11/5/13 4:04,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,,Inactive
9,10,1/16/13 22:08,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1.401833e+09,1,1,318,4143.0,Active


In [179]:
# Drop name and email column
active_user_df = active_user_df.drop(['creation_time','name', 'email'], axis=1)
print(active_user_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Data columns (total 8 columns):
object_id                     12000 non-null int64
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
user_status                   12000 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 843.8+ KB
None


In [180]:
#Drop NaNs
active_user_df = active_user_df.dropna()
active_user_df

Unnamed: 0,object_id,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_status
0,1,GUEST_INVITE,1.398139e+09,1,0,11,10803.0,Inactive
1,2,ORG_INVITE,1.396238e+09,0,0,1,316.0,Active
2,3,ORG_INVITE,1.363735e+09,0,0,94,1525.0,Inactive
3,4,GUEST_INVITE,1.369210e+09,0,0,1,5151.0,Inactive
4,5,GUEST_INVITE,1.358850e+09,0,0,193,5240.0,Inactive
5,6,GUEST_INVITE,1.387424e+09,0,0,197,11241.0,Inactive
9,10,ORG_INVITE,1.401833e+09,1,1,318,4143.0,Active
12,13,ORG_INVITE,1.396196e+09,0,0,254,11204.0,Inactive
16,17,GUEST_INVITE,1.397314e+09,1,0,175,1600.0,Inactive
21,22,ORG_INVITE,1.392012e+09,0,0,7,2994.0,Inactive


## II. Predictive Modeling
-----------------
Identify which factors predict future user adoption.

### Logistic Regression:

1) Creating Dummy Variables for Feature Selection

In [181]:
# Dummify all columns with categorical features 
dummy_cols_convert = pd.get_dummies(active_user_df, drop_first=True)
dummy_cols_convert

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,creation_source_ORG_INVITE,user_status_Inactive
0,1,1.398139e+09,1,0,11,10803.0,0,1
1,2,1.396238e+09,0,0,1,316.0,1,0
2,3,1.363735e+09,0,0,94,1525.0,1,1
3,4,1.369210e+09,0,0,1,5151.0,0,1
4,5,1.358850e+09,0,0,193,5240.0,0,1
5,6,1.387424e+09,0,0,197,11241.0,0,1
9,10,1.401833e+09,1,1,318,4143.0,1,0
12,13,1.396196e+09,0,0,254,11204.0,1,1
16,17,1.397314e+09,1,0,175,1600.0,0,1
21,22,1.392012e+09,0,0,7,2994.0,1,1


2) Feature Selection using Dummy Variables

In [182]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Instantiate a Logistic Regression Classifier (logreg)
logreg = LogisticRegression()

# Choose X = feature and Y = target
Y = dummy_cols_convert['user_status_Inactive']
X_all = dummy_cols_convert.drop('user_status_Inactive', axis=1)

# RFE to help find most important features
rfe = RFE(logreg, 3)
rfe = rfe.fit(X_all, Y)
print(rfe.support_) # support returns boolean with top 3 features
print(rfe.ranking_) # ranking returns an integer with 1 being a top feature

# Get column names (features) from DataFrame, dummy_cols_convert
col_names = X_all.columns.values

# Print best feature names
best_features_raw = rfe.support_ * col_names

# Create list with best feature names
best_features = []

for best_feature in best_features_raw:
    if len(best_feature) > 0:
        best_features.append(best_feature)
        
print('Best Predictors for User Adoption: {}'.format(best_features))

[ True  True False False False  True False]
[1 1 4 5 2 1 3]
Best Predictors for User Adoption: ['object_id', 'last_session_creation_time', 'invited_by_user_id']


## III.  Conclusion
------------------------
- 1439 out of the 12,000 users who signed up in the last two years remained adopted users. An adopted user is defined as a user who has logged into the product on three seperate days in at least one seven-day period.
- Using the Recursive Feature Elimination method, the best features to predict future user adoption are: 

    1) User's id
    
    2) Last login time by user
    
    3) Which user invited them to join

## IV. Recommendations
------------------------
Collect other user information such as gender, age, and location