In [323]:
import pandas as pd

In [324]:
# Load the user-engagement log and preview its first rows.
engagement_path = './takehome_user_engagement.csv'
df1 = pd.read_csv(engagement_path)
df1.head()


Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [325]:
# Load the users table (the file is decoded as latin1) and preview its first rows.
users_path = './takehome_users.csv'
df2 = pd.read_csv(users_path, encoding='latin1')
df2.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [326]:
# Extracting the adopted users
# Parse the timestamps and use them as the index so visits can be bucketed by time.
df1['time_stamp'] = pd.to_datetime(df1.time_stamp)
df1 = df1.set_index('time_stamp')
# pd.TimeGrouper has been removed from pandas; pd.Grouper(freq=...) is its replacement
# and groups on the (datetime) index in the same way.
# NOTE(review): this counts rows per user in fixed, consecutive 7-day bins, so three
# visits that straddle a bin boundary are not detected, and rows are counted rather
# than distinct days — confirm this matches the intended definition of "adopted".
adopted_user = df1.groupby(['user_id', pd.Grouper(freq='7D')]).count()
# Keep only the (user, 7-day bin) pairs with at least three recorded visits.
adopted_user = adopted_user[adopted_user.visited >= 3]
# A user may qualify in several bins, so reduce to the distinct user ids.
adopted_user_list = adopted_user.reset_index().user_id.unique()
print ('the number of adopted users are -' , len(adopted_user_list))




the number of adopted users are - 1439


In [327]:
# Combine the adopted-user list with the main users table. The purpose is to add a new column called `label`:
# if label is True, the user is an adopted user; if it is False, the user is not.

In [328]:
# Flag every row of df2 whose object_id appears in the adopted user id list.
is_adopted = df2['object_id'].isin(adopted_user_list)
df2['label'] = is_adopted

Checking the column names:

In [329]:
# Display the column labels of df2 to decide which ones to keep as features.
df2.columns

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id', 'label'],
      dtype='object')

Looking at the column names, some of the features are not required because they do not add any value to our feature space. Those features
are:              
1) creation_time    
2) name     
3) email     
4) last session creation time      

Hence, we remove all of these columns.

In [330]:
# Remove the columns that are not used as features, then preview the first 20 rows.
unused_columns = ['creation_time', 'name', 'email', 'last_session_creation_time']
df2 = df2.drop(labels=unused_columns, axis=1)
df2.head(20)

Unnamed: 0,object_id,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,label
0,1,GUEST_INVITE,1,0,11,10803.0,False
1,2,ORG_INVITE,0,0,1,316.0,True
2,3,ORG_INVITE,0,0,94,1525.0,False
3,4,GUEST_INVITE,0,0,1,5151.0,False
4,5,GUEST_INVITE,0,0,193,5240.0,False
5,6,GUEST_INVITE,0,0,197,11241.0,False
6,7,SIGNUP,0,1,37,,False
7,8,PERSONAL_PROJECTS,1,1,74,,False
8,9,PERSONAL_PROJECTS,0,0,302,,False
9,10,ORG_INVITE,1,1,318,4143.0,True


The data looks cleaner now and is ready to analyze.

# Impact of feature "opted_in_to_mailing_list"

In [331]:
# Split the users into two frames according to the boolean adoption flag.
adopted_mask = df2['label']
df_adopted = df2[adopted_mask]
df_nonadopted = df2[~adopted_mask]

In [332]:
# Sum of opted_in_to_mailing_list over the adopted group divided by the group size.
# The printed value is a ratio (not multiplied by 100).
adopted_mailing_share = df_adopted['opted_in_to_mailing_list'].sum() / len(df_adopted)
print('Pecentage of users Engaged who signed up for mailing list', adopted_mailing_share)

Pecentage of users Engaged who signed up for mailing list 0.256428075052


In [333]:
# Sum of opted_in_to_mailing_list over the non-adopted group divided by the group size.
# The printed value is a ratio (not multiplied by 100).
nonadopted_mailing_share = df_nonadopted['opted_in_to_mailing_list'].sum() / len(df_nonadopted)
print('Pecentage of users NOT Engaged who signed up for mailing list', nonadopted_mailing_share)

Pecentage of users NOT Engaged who signed up for mailing list 0.248556007954


Both groups have nearly the same percentage, so this feature does not have much impact.

# Impact of feature "enabled_for_marketing_drip"

In [334]:
# Sum of enabled_for_marketing_drip over the adopted group divided by the group size.
# The printed value is a ratio (not multiplied by 100).
adopted_drip_share = df_adopted['enabled_for_marketing_drip'].sum() / len(df_adopted)
print('Pecentage of users Engaged who signed up for marketing drip', adopted_drip_share)

Pecentage of users Engaged who signed up for marketing drip 0.152189020153


In [335]:
# Sum of enabled_for_marketing_drip over the non-adopted group divided by the group size.
# The printed value is a ratio (not multiplied by 100).
nonadopted_drip_share = df_nonadopted['enabled_for_marketing_drip'].sum() / len(df_nonadopted)
print('Pecentage of users NOT Engaged who signed up for marketing drip', nonadopted_drip_share)

Pecentage of users NOT Engaged who signed up for marketing drip 0.148944228766


Both groups have nearly the same percentage, so this feature does not have much impact.

# Impact of feature "creation_source"

In [336]:
# Count of each creation_source value among adopted users, divided by the group size.
adopted_source_share = df_adopted['creation_source'].value_counts().div(len(df_adopted))
print('The adopted group')
print(adopted_source_share)

The adopted group
ORG_INVITE            0.346769
GUEST_INVITE          0.228631
SIGNUP                0.187630
SIGNUP_GOOGLE_AUTH    0.141070
PERSONAL_PROJECTS     0.095900
Name: creation_source, dtype: float64


In [337]:
# Count of each creation_source value among non-adopted users, divided by the group size.
print('The NON adopted group')
nonadopted_source_counts = df_nonadopted['creation_source'].value_counts()
nonadopted_source_counts / len(df_nonadopted)

The NON adopted group


ORG_INVITE            0.355553
PERSONAL_PROJECTS     0.186819
GUEST_INVITE          0.173658
SIGNUP                0.172048
SIGNUP_GOOGLE_AUTH    0.111921
Name: creation_source, dtype: float64

From this we can see that GUEST_INVITE might play a major role in engagement. We will verify this in the next section.

# Let's see if an organization has an impact on engagement

In [338]:
# Number of adopted users per org_id, divided by the size of the adopted group.
df_adopted['org_id'].value_counts().div(len(df_adopted))

7      0.011119
1      0.009729
4      0.009729
9      0.009034
2      0.009034
3      0.009034
13     0.008339
62     0.007644
24     0.006949
0      0.006949
5      0.006949
10     0.006254
8      0.006254
44     0.006254
34     0.005559
63     0.005559
82     0.005559
58     0.005559
52     0.005559
218    0.005559
33     0.005559
20     0.005559
15     0.005559
6      0.005559
31     0.005559
11     0.005559
61     0.004864
35     0.004864
117    0.004864
16     0.004864
         ...   
367    0.000695
294    0.000695
197    0.000695
111    0.000695
125    0.000695
137    0.000695
143    0.000695
101    0.000695
153    0.000695
154    0.000695
158    0.000695
164    0.000695
351    0.000695
87     0.000695
86     0.000695
373    0.000695
276    0.000695
376    0.000695
204    0.000695
220    0.000695
223    0.000695
227    0.000695
229    0.000695
384    0.000695
244    0.000695
51     0.000695
253    0.000695
354    0.000695
272    0.000695
239    0.000695
Name: org_id, Length: 39

In [339]:
# Number of non-adopted users per org_id, divided by the size of the non-adopted group.
df_nonadopted['org_id'].value_counts().div(len(df_nonadopted))

0      0.029259
1      0.020737
2      0.017801
3      0.014677
4      0.013730
6      0.012309
5      0.011173
9      0.010510
7      0.009753
10     0.008995
8      0.008333
14     0.007859
17     0.006533
12     0.006533
18     0.006439
11     0.006344
16     0.006155
20     0.005681
13     0.005681
28     0.005492
23     0.005303
15     0.005303
25     0.005303
30     0.005208
22     0.005208
24     0.005018
21     0.004924
40     0.004829
26     0.004829
27     0.004734
         ...   
295    0.000947
392    0.000947
301    0.000947
381    0.000947
265    0.000947
405    0.000947
232    0.000947
415    0.000947
407    0.000947
355    0.000852
396    0.000852
345    0.000852
305    0.000852
294    0.000852
356    0.000852
364    0.000758
378    0.000758
270    0.000758
304    0.000758
315    0.000758
353    0.000758
399    0.000758
397    0.000663
352    0.000663
235    0.000663
386    0.000663
387    0.000568
395    0.000568
400    0.000473
416    0.000189
Name: org_id, Length: 41

# Implementing a Random Forest Classifier to see the most important features

In [340]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Converting categorical to dummy variables: one indicator column per creation_source value
temp = df2.creation_source
encoding = pd.get_dummies(temp)
# The `join_axes` argument has been removed from pd.concat. `encoding` is derived from a
# column of df2 and therefore carries df2's index, so a plain column-wise concat already
# aligns row for row and keeps df2's index.
df2 = pd.concat([df2, encoding], axis=1)


In [341]:
# Keep the adoption flag as the model target, then remove it and the other
# non-feature columns so that df2 holds only the model inputs.
target = df2['label']
non_feature_columns = ['org_id', 'invited_by_user_id', 'creation_source', 'label', 'object_id']
df2 = df2.drop(labels=non_feature_columns, axis=1)
df2.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,1,0,1,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0


In [352]:
# implementing a random forest classifier
# A fixed random_state makes the fitted forest, and therefore the importances
# below, reproducible from one run to the next.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(df2.values, target)

# One importance score per column of df2, in column order.
important_features = rfc.feature_importances_
important_features

array([ 0.07617328,  0.04559856,  0.11097564,  0.0872925 ,  0.50123671,
        0.05242398,  0.12629931])

In [353]:
columns = df2.columns
print('The most important features are -')
# Pair each feature name with its importance score; both follow the column order of df2.
for each, importance in zip(columns, important_features):
    print('Feature name ', each, ', importance - ', importance)

The most important features are -
Feature name  opted_in_to_mailing_list , importance -  0.0761732835566
Feature name  enabled_for_marketing_drip , importance -  0.045598561532
Feature name  GUEST_INVITE , importance -  0.110975642559
Feature name  ORG_INVITE , importance -  0.0872925041879
Feature name  PERSONAL_PROJECTS , importance -  0.501236714514
Feature name  SIGNUP , importance -  0.0524239804648
Feature name  SIGNUP_GOOGLE_AUTH , importance -  0.126299313185


As we can see from the above analysis, the most important feature by far is 'Personal projects' (importance ≈ 0.50); 'Signup Google Auth' (≈ 0.13) and 'Guest Invite' (≈ 0.11) follow at a distance.