# Relax Challenge

Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

In [1]:
from datetime import datetime
from dateutil.parser import parse
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='darkgrid')
%matplotlib inline

  import pandas.util.testing as tm


## users dataframe

In [2]:
df = pd.read_csv('takehome_users.csv', encoding='cp1252') # import data
users = df.copy() # save a copy as users

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Takehome users column names and descriptions:
- name: the user's name
- object_id: the user's id
- email: email address
- creation_source: how their account was created. This takes on one of 5 values: 
    - PERSONAL_PROJECTS: invited to join another user's personal workspace
    - GUEST_INVITE: invited to an organization as a guest (limited permissions)
    - ORG_INVITE: invited to an organization (as a full member)
    - SIGNUP: signed up via the website
    - SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
- creation_time: when they created their account
- last_session_creation_time: unix timestamp of last login
- opted_in_to_mailing_list: whether they have opted into receiving marketing emails
- enabled_for_marketing_drip: whether they are on the regular marketing email drip
- org_id: the organization (group of users) they belong to
- invited_by_user_id: which user invited them to join (if applicable)

In [5]:
# rename object_id to user_id (since it's the user_id)
users = users.rename(columns={'object_id': 'user_id'})

users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


## engagement dataframe

In [6]:
data = pd.read_csv('takehome_user_engagement.csv', encoding='utf-8') # import data
engage = data.copy() # save a copy as engage

In [7]:
engage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
engage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


# finding adopted users

Using the user engagement csv file, we need to identify which users logged into the system least 3 times a week on separate days in any given week. We will create a new column in the user dataframe that labels Yes or No based on their fulfillment of the adopted user criteria.

We start by splitting the time stamp into date and time and keeping the date since we are looking for at least 3 unique dates. Then, we'll look for users who have less than 3 unique logins, remove them from the dataframe, and give them a No label under the new adopted user column.

In [9]:
# split time_stamp string
engage['date'] = [x.split(' ')[0] for x in engage.time_stamp]

# convert date from engage dataframe to timestamp datatype using parse
engage['date'] = [parse(x) for x in engage.date]

# remove time_stamp column
engage = engage.drop(columns=['time_stamp'])

# confirm changes
engage.head()

Unnamed: 0,user_id,visited,date
0,1,1,2014-04-22
1,2,1,2013-11-15
2,2,1,2013-11-29
3,2,1,2013-12-09
4,2,1,2013-12-25


In [10]:
# add week number
engage['week'] = engage['date'].dt.week

engage.head()

Unnamed: 0,user_id,visited,date,week
0,1,1,2014-04-22,17
1,2,1,2013-11-15,46
2,2,1,2013-11-29,48
3,2,1,2013-12-09,50
4,2,1,2013-12-25,52


In [11]:
# count how many unique times a user has logged in
logins = []

for x in engage.user_id.unique():
    counter = len(engage[engage.user_id == x]['date'].unique())
    logins.append([x, counter]) # user ID, login count
    
# remove users with less than 3 logins
less_than_3 = [] # save IDs in list

for x in logins:
    if x[1] < 3:
        less_than_3.append(x[0])

In [12]:
# remove rows with IDs from less_than_3
for x in less_than_3:
    engage = engage[engage.user_id != x]

In [13]:
# removed many rows, need to reset index
engage = engage.reset_index(drop=True)

# look at how many times users logged in per week
# engage[engage.week == 1].groupby('user_id').sum()

In [14]:
STOP

NameError: name 'STOP' is not defined

In [28]:
# function to locate users who logged in 3x times / week
def adopted_users(week_no, user_list):
    week = engage[engage.week == week_no].drop_duplicates() # drop duplicates
    week = week.groupby('user_id').count().reset_index() # locate by user_id
    
    # isolate users with more than 3 visits
    for x in range(len(week)):
        if week.visited[x] > 2:
            user_list.append(week.user_id[x])
            
    # return user_list

In [39]:
# week = 1
# check with Counter(engage[engage.week == 1]['user_id'])
week1_keep = []
adopted_users(1, week1_keep)

In [40]:
# week = 2
# check with Counter(engage[engage.week == 2]['user_id'])
week2_keep = []
adopted_users(2, week2_keep)

In [41]:
# week = 3
# check with Counter(engage[engage.week == 3]['user_id'])
week3_keep = []
adopted_users(3, week3_keep)

In [42]:
# week = 4
# check with Counter(engage[engage.week == 4]['user_id'])
week4_keep = []
adopted_users(4, week4_keep)

In [None]:
# week = 5
# check with Counter(engage[engage.week == 5]['user_id'])
week5_keep = []
adopted_users(5, week5_keep)

In [None]:
# week = 6
# check with Counter(engage[engage.week == 6]['user_id'])
week6_keep = []
adopted_users(6, week6_keep)

In [None]:
# week = 7
# check with Counter(engage[engage.week == 5]['user_id'])
week7_keep = []
adopted_users(7, week7_keep)

## labeling adopted users

In [20]:
# create new adopted column on users dataframe
users['adopted'] = users['user_id']

# replace with No based on user_id
users['adopted'] = users['adopted'].replace(less_than_3, 'No')

In [44]:
# replace with Yes based on user_id
users['adopted'] = users['adopted'].replace(week1_keep, 'Yes')
users['adopted'] = users['adopted'].replace(week2_keep, 'Yes')
users['adopted'] = users['adopted'].replace(week3_keep, 'Yes')
users['adopted'] = users['adopted'].replace(week4_keep, 'Yes')

In [48]:
users.adopted.value_counts()

No       6575
Yes       841
8190        1
2329        1
2322        1
         ... 
4938        1
10259       1
4936        1
4935        1
8193        1
Name: adopted, Length: 4586, dtype: int64

In [46]:
week1_keep

[10,
 42,
 63,
 69,
 81,
 82,
 87,
 146,
 153,
 160,
 168,
 197,
 202,
 203,
 209,
 230,
 245,
 263,
 297,
 310,
 322,
 445,
 471,
 483,
 509,
 510,
 529,
 535,
 553,
 564,
 603,
 605,
 639,
 669,
 679,
 680,
 724,
 728,
 754,
 772,
 783,
 804,
 845,
 882,
 885,
 901,
 906,
 912,
 934,
 937,
 943,
 980,
 1017,
 1018,
 1027,
 1055,
 1061,
 1072,
 1094,
 1099,
 1128,
 1129,
 1155,
 1196,
 1202,
 1280,
 1320,
 1339,
 1343,
 1345,
 1350,
 1357,
 1368,
 1407,
 1411,
 1421,
 1434,
 1464,
 1472,
 1476,
 1479,
 1485,
 1507,
 1576,
 1617,
 1624,
 1665,
 1693,
 1709,
 1730,
 1754,
 1769,
 1781,
 1783,
 1786,
 1811,
 1821,
 1822,
 1836,
 1865,
 1879,
 1885,
 1941,
 2002,
 2011,
 2013,
 2033,
 2042,
 2074,
 2078,
 2080,
 2103,
 2113,
 2159,
 2171,
 2193,
 2246,
 2253,
 2270,
 2271,
 2286,
 2289,
 2300,
 2316,
 2324,
 2333,
 2339,
 2390,
 2406,
 2425,
 2447,
 2474,
 2490,
 2502,
 2519,
 2539,
 2548,
 2557,
 2566,
 2568,
 2571,
 2604,
 2620,
 2622,
 2647,
 2658,
 2672,
 2723,
 2727,
 2739,
 2744,
 2