# Relax Data Science Challenge

---

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime as dt

Goal: Identify which factors predict future user adoption. <i><b>An adopted user is a user who has logged into the product on three separate days in at least one seven-day period. </b></i>

In [2]:
# Import takehome_users.csv
user_info_df = pd.read_csv('takehome_users.csv', encoding='latin-1')
user_info_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
user_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [4]:
# Clean the DataFrame
user_info_df = user_info_df.drop(columns=['name', 'email'])
# Fill NA values for last session created to 0 which is fine since there is no recorded time for a last session
user_info_df['last_session_creation_time'] = user_info_df['last_session_creation_time'].fillna(0)
# Fill NA values for invited by user id w/ random numbers not associated w a user id
user_info_df['invited_by_user_id'] = user_info_df['invited_by_user_id'].fillna(np.random.randint(max(user_info_df.object_id)+1, 
                                                                                                 100000000))
# Change the creation time column into a series of datetime objects
user_info_df['creation_time'] = pd.to_datetime(user_info_df['creation_time'])
user_info_df.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
# Import and organize the user engagement data
user_engagement_df = pd.read_csv('takehome_user_engagement.csv', index_col=['user_id'])
user_engagement_df['time_stamp'] = pd.to_datetime(user_engagement_df['time_stamp'])
user_engagement_df = user_engagement_df.sort_values(by=['user_id', 'time_stamp'])
user_engagement_df.head()

Unnamed: 0_level_0,time_stamp,visited
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2014-04-22 03:53:30,1
2,2013-11-15 03:45:04,1
2,2013-11-29 03:45:04,1
2,2013-12-09 03:45:04,1
2,2013-12-25 03:45:04,1


In [6]:
# Gather a list of all the adopted users from the user engagement data
t = pd.Timedelta('168:00:00')
adopted_user_list = []
curr_idx = user_engagement_df.index[0]
prev_date = user_engagement_df.iloc[user_engagement_df.index[0], 0]
visit_count = 1
for idx, row in user_engagement_df.iterrows():
    # If the adopted user has been added to the list, continue until you reach the next user
    if visit_count == 3 and idx == curr_idx:
        continue
    # If there's a new index, update current index and reset the prev date and visit count
    if idx != curr_idx:
        curr_idx = idx
        prev_date = row['time_stamp']
        visit_count = 1
        continue
    # If the user visited in the past 7 days, add 1 to the visit count
    if row['time_stamp'] - prev_date <= t:
        visit_count += 1
    # If the user visited again outside the 7 day window, reset the visit count to 1 and reset prev_date
    elif row['time_stamp'] - prev_date > t:
        prev_date = row['time_stamp']
        visit_count = 1
    # If the user visited 3 times in one week, add the user to the adopted list
    if visit_count == 3:
        adopted_user_list.append(curr_idx)

In [7]:
# Add a column to show who is an adopted user and who isn't from the take home user dataframe
user_info_df['adopted'] = user_info_df['object_id'].isin(adopted_user_list).astype(int)
user_info_df.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [8]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

In [9]:
cat_feature_list = ['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip',
                   'org_id', 'invited_by_user_id']
X = pd.DataFrame([])

# Convert all the values from the categorical feature variables into strings
for cat_feature in cat_feature_list:
    X[cat_feature] = user_info_df[cat_feature].astype(str)

# Label Encode the strings before running the Chi-Squared Test
le = LabelEncoder()
X = X.apply(le.fit_transform)
y = user_info_df['adopted']
chi2_res, pval = chi2(X, y)

# Expand the chi2 values to zipped original indices and chi-squared values
zip_chi2 = []
for (og_idx, chi) in enumerate(chi2_res):
    zip_chi2.append((og_idx, chi))

# Sort the chi-squared list
sort_chi2 = sorted(zip_chi2, key=lambda x:x[1], reverse=True)

# List the three most important categorical features
cat_list = []
most_important_cat = sort_chi2[:3]
for cat in most_important_cat:
    cat_list.append(cat_feature_list[cat[0]])
print(f'The three most important feature columns {cat_list[0]}, {cat_list[1]}, and {cat_list[2]}.')

The three most important feature columns org_id, invited_by_user_id, and opted_in_to_mailing_list.


The most important features in this dataset for future user adoption are ```org_id```, ```invited_by_user_id```, and ```opted_in_to_mailing_list```. We derived these features by using a chi-squared test on categorical features. The chi-squared test is a test for independence between a variable and the target. A chi-squared statistic is a measure of distance between the actual value of a predicted outcome to its expected value given that the predictor variable is independent of the outcome. Smaller chi-squared values signify that the predictor is independent of the target and has little effect on what the target value is. Higher chi-squared values signify dependence, which means that the variables affect the outcome of the target.