In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [14]:
user_engagement_path = '/Users/babyhandzzz/Desktop/ELEPH@NT/Datasets/relax_challenge/takehome_user_engagement.csv'
users_path = '/Users/babyhandzzz/Desktop/ELEPH@NT/Datasets/relax_challenge/takehome_users.csv'
users = pd.read_csv(users_path,encoding='ISO-8859-1',parse_dates=True)

In [15]:
def load_user_engagement(path):
    df = pd.read_csv(path, infer_datetime_format=True)
    df['time_stamp'] = pd.to_datetime(df.time_stamp)    
    df['date'] = df['time_stamp'].dt.date
    df.set_index('time_stamp',inplace=True)
    return df
    
user_engagement = load_user_engagement(user_engagement_path)    

# Generating labels for user calssification

* How to find out if the user is "adopted" or not?
* "Defining an 'adopted user' as a user who has logged into the product on three separate days in at least one seven-­day period..."

1.After going through the data it looks like there are no instances of multiple logins on a single day. 

2.This means that one day in observations correlates to exactly one login.

3.So the data can be aggregated with pandas resample (count) method on a 7 days basis. 

4.The resulting aggregation has counts of all logins per any 7-day period.

5.If any one of those periods has a count of 3 or more, the user is considred to be "adopted". (According to the definition from the assignment pdf)

Output list below proves that any given day only has one login recorded.
Number of unique dates for one user is equal to number of entries.

In [17]:
list_ = []

for i in unique_id:
    df = user_engagement[user_engagement.user_id==i]
    true_false = len(df.date.unique()) == df.shape[0] 
    list_.append(true_false)

print(np.unique(np.array(list_)))    

[ True]


# Actually generating labels

In [28]:
def generate_user_status_table():
    
    user_id = []
    target = []
    unique_id = np.unique(np.array(user_engagement.user_id)) # all of the unique users' ids

    for user in unique_id:
    
        user_id.append(user)

        logins = user_engagement[user_engagement.user_id==user].resample('7D').count()
        logins = logins['visited']
        
        if len(logins)>1:
            logins = max(logins)
            if logins >= 3:
                target.append(1) # append 1 if any given 7 day period has 3 logins
            else:
                target.append(0) # append 0 if there are no 3 logins within any number of 7-day activity periods
        else:
            target.append(0) # append 0 if there are not enough days for a 7-day period construction
        
        label_df = pd.DataFrame({'object_id':user_id, 'label':target})
        
    return label_df

label_df = generate_user_status_table()    

# What fraction of users is engaging with the service?

In [177]:
#What fraction of users had at least one login?:
print("Fraction of users that logged-in at least once: {}".format(label_df.shape[0]/users.shape[0]))

Fraction of users that logged-in at least once: 0.73525


In [75]:
#Out of those users who had at least one login, what fraction is classified as "adopted"/"not adopted"?

#0 - user is not adopted
#1 - user is adopted """

label_df.label.value_counts(normalize=True)

0    0.83407
1    0.16593
Name: label, dtype: float64

In [191]:
# Dataframe that contains users and labels
labeled_users = label_df.merge(users,on='object_id',how='left')

# Preparig the resulting dataframe for modeling:

1. Dropping:
    * creation_time
    * name 
    * email 
    * last_session_creation_time

2. Encodind:
    * creation_source - one-hot
    * org_id - one-hot
    * ivited_by_user_id - binary

3. Already encoded features:
    * opted_in_to_mailing_list
    * enabled_for_marketing_drip

In [193]:
features_to_keep = [ 'creation_source', 'org_id', 'opted_in_to_mailing_list','enabled_for_marketing_drip','invited_by_user_id', 'label']
labeled_users = labeled_users[features_to_keep]

In [226]:
def prepare_data_for_ml(df):
    # was a user invited or not? (if user is not invited there is a NaN)
    df['invited_by_user_id'].fillna(0,inplace=True)
    df['invited_by_user_id'] = np.where(df['invited_by_user_id'] == 0, 0, 1)
    # 
    ohe_org_id = pd.get_dummies(df['org_id'],drop_first=True)
    # how can I decrease memory use here? deleting dataframes?
    ohe_creation_source = pd.get_dummies(df['creation_source'],drop_first=True)
    df = pd.concat([ohe_org_id,ohe_creation_source, df[df.columns[2:]]],axis=1)

    return df

ml_data = prepare_data_for_ml(labeled_users)    

In [241]:
y = ml_data[[ml_data.columns[-1]]]
X = ml_data[ml_data.columns[:-1]]

In [242]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [243]:
xgb = GradientBoostingClassifier(n_estimators=100,max_depth=3)
xgb.fit(X_train,y_train)

GradientBoostingClassifier()

In [244]:
train_score = xgb.score(X_train,y_train)
test_score = xgb.score(X_test,y_test)

print('Train score: {}'.format(train_score))
print('Test score: {}'.format(test_score))
print('Classification Report')
print(classification_report(y_test, xgb.predict(X_test)))

Train score: 0.8418203349687025
Test score: 0.8241758241758241
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.99      0.90      2412
           1       0.17      0.01      0.01       500

    accuracy                           0.82      2912
   macro avg       0.50      0.50      0.46      2912
weighted avg       0.71      0.82      0.75      2912



# There is a huge class imbalance, next step is to find a work around it.