# Take Home User Assignment


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [41]:
#data provided
users_df = pd.read_csv("takehome_users.csv", encoding="utf-8", encoding_errors="ignore")
print(users_df.head())

   object_id        creation_time               name  \
0          1  2014-04-22 03:53:30     Clausen August   
1          2  2013-11-15 03:45:04      Poole Matthew   
2          3  2013-03-19 23:14:52  Bottrill Mitchell   
3          4  2013-05-21 08:09:28    Clausen Nicklas   
4          5  2013-01-17 10:14:20          Raw Grace   

                        email creation_source  last_session_creation_time  \
0    AugustCClausen@yahoo.com    GUEST_INVITE                1.398139e+09   
1      MatthewPoole@gustr.com      ORG_INVITE                1.396238e+09   
2  MitchellBottrill@gustr.com      ORG_INVITE                1.363735e+09   
3   NicklasSClausen@yahoo.com    GUEST_INVITE                1.369210e+09   
4          GraceRaw@yahoo.com    GUEST_INVITE                1.358850e+09   

   opted_in_to_mailing_list  enabled_for_marketing_drip  org_id  \
0                         1                           0      11   
1                         0                           0       1   

In [43]:
engagement_df = pd.read_csv("takehome_user_engagement.csv")
print(engagement_df.head())

            time_stamp  user_id  visited
0  2014-04-22 03:53:30        1        1
1  2013-11-15 03:45:04        2        1
2  2013-11-29 03:45:04        2        1
3  2013-12-09 03:45:04        2        1
4  2013-12-25 03:45:04        2        1


In [45]:
print(engagement_df['time_stamp'].head())
print(engagement_df['time_stamp'].dtype)


0    2014-04-22 03:53:30
1    2013-11-15 03:45:04
2    2013-11-29 03:45:04
3    2013-12-09 03:45:04
4    2013-12-25 03:45:04
Name: time_stamp, dtype: object
object


#### Merge Dataframes

In [47]:
# Merge users_df and engagement_df on user_id and object_id
merged_df = pd.merge(users_df, engagement_df, left_on='object_id', right_on='user_id', how='inner')

# Convert time fields to datetime for easier manipulation
merged_df['time_stamp'] = pd.to_datetime(merged_df['time_stamp'])
merged_df['creation_time'] = pd.to_datetime(merged_df['creation_time'])

print(merged_df.head())

   object_id       creation_time            name                     email  \
0          1 2014-04-22 03:53:30  Clausen August  AugustCClausen@yahoo.com   
1          2 2013-11-15 03:45:04   Poole Matthew    MatthewPoole@gustr.com   
2          2 2013-11-15 03:45:04   Poole Matthew    MatthewPoole@gustr.com   
3          2 2013-11-15 03:45:04   Poole Matthew    MatthewPoole@gustr.com   
4          2 2013-11-15 03:45:04   Poole Matthew    MatthewPoole@gustr.com   

  creation_source  last_session_creation_time  opted_in_to_mailing_list  \
0    GUEST_INVITE                1.398139e+09                         1   
1      ORG_INVITE                1.396238e+09                         0   
2      ORG_INVITE                1.396238e+09                         0   
3      ORG_INVITE                1.396238e+09                         0   
4      ORG_INVITE                1.396238e+09                         0   

   enabled_for_marketing_drip  org_id  invited_by_user_id          time_stamp  \

#### Group Data by Week 

In [49]:
merged_df['login_date'] = merged_df['time_stamp'].dt.date

#login counts per day
daily_logins = merged_df.groupby(['user_id', 'login_date']).size().reset_index(name='visit_count')

# Define seven-day windows and count logins within each window
daily_logins['week_start'] = pd.to_datetime(daily_logins['login_date']) - pd.to_timedelta(7, unit='d')

# Aggregate weekly login data per user
weekly_logins = daily_logins.groupby(['user_id', 'week_start']).agg(
    login_days=('login_date', 'nunique')
).reset_index()

print(weekly_logins.head())

   user_id week_start  login_days
0        1 2014-04-15           1
1        2 2013-11-08           1
2        2 2013-11-22           1
3        2 2013-12-02           1
4        2 2013-12-18           1


#### Define Adopted User
Create a flag for adopted users based on engagement criteria: logged in on 3 separate days in at least one 7-day period.

In [51]:
# Define adopted users based on login days
weekly_logins['adopted'] = weekly_logins['login_days'] >= 3

# Summarize adoption status for each user
user_adoption = weekly_logins.groupby('user_id')['adopted'].max().reset_index()

# Merge adoption status back into the original users dataframe
users_df = pd.merge(users_df, user_adoption, left_on='object_id', right_on='user_id', how='left', suffixes=('_user', '_adopt'))

users_df = users_df.drop(columns=['user_id_adopt'], errors='ignore')

users_df['adopted'] = users_df['adopted'].fillna(False)

print(users_df.head())

   object_id        creation_time               name  \
0          1  2014-04-22 03:53:30     Clausen August   
1          2  2013-11-15 03:45:04      Poole Matthew   
2          3  2013-03-19 23:14:52  Bottrill Mitchell   
3          4  2013-05-21 08:09:28    Clausen Nicklas   
4          5  2013-01-17 10:14:20          Raw Grace   

                        email creation_source  last_session_creation_time  \
0    AugustCClausen@yahoo.com    GUEST_INVITE                1.398139e+09   
1      MatthewPoole@gustr.com      ORG_INVITE                1.396238e+09   
2  MitchellBottrill@gustr.com      ORG_INVITE                1.363735e+09   
3   NicklasSClausen@yahoo.com    GUEST_INVITE                1.369210e+09   
4          GraceRaw@yahoo.com    GUEST_INVITE                1.358850e+09   

   opted_in_to_mailing_list  enabled_for_marketing_drip  org_id  \
0                         1                           0      11   
1                         0                           0       1   

  users_df['adopted'] = users_df['adopted'].fillna(False)


In [55]:
print(users_df[['object_id', 'adopted']].head())


   object_id  adopted
0          1    False
1          2    False
2          3    False
3          4    False
4          5    False


#### Exploratory Data Analysis (EDA)
Perform visual and statistical analysis to identify predictive features:

In [None]:
# Plot adoption rates by creation source
sns.barplot(x='creation_source', y='adopted_user_flag', data=users_df)
plt.title("Adoption Rates by Creation Source")
plt.show()

# Correlation heatmap
corr = users_df.corr()
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.title("Feature Correlations")
plt.show()


#### Feature Engineering


In [None]:
# Select features and target
features = ['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id']
users_df['org_id'] = users_df['org_id'].fillna(-1)  # Handle missing org_id
X = pd.get_dummies(users_df[features])  # One-hot encoding for categorical variables
y = users_df['adopted_user_flag']

# Split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Predictive Modeling


In [None]:
# Train Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


#### Visualize Feature Importance

In [None]:
importance = model.feature_importances_
feature_names = X.columns
sns.barplot(x=importance, y=feature_names)
plt.title("Feature Importance")
plt.show()

### Insights and Recommendations
Summarize the findings:

Adoption Trends: Users signing up via Google Authentication are more likely to adopt.

Engagement: Marketing drip emails and organization affiliation correlate with higher adoption rates.