In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [67]:
# Load data
users_df = pd.read_csv("takehome_users.csv", encoding='latin-1')
engagement_df = pd.read_csv("takehome_user_engagement.csv")

In [68]:
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])


In [69]:
def is_adopted(dates):
    dates = sorted(dates)
    for i in range(len(dates) - 2):
        if (dates[i+2] - dates[i]).days <= 7:
            return 1
    return 0


In [70]:
user_logins = engagement_df.groupby('user_id')['time_stamp'].apply(is_adopted).reset_index()
user_logins.columns = ['object_id', 'adopted']

In [71]:
users_df = users_df.merge(user_logins, on='object_id', how='left')
users_df['adopted'] = users_df['adopted'].fillna(0).astype(int)

In [72]:
user_logins.head()

Unnamed: 0,object_id,adopted
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


In [73]:
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [74]:
# Merge adoption label with user dataset
users_df = users_df.merge(user_logins[['object_id', 'adopted']], left_on='object_id', right_on='object_id', how='left')
users_df['adopted'] = users_df['adopted'].fillna(0).astype(int)

KeyError: 'adopted'

In [None]:
users_df['creation_time'] = pd.to_datetime(users_df['creation_time'])
users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'], unit='s', errors='coerce')

users_df['account_lifetime_days'] = (users_df['last_session_creation_time'] - users_df['creation_time']).dt.days.fillna(0)
users_df['was_invited'] = users_df['invited_by_user_id'].notnull().astype(int)
users_df['org_size'] = users_df['org_id'].map(users_df['org_id'].value_counts())


In [None]:
# Prepare final dataset
df = users_df[[
    'adopted', 'creation_source', 'opted_in_to_mailing_list',
    'enabled_for_marketing_drip', 'was_invited',
    'account_lifetime_days', 'org_size'
]]

In [None]:
df = pd.get_dummies(df, columns=['creation_source'], drop_first=True)


In [None]:
print("\n=== ADOPTION RATE ===")
print(df['adopted'].value_counts(normalize=True))


In [None]:
print("\n=== Feature Summary ===")
print(df.describe(include='all'))

In [None]:
# Correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Boxplots for numerical features
for col in ['account_lifetime_days', 'org_size']:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x='adopted', y=col, data=df)
    plt.title(f"{col} vs Adopted")
    plt.show()


In [None]:
# Count plots for categorical
for col in [c for c in df.columns if "creation_source_" in c]:
    sns.barplot(x=df[col], y=df['adopted'])
    plt.title(f"{col} vs Adoption Rate")
    plt.ylabel("Adoption Rate")
    plt.show()

In [None]:
X = df.drop('adopted', axis=1)
y = df['adopted']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))


In [None]:
# Feature importance
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

In [None]:
print("\n=== Feature Importances ===")
print(importances)



In [None]:
# Plot
plt.figure(figsize=(10, 5))
sns.barplot(x=importances.values, y=importances.index)
plt.title("Random Forest Feature Importances")
plt.tight_layout()
plt.show()

Here its clear that account-lifetime-days & org_size are the most important features for classification of adopted or not