In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

RANDOM_SEED = 42

In [2]:
df = pd.read_csv("../data/skillshare_combined.csv", index_col=0)
df.create_time = pd.to_datetime(df.create_time)
df

Unnamed: 0,user_uid,create_time,success,is_cancelled,payment_provider_cat_codes,payment_ux_cat_codes,trial_length_offer_cat_codes,sub_utm_channel_cat_codes,sub_utm_source_cat_codes,day-1,...,day-31,comment_volume,comment_score,discussion_volume,discussion_score,follow_volume,projects_volume,projects_score,review_volume,rating_avg
0,23692129,2022-01-01 00:00:02,0,True,2,2,0,10,30,60.0,...,0.0,,,,,,,,,
1,23674285,2022-01-01 00:00:27,0,False,-1,-1,0,2,116,0.0,...,0.0,,,,,,,,,
2,16119588,2022-01-01 00:01:19,1,False,1,5,0,10,30,347.0,...,0.0,,,,,,,,,
3,23693101,2022-01-01 00:01:42,0,False,2,2,0,16,55,1211.0,...,0.0,,,,,,,,,
4,23692349,2022-01-01 00:02:04,1,False,2,2,0,16,55,0.0,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475443,26284161,2022-05-31 23:56:47,0,False,2,2,0,0,35,0.0,...,0.0,,,,,,,,,
475444,26259150,2022-05-31 23:56:53,0,False,2,2,0,17,227,0.0,...,0.0,,,,,,,,,
475445,26284159,2022-05-31 23:57:46,0,True,1,5,0,17,227,0.0,...,0.0,,,,,,,,,
475446,9742534,2022-05-31 23:58:00,0,True,1,5,0,10,30,5.0,...,0.0,,,,,,,,,


In [4]:
length = len(df.success.dropna())
df.success.value_counts() / length

0    0.760815
1    0.239185
Name: success, dtype: float64

In [3]:
# One hot encodings for categorical columns
categorical_features = [column for column in df.columns if "cat_codes" in column]
for column in categorical_features:
    if "cat_codes" in column:
        series = df[column]
        ohe = pd.get_dummies(series, prefix=column)
        print(ohe.columns)
        df.drop(columns=[column], inplace=True)
        df = pd.merge(df, ohe, left_index=True, right_index=True)

Index(['payment_provider_cat_codes_-1', 'payment_provider_cat_codes_0',
       'payment_provider_cat_codes_1', 'payment_provider_cat_codes_2',
       'payment_provider_cat_codes_3'],
      dtype='object')
Index(['payment_ux_cat_codes_-1', 'payment_ux_cat_codes_0',
       'payment_ux_cat_codes_1', 'payment_ux_cat_codes_2',
       'payment_ux_cat_codes_3', 'payment_ux_cat_codes_4',
       'payment_ux_cat_codes_5', 'payment_ux_cat_codes_6',
       'payment_ux_cat_codes_7', 'payment_ux_cat_codes_8',
       'payment_ux_cat_codes_9'],
      dtype='object')
Index(['trial_length_offer_cat_codes_0', 'trial_length_offer_cat_codes_1'], dtype='object')
Index(['sub_utm_channel_cat_codes_0', 'sub_utm_channel_cat_codes_1',
       'sub_utm_channel_cat_codes_2', 'sub_utm_channel_cat_codes_3',
       'sub_utm_channel_cat_codes_4', 'sub_utm_channel_cat_codes_5',
       'sub_utm_channel_cat_codes_6', 'sub_utm_channel_cat_codes_7',
       'sub_utm_channel_cat_codes_8', 'sub_utm_channel_cat_codes_9',
      

In [4]:
df = df.fillna(0)
df

Unnamed: 0,user_uid,create_time,success,is_cancelled,day-1,day-2,day-3,day-4,day-5,day-6,...,sub_utm_source_cat_codes_221,sub_utm_source_cat_codes_222,sub_utm_source_cat_codes_223,sub_utm_source_cat_codes_224,sub_utm_source_cat_codes_225,sub_utm_source_cat_codes_226,sub_utm_source_cat_codes_227,sub_utm_source_cat_codes_228,sub_utm_source_cat_codes_229,sub_utm_source_cat_codes_230
0,23692129,2022-01-01 00:00:02,0,True,60.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,23674285,2022-01-01 00:00:27,0,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,16119588,2022-01-01 00:01:19,1,False,347.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,23693101,2022-01-01 00:01:42,0,False,1211.0,266.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,23692349,2022-01-01 00:02:04,1,False,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475443,26284161,2022-05-31 23:56:47,0,False,0.0,21.0,2.0,14.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
475444,26259150,2022-05-31 23:56:53,0,False,0.0,1508.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
475445,26284159,2022-05-31 23:57:46,0,True,0.0,217.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
475446,9742534,2022-05-31 23:58:00,0,True,5.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
y = df.success.values
mmscaler = MinMaxScaler()
X = mmscaler.fit_transform(df.drop(columns=["user_uid", "create_time", "success", "is_cancelled"]))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)

In [8]:
mlp = MLPClassifier(
    hidden_layer_sizes=100, random_state=RANDOM_SEED
)
mlp.fit(X_train, y_train)
mlp.score(X_test, y_test)

0.7624472076862243