In [29]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LogisticRegression

### Train data

In [2]:
x_log = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/train_log_transformed.csv')

In [3]:
x_log.head(1)

Unnamed: 0,msno,date_count,date_first,num_25_sum,num_25_mean,num_25_std,num_50_sum,num_50_mean,num_50_std,num_75_sum,...,num_985_std,num_100_sum,num_100_mean,num_100_std,num_unq_sum,num_unq_mean,num_unq_std,total_secs_sum,total_secs_mean,total_secs_std
0,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,462,2015-01-01T00:00:00,712,1.541126,2.550453,153,0.331169,0.725027,144,...,0.781416,7324,15.852814,17.80626,6118,13.242424,14.098572,1949101.002,4218.833338,4631.316598


In [4]:
member = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/train_member_transformed.csv')

In [5]:
member = member.sort_values(by=['msno'])

In [6]:
member.head(1)

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,register_days
35439,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,0,13,22,male,9,2014-12-10,783


In [7]:
df_merged = pd.merge(x_log,member,on=['msno'],how='inner')

In [8]:
df_merged.shape

(53483, 31)

In [9]:
df_merged = df_merged.drop(['date_first'], axis=1)

In [10]:
df_merged.head(1)

Unnamed: 0,msno,date_count,num_25_sum,num_25_mean,num_25_std,num_50_sum,num_50_mean,num_50_std,num_75_sum,num_75_mean,...,total_secs_sum,total_secs_mean,total_secs_std,is_churn,city,bd,gender,registered_via,registration_init_time,register_days
0,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,462,712,1.541126,2.550453,153,0.331169,0.725027,144,0.311688,...,1949101.002,4218.833338,4631.316598,0,13,22,male,9,2014-12-10,783


In [11]:
X_train = df_merged.iloc[:, 1:23]

In [12]:
X_train.shape

(53483, 22)

In [13]:
X_train.columns

Index(['date_count', 'num_25_sum', 'num_25_mean', 'num_25_std', 'num_50_sum',
       'num_50_mean', 'num_50_std', 'num_75_sum', 'num_75_mean', 'num_75_std',
       'num_985_sum', 'num_985_mean', 'num_985_std', 'num_100_sum',
       'num_100_mean', 'num_100_std', 'num_unq_sum', 'num_unq_mean',
       'num_unq_std', 'total_secs_sum', 'total_secs_mean', 'total_secs_std'],
      dtype='object')

In [14]:
num_attribs = ['date_count', 'num_25_sum', 'num_25_mean', 'num_25_std', 'num_50_sum',
       'num_50_mean', 'num_50_std', 'num_75_sum', 'num_75_mean', 'num_75_std',
       'num_985_sum', 'num_985_mean', 'num_985_std', 'num_100_sum',
       'num_100_mean', 'num_100_std', 'num_unq_sum', 'num_unq_mean',
       'num_unq_std', 'total_secs_sum', 'total_secs_mean', 'total_secs_std']

In [15]:
y_train = df_merged.iloc[:,23]

In [17]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [18]:
num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
    ('std_scaler',StandardScaler())
])

In [19]:
full_pipeline = FeatureUnion(transformer_list=[
    #("cat_pipeline",cat_pipeline),
    ("num_pipeline",num_pipeline)
])

In [33]:
X_train_prepared = full_pipeline.fit_transform(X_train)

### Validation data

In [21]:
## Get validation data
x_log_val = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/val_user_logs_transformed.csv')

In [22]:
x_log_val.head(1)

Unnamed: 0,msno,date_count,date_first,num_25_sum,num_25_mean,num_25_std,num_50_sum,num_50_mean,num_50_std,num_75_sum,...,num_985_std,num_100_sum,num_100_mean,num_100_std,num_unq_sum,num_unq_mean,num_unq_std,total_secs_sum,total_secs_mean,total_secs_std
0,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,466,2015-01-06T00:00:00,1157,2.482833,3.640974,558,1.197425,2.478222,309,...,2.163547,4933,10.585837,14.414678,6858,14.716738,16.146014,1435438.386,3080.339884,4020.515098


In [23]:
y_val = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/val_members.csv')

In [25]:
val = pd.merge(x_log_val,y_val,on=['msno'],how='inner').drop(['date_first'], axis=1)

In [35]:
X_val = val.iloc[:, 1:23]
y_val = val.iloc[:,23]
X_val_prepared = full_pipeline.fit_transform(X_val)

### Tune Hyper

In [37]:
from sklearn.metrics import precision_recall_fscore_support, f1_score
def tune_parameters(solver,C_param_list,penalty):
    table = pd.DataFrame(columns = ['C_parameter','f1'])
    table['C_parameter'] = C_param_list
    
    j = 0
    for i in C_param_list:
        lr = LogisticRegression(C = i,max_iter=1500,penalty=penalty,tol=7e-4,random_state=24,solver=solver)
        lr.fit(X_train_prepared,y_train)
        y_val_pred = lr.predict(X_val_prepared)
        table.iloc[j,1] = f1_score(y_val,y_val_pred)
        j+=1
    
    return table

In [38]:
# solver = "lbfgs", penalty = "l2"
lbfgs_l2 = tune_parameters("lbfgs",[0.1,1,10,100,1000],"l2")

In [39]:
lbfgs_l2 # 1 is best

Unnamed: 0,C_parameter,f1
0,0.1,0.118573
1,1.0,0.119169
2,10.0,0.119169
3,100.0,0.119169
4,1000.0,0.119169


In [42]:
# solver = "saga", penalty = "l2"
saga_l2 = tune_parameters("saga",[0.01,0.1,1,10],"l2")

In [43]:
saga_l2  # 1 is best

Unnamed: 0,C_parameter,f1
0,0.01,0.114151
1,0.1,0.118409
2,1.0,0.118354
3,10.0,0.118299


In [44]:
# solver = "saga", penalty = "l1"
saga_l1 = tune_parameters("saga",[0.1,1,10,100],"l1")

In [45]:
saga_l1 # 1 is best

Unnamed: 0,C_parameter,f1
0,0.1,0.118143
1,1.0,0.118409
2,10.0,0.118299
3,100.0,0.118299


### Model and Test

In [46]:
log = LogisticRegression(C = 1,max_iter=1500,penalty='l2',tol=7e-4,random_state=24,solver='lbfgs')
log.fit(X_train_prepared,y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=24, solver='lbfgs', tol=0.0007, verbose=0,
                   warm_start=False)

In [47]:
## Get validation data
x_log_test = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/test_user_logs_transformed.csv')

In [48]:
y_test = pd.read_csv('../../kkbox-churn-prediction-challenge/50_under_sample/test_members.csv')

In [49]:
test = pd.merge(x_log_test,y_test,on=['msno'],how='inner').drop(['date_first'], axis=1)

In [51]:
X_test = test.iloc[:, 1:23]
y_test = test.iloc[:,23]
X_test_prepared = full_pipeline.fit_transform(X_test)

In [54]:
y_test_pred=log.predict(X_test_prepared)
precision_recall_fscore_support(y_test,y_test_pred,average='binary')

(0.10034364261168385, 0.18295739348370926, 0.1296049711495783, None)

In [56]:
log.coef_.shape

(1, 22)