<a href="https://colab.research.google.com/github/bee-llel/Machine-Learning/blob/master/LogisticRegression_kaggle_flo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("challenge1_train.csv", "challenge1_test.csv")
)

In [None]:
# Columns that are run through tryconvert (hexadecimal text -> float).
columns_hex = ["f2", "f3", "f13", "f18", "f20", "f26"]


def tryconvert(x):
    """Convert a hexadecimal string to a float, leaving non-strings untouched.

    Parameters
    ----------
    x : object
        A single cell value. Only ``str`` values are converted; anything else
        (for example a float NaN marking a missing value) is returned as-is.

    Returns
    -------
    object
        ``float(int(x, 16))`` when ``x`` is a string that parses as base-16,
        ``float(x)`` when it is a string that only parses as a decimal
        number, otherwise ``x`` unchanged.

    Raises
    ------
    ValueError
        If ``x`` is a string that is neither valid hexadecimal nor a valid
        decimal number.
    """
    if not isinstance(x, str):
        return x
    try:
        return float(int(x, 16))
    except ValueError:
        # Only a parse failure should trigger the decimal fallback; a bare
        # ``except`` would also swallow KeyboardInterrupt and real bugs.
        return float(x)


# Apply the hex parser element-wise to every listed column, in both splits,
# so train and test end up with the same numeric representation.
for hex_col in columns_hex:
    train[hex_col] = train[hex_col].apply(tryconvert)
    test[hex_col] = test[hex_col].apply(tryconvert)

In [None]:
# Numeric columns: impute missing values with the training median, then
# standardize with statistics fitted on the training split.
print("Fill missing values in numerical attribute not categorical")
features = ["f2", "f3", "f4", "f20"]
for f in features:
    # The median is computed on train only and reused for test, so no
    # test-set statistics enter the preprocessing.
    median = train[f].median()
    # Assign the filled column back rather than calling
    # fillna(inplace=True) on the column selection: the in-place form acts on
    # an intermediate object and does not update the DataFrame under pandas
    # Copy-on-Write (and emits a warning on recent pandas versions).
    train[f] = train[f].fillna(median)
    test[f] = test[f].fillna(median)

print("Scale numerical value")
for f in features:
    # One scaler per column: fitted on train, then applied unchanged to test.
    scale = StandardScaler()
    train[f] = scale.fit_transform(train[[f]])
    test[f] = scale.transform(test[[f]])

Fill missing values in numerical attribute not categorical
Scale numerical value


In [None]:
print("Fill missing values in categorical attributes")

# Each categorical column is assigned to exactly one imputation strategy.
most_frequent = ["f5", "f6", "f7", "f10", "f14", "f22", "f24"]
new_cat = ["f0", "f9", "f11", "f13", "f15", "f18", "f26"]
b_fill = ["f1", "f8", "f12", "f16", "f17", "f19", "f21", "f23", "f25", "f27", "f28"]

# Strategy 1: replace missing values with the most common training category
# (the same value is used for the test split).
for f in most_frequent:
    most_frequent_category = train[f].mode()[0]
    # Assign back instead of fillna(inplace=True) on a column selection; the
    # in-place form does not update the DataFrame under pandas Copy-on-Write.
    train[f] = train[f].fillna(most_frequent_category)
    test[f] = test[f].fillna(most_frequent_category)

# Strategy 2: treat "missing" as a category of its own.
# NOTE(review): np.where returns a NumPy array; for numeric columns such as
# f13 the values appear to end up as strings alongside "Unknown_miss" -- confirm
# before swapping this for Series.fillna, which would not do that conversion.
for f in new_cat:
    train[f] = np.where(train[f].isnull(), "Unknown_miss", train[f])
    test[f] = np.where(test[f].isnull(), "Unknown_miss", test[f])

# Strategy 3: back-fill each gap with the next valid value below it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
# NOTE(review): a gap at the very end of a column has no later value and
# stays missing; the result also depends on row order -- confirm this is intended.
for f in b_fill:
    train[f] = train[f].bfill()
    test[f] = test[f].bfill()

Fill missing values in categorical attributes


In [None]:
print("Numerical categorical attribute encoded")
features = ["f6", "f8", "f10", "f13", "f16", "f17", "f18", "f19", "f21", "f22", "f25", "f26", "f28"]

# Collect the encoded frames and concatenate once after the loop instead of
# re-copying the whole DataFrame on every iteration.
train_parts = [train]
test_parts = [test]

for f in features:
    # Fit on train only; the same fitted encoder is applied to test so both
    # splits get identical indicator columns.
    encoder = OneHotEncoder()
    feature_arr = encoder.fit_transform(train[[f]]).toarray()
    feature_arr_test = encoder.transform(test[[f]]).toarray()

    # Column j of the encoded matrix corresponds to encoder.categories_[0][j].
    # Labels must therefore come from the encoder, not from train[f].unique(),
    # whose order-of-appearance ordering attaches names to the wrong columns.
    feature_labels_f = [f + "_" + str(category) for category in encoder.categories_[0]]

    # Reuse the source frame's index so concat(axis=1) aligns row-for-row
    # even if the frame does not carry a default 0..n-1 index.
    train_parts.append(pd.DataFrame(feature_arr, columns=feature_labels_f, index=train.index))
    test_parts.append(pd.DataFrame(feature_arr_test, columns=feature_labels_f, index=test.index))

train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

Numerical categorical attribute encoded


In [None]:
print("String categorical attribute encoded")
features = ["f0", "f1", "f5", "f7", "f9", "f11", "f12", "f14", "f15",
            "f23", "f24", "f27"]

# Collect the encoded frames and concatenate once after the loop.
train_parts = [train]
test_parts = [test]

for f in features:
    # OneHotEncoder is applied to the string column directly. The previous
    # intermediate OrdinalEncoder pass produced the same indicator columns
    # but overwrote the raw column with codes (so the cell failed if run
    # twice) and had its fitted categories_ array renamed in place.
    encoder = OneHotEncoder()
    feature_arr = encoder.fit_transform(train[[f]]).toarray()
    feature_arr_test = encoder.transform(test[[f]]).toarray()

    # Build the labels as a new list so the fitted encoder state is left
    # untouched; column j corresponds to encoder.categories_[0][j].
    feature_labels = [f + "_" + str(category) for category in encoder.categories_[0]]

    # Reuse the source frame's index so concat(axis=1) aligns row-for-row.
    train_parts.append(pd.DataFrame(feature_arr, columns=feature_labels, index=train.index))
    test_parts.append(pd.DataFrame(feature_arr_test, columns=feature_labels, index=test.index))

train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

String categorical attribute encoded


In [None]:
# f2, f3, f4 and f20 are kept; every other raw column f0..f28 is removed from
# both splits now that its indicator columns have been appended.
kept_numeric_ids = {2, 3, 4, 20}
raw_columns_to_drop = [
    "f" + str(col_id) for col_id in range(29) if col_id not in kept_numeric_ids
]
train = train.drop(columns=raw_columns_to_drop)
test = test.drop(columns=raw_columns_to_drop)

train.head()

Unnamed: 0,id,target,f2,f3,f4,f20,f6_0.0,f6_1.0,f6_2.0,f8_4.0,f8_6.0,f8_5.0,f8_1.0,f8_2.0,f8_3.0,f10_1.0,f10_10.0,f10_0.0,f10_11.0,f13_4182685748.0,f13_23716334086.0,f13_33684466775.0,f13_6004677120.0,f13_14278058011.0,f13_24288028185.0,f13_57057785775.0,f13_31306885476.0,f13_40678277727.0,f13_65529504716.0,f13_32314880168.0,f13_50891864608.0,f13_58272053479.0,f13_31234525649.0,f13_41980480889.0,f13_23748412273.0,f13_67573361372.0,f13_19407661954.0,f13_Unknown_miss,f13_5889044980.0,f13_7073006014.0,...,f15_K,f15_L,f15_M,f15_N,f15_O,f15_P,f15_Q,f15_R,f15_S,f15_T,f15_U,f15_Unknown_miss,f15_V,f15_W,f15_X,f15_Y,f15_Z,f23_A,f23_B,f23_C,f23_D,f23_E,f23_F,f24_N,f24_S,f27_a,f27_b,f27_c,f27_d,f27_e,f27_f,f27_g,f27_h,f27_i,f27_j,f27_k,f27_l,f27_m,f27_n,f27_o
0,0,0,-1.456242,0.875097,-0.633735,0.903143,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,1,-1.237549,-0.929716,-1.184213,-0.150117,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,0,-0.330477,0.241535,-1.407721,0.925119,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,0.34799,1.287282,-0.385688,1.208644,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0,0.091534,1.565784,0.235092,-0.027925,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Separate identifiers, target and feature matrix by column name. Selecting
# by name (rather than by position) keeps "id" and "target" out of the
# features even if the column order of the input files changes.
id_train = train["id"]
Y_train = train["target"]
X_train = train.drop(columns=["id", "target"])

# The test frame is only stripped of "id" here.
# NOTE(review): assumes the test file carries no "target" column -- confirm.
id_test = test["id"]
X_test = test.drop(columns=["id"])

In [None]:
# Two-step model: keep the 1200 columns with the highest f_classif score,
# then fit an L2-penalised logistic regression (liblinear solver) on them.
skb_lr = Pipeline(steps=[
    ('skb', SelectKBest(score_func=f_classif, k=1200)),
    ('lr', LogisticRegression(
        penalty='l2',
        C=0.09,
        class_weight=None,
        solver='liblinear',
        max_iter=1000,
    )),
])

In [None]:
# Fit the selector and the classifier together on the training split; the
# fitted pipeline is the cell's displayed value.
skb_lr.fit(X=X_train, y=Y_train)

Pipeline(memory=None,
         steps=[('skb',
                 SelectKBest(k=1200,
                             score_func=<function f_classif at 0x7f43430764d0>)),
                ('lr',
                 LogisticRegression(C=0.09, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [None]:
# ROC AUC on the same rows the pipeline was fitted on, scored with the
# second predict_proba column. This is a training-set figure, not a
# held-out estimate.
roc_auc_score(
    y_true=Y_train,
    y_score=skb_lr.predict_proba(X_train)[:, 1],
)

0.7875263290191717

In [None]:
# Second predict_proba column for every test row, used as the submitted score.
y_pred = skb_lr.predict_proba(X_test)[:, 1]

# Build the submission frame in one step. The ids are taken positionally
# (to_numpy) so each id is paired with the prediction for the same row even
# when the id Series does not carry a default 0..n-1 index, which the former
# per-row id_test[i] label lookup required.
test_predictions_df = pd.DataFrame({'id': id_test.to_numpy(), 'target': y_pred})
test_predictions_df.to_csv('skb_lr_predictions.csv', index=False)