In [4]:
import pandas as pd
import numpy as np

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

class BankModel:
    def __init__(self ):
        self.clf = None
        self.features = None
        self.test = None
        
    def fit(self, features, y_train, class_weight=None):
        self.features = features
        self.clf = LogisticRegression(class_weight=class_weight, random_state=1)
        
        # train the model
        self.clf.fit(self.features, y_train)
#         print(f"The best parameters are {self.clf.best_params_} with a score of {self.clf.best_score_} on validation data")
    
    def get_test_predict(self, text):
        self.test = text
        pred_val = self.predict(self.test)
        return pred_val
    
    def get_test_predict_proba(self, text):
        self.test = text
        pred_prob = self.clf.predict_proba(self.test)
        return pred_prob
        
    def get_metrics(self, y_test, pred_val):
        print("Report for test data \n\n", classification_report(y_test, pred_val))
        
    def predict(self, X):
        return self.clf.predict(X)

    def __call__(self, X):
        return self.predict(X)

In [13]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("out/cleanedDataClassOccupation.csv")
before_one_month = pd.read_csv("out/cleaned_Train_2.csv")
before_two_months = pd.read_csv("out/cleaned_Train_1.csv")
extended_data = data
extended_data["bal_current_account_one_month"] = before_one_month["bal_current_account"]
extended_data["bal_current_account_two_month"] = before_two_months["bal_current_account"]
extended_data["bal_savings_account_one_month"] = before_one_month["bal_savings_account"]
extended_data["bal_savings_accountt_two_month"] = before_two_months["bal_savings_account"]
# X = data.iloc[:, 1:38]  #independent columns
# y = data["target"]    #target column i.e price range
# cleaned_data_wo_client_id = data_cleaned.loc[:, data_cleaned.columns != 'client_id']
# prediction_data_probs = model.predict(prediction_data_wo_client_id)

Unnamed: 0,bal_current_account,bal_current_account_one_month
0,0.075714,0.146876
1,0.092381,0.096805
2,0.105238,0.104435
3,0.047619,0.046257
4,0.047619,0.046257
...,...,...
63692,0.051429,0.058178
63693,0.175714,0.152599
63694,0.186667,0.200286
63695,0.047619,0.046257


In [28]:

print(extended_data.shape)
# 1:extended_data.shape[1]
features = extended_data.iloc[:, (extended_data.columns != 'target') & (extended_data.columns != 'client_id')] # all features 
X_train, X_test, y_train, y_test = train_test_split(features, extended_data["target"] , test_size=0.15, stratify=data["target"], random_state=1)

(63697, 43)


### Class Weight
This is a crucial part of learning

In [29]:
from sklearn.utils import compute_class_weight
#Since we have a class imbalance let's create a dictionary with class weights to balance this. This step helps the model give equal attention to less frequent training examples, be making mistakes
#on these examples more costly.
classes = np.unique(y_train,return_counts=True)[0]
class_weights_arr = compute_class_weight(class_weight = 'balanced', classes = classes, y = y_train)
print(classes)
print(class_weights_arr)

class_weights_dict = {} #input to model.fit requires dictionary
for i in classes:
    class_weights_dict[i] = class_weights_arr[i]
print(class_weights_dict)

[0 1]
[ 0.515481   16.64883149]
{0: 0.5154809962678041, 1: 16.648831488314883}


In [30]:
%%time
model = BankModel()
model.fit(X_train, y_train, class_weight=class_weights_dict)
model.get_metrics(y_test, model.get_test_predict(X_test))

Report for test data 

               precision    recall  f1-score   support

           0       0.98      0.68      0.80      9268
           1       0.06      0.66      0.11       287

    accuracy                           0.68      9555
   macro avg       0.52      0.67      0.46      9555
weighted avg       0.96      0.68      0.78      9555

CPU times: user 1.97 s, sys: 94.7 ms, total: 2.07 s
Wall time: 627 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# TEST THE MODEL ON TEST DATA
prediction_data = pd.read_csv("out/cleanedDataNoClassOccupation.csv") # USE !pwd command to replace the path of `read_csv` if short ones dont work for you
prediction_before_one_month = pd.read_csv("out/cleaned_Test_2.csv")
prediction_before_two_months = pd.read_csv("out/cleaned_Test_1.csv")
extended_prediction_data = prediction_data
extended_prediction_data["bal_current_account_one_month"] = prediction_before_one_month["bal_current_account"]
extended_prediction_data["bal_current_account_two_month"] = prediction_before_two_months["bal_current_account"]
extended_prediction_data["bal_savings_account_one_month"] = prediction_before_one_month["bal_savings_account"]
extended_prediction_data["bal_savings_account_two_month"] = prediction_before_two_months["bal_savings_account"]
prediction_data_wo_client_id = extended_prediction_data.loc[:, extended_prediction_data.columns != 'client_id']
prediction_data_probs = model.get_test_predict_proba(prediction_data_wo_client_id)
probs = []
for arr in prediction_data_probs:
    probs.append(arr[1])
prediction_data["target"] = probs
prediction_data[["client_id","target"]].to_csv('out/predictions.csv',
                 sep=',', encoding='utf-8', index=False, header=False)

In [37]:
extended_prediction_data

Unnamed: 0,client_id,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,...,flanders_postal_code,wallonia_postal_code,other_postal_code,customer_occupation_code_0,customer_self_employed,bal_current_account_one_month,bal_current_account_two_month,bal_savings_account_one_month,bal_savings_account_two_month,target
0,ccf4cd93d5c32cd8a59809d54b4d53ac,0,0,0,0,0,0,0,0,1,...,1,0,0,1.000000,0,0.107296,0.083174,0.1038,0.1290,0.475068
1,56605a660d18549592653ff6941186f1,0,0,0,0,0,0,1,0,1,...,1,0,0,1.000000,0,0.048641,0.046367,0.3002,0.3002,0.404575
2,bda5f84c05e5695a7ec10550b457890f,0,0,0,0,0,1,1,0,1,...,1,0,0,1.000000,0,0.204101,0.202677,0.0740,0.0740,0.727583
3,a2f1c04bc3acf2222e658a897400798f,0,0,0,0,0,0,1,0,1,...,0,1,0,1.000000,0,0.215069,0.190249,0.5500,0.5500,0.468477
4,e83aadc3b0d25dbc12a35551afa25807,0,0,0,0,0,0,0,0,0,...,1,0,0,1.000000,1,0.047210,0.044933,0.2038,0.2038,0.360046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27295,1a634b5cfe6dbf35e77ce6519c0e3939,0,0,0,0,0,0,1,0,1,...,1,0,0,1.000000,0,0.327134,0.309273,0.0094,0.0094,0.511677
27296,84b7af2ddbeb919ab238ef7e6a83936c,0,0,0,0,0,0,0,0,0,...,0,1,0,1.000000,0,0.047210,0.044933,0.1308,0.1308,0.463272
27297,c257c09e3f4445d67b52a1007b0205df,0,0,0,0,0,0,0,0,0,...,0,1,0,1.000000,0,0.047210,0.044933,0.4024,0.4024,0.324093
27298,fdd7a1b7e58f642a1fc4867b8f7cdf3a,0,0,0,0,0,0,1,0,1,...,1,0,0,1.000000,0,0.114449,0.108987,0.3956,0.3956,0.427139
