In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

class BankModel:
    """Thin wrapper around a scikit-learn ``LogisticRegression`` classifier.

    Attributes:
        clf: The fitted ``LogisticRegression`` instance, or ``None`` until
            ``fit`` has completed successfully.
        features: The feature matrix passed to the most recent successful ``fit``.
        test: The input most recently passed to ``get_test_predict`` or
            ``get_test_predict_proba``.
    """

    class NotFittedError(ValueError, AttributeError):
        """Raised when a prediction is requested before ``fit`` has been called.

        Inherits from ``AttributeError`` so that code which caught the error
        previously raised by calling a method on ``None`` keeps working.
        """

    def __init__(self):
        self.clf = None
        self.features = None
        self.test = None

    def _require_fitted(self):
        """Return the underlying classifier, or raise if ``fit`` was never called."""
        if self.clf is None:
            raise BankModel.NotFittedError(
                "BankModel has not been fitted yet; call fit() before predicting."
            )
        return self.clf

    def fit(self, features, y_train, class_weight=None, max_iter=100):
        """Train a logistic regression on ``features`` / ``y_train``.

        Args:
            features: Training feature matrix.
            y_train: Training labels aligned with ``features``.
            class_weight: Forwarded to ``LogisticRegression(class_weight=...)``.
            max_iter: Maximum number of solver iterations. The default (100)
                matches scikit-learn's; raise it (or scale the features) if the
                solver reports that the iteration limit was reached.
        """
        clf = LogisticRegression(
            class_weight=class_weight, max_iter=max_iter, random_state=1
        )
        clf.fit(features, y_train)
        # Publish state only after a successful fit so a failed fit never
        # leaves a half-initialised estimator behind.
        self.features = features
        self.clf = clf

    def get_test_predict(self, text):
        """Remember ``text`` as the latest test input and return its predicted labels.

        The parameter is called ``text`` for backward compatibility; it is the
        feature matrix to predict on.
        """
        self.test = text
        return self.predict(self.test)

    def get_test_predict_proba(self, text):
        """Remember ``text`` as the latest test input and return class probabilities.

        Returns whatever ``clf.predict_proba`` returns for ``text``.
        """
        self.test = text
        return self._require_fitted().predict_proba(self.test)

    def get_metrics(self, y_test, pred_val):
        """Print a classification report comparing ``y_test`` with ``pred_val``."""
        print("Report for test data \n\n", classification_report(y_test, pred_val))

    def predict(self, X):
        """Return the predicted labels for ``X``.

        Raises:
            BankModel.NotFittedError: If ``fit`` has not been called.
        """
        return self._require_fitted().predict(X)

    def __call__(self, X):
        """Shorthand for ``predict(X)``."""
        return self.predict(X)

In [23]:
from sklearn.model_selection import train_test_split

# Load the cleaned training table plus two additional training files.
# NOTE(review): judging by the variable names, Train_2 / Train_1 hold the balances from
# one and two months earlier -- confirm against the cleaning notebook.
data = pd.read_csv("out/cleanedDataClassOccupation.csv")
before_one_month = pd.read_csv("out/cleaned_Train_2.csv")
before_two_months = pd.read_csv("out/cleaned_Train_1.csv")

# Work on a copy so that enabling the optional columns below never mutates `data`.
extended_data = data.copy()

# Optional experiment (currently disabled): append earlier-month balances as extra features.
# Column names must match the ones used for the prediction data.
# extended_data["bal_current_account_one_month"] = before_one_month["bal_current_account"]
# extended_data["bal_current_account_two_month"] = before_two_months["bal_current_account"]
# extended_data["bal_savings_account_one_month"] = before_one_month["bal_savings_account"]
# extended_data["bal_savings_account_two_month"] = before_two_months["bal_savings_account"]

In [24]:

print(extended_data.shape)

# Every column except the label ("target") and the identifier ("client_id") is a feature.
features = extended_data.loc[:, ~extended_data.columns.isin(["target", "client_id"])]

# Stratify on the same label series that is being split, so features, labels and
# stratification always come from one frame.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    extended_data["target"],
    test_size=0.15,
    stratify=extended_data["target"],
    random_state=1,
)

(63697, 39)


### Class Weight
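Class weighting is a crucial part of training here: only 1,913 of the 63,697 rows (about 3%) belong to the positive class, so an unweighted model could reach high accuracy by always predicting class 0.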

In [25]:
from sklearn.utils import compute_class_weight
# The classes are imbalanced, so build a dictionary of class weights to balance them.
# Weighting makes mistakes on the less frequent class more costly, which helps the model
# give those training examples equal attention.
classes = np.unique(y_train)
class_weights_arr = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
print(classes)
print(class_weights_arr)

# model.fit expects a {label: weight} dictionary. The weights are returned in the same order
# as `classes`, so pair them by position rather than indexing the array with the label itself
# (which is only correct when the labels happen to be 0..n-1).
class_weights_dict = dict(zip(classes, class_weights_arr))
print(class_weights_dict)

[0 1]
[ 0.515481   16.64883149]
{0: 0.5154809962678041, 1: 16.648831488314883}


In [26]:
%%time
# Train the class-weighted model, then print its report on the held-out split.
model = BankModel()
model.fit(features=X_train, y_train=y_train, class_weight=class_weights_dict)
model.get_metrics(
    y_test=y_test,
    pred_val=model.get_test_predict(text=X_test),
)

Report for test data 

               precision    recall  f1-score   support

           0       0.99      0.67      0.79      9268
           1       0.06      0.67      0.11       287

    accuracy                           0.67      9555
   macro avg       0.52      0.67      0.45      9555
weighted avg       0.96      0.67      0.77      9555

CPU times: user 1.82 s, sys: 59.1 ms, total: 1.88 s
Wall time: 537 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Score the prediction data with the trained model and write the result file.
# If the relative paths below do not resolve, run `!pwd` to check the working directory
# and adjust the `read_csv` paths accordingly.
prediction_data = pd.read_csv("out/cleanedDataNoClassOccupation.csv")
prediction_before_one_month = pd.read_csv("out/cleaned_Test_2.csv")
prediction_before_two_months = pd.read_csv("out/cleaned_Test_1.csv")

# Copy so the feature frame stays free of the "target" column that is added to
# `prediction_data` further down.
extended_prediction_data = prediction_data.copy()

# Optional experiment (currently disabled): append earlier-month balances as extra features.
# extended_prediction_data["bal_current_account_one_month"] = prediction_before_one_month["bal_current_account"]
# extended_prediction_data["bal_current_account_two_month"] = prediction_before_two_months["bal_current_account"]
# extended_prediction_data["bal_savings_account_one_month"] = prediction_before_one_month["bal_savings_account"]
# extended_prediction_data["bal_savings_account_two_month"] = prediction_before_two_months["bal_savings_account"]

prediction_data_wo_client_id = extended_prediction_data.loc[:, extended_prediction_data.columns != 'client_id']
prediction_data_probs = model.get_test_predict_proba(prediction_data_wo_client_id)

# predict_proba columns follow the order of `classes_`; look up the column of the
# positive class (label 1) instead of assuming it is column 1.
positive_class_idx = list(model.clf.classes_).index(1)
probs = prediction_data_probs[:, positive_class_idx]

prediction_data["target"] = probs
prediction_data[["client_id", "target"]].to_csv(
    'out/predictions.csv', sep=',', encoding='utf-8', index=False, header=False
)

### Upsampling

In [12]:
# NOTE(review): this whole cell is commented out, so `upsampled_data` is not defined on a
# fresh Restart & Run All; the shapes shown below the cell come from an earlier execution.
# Sketch: resample the target == 1 rows with replacement until there are as many of them
# as target == 0 rows, then concatenate both groups.
# from sklearn.utils import resample

# churn_data = data[data["target"] == 1]
# not_churn_data = data[data["target"] == 0]

# print(churn_data.shape)
# print(not_churn_data.shape)
# churn_upsample = resample(churn_data,
#              replace=True,
#              n_samples=len(not_churn_data),
#              random_state=42)
# upsampled_data = pd.concat([not_churn_data, churn_upsample])

(1913, 39)
(61784, 39)


### Downsampling

In [20]:
# NOTE(review): this whole cell is commented out, so `downsampled_data` is not defined on a
# fresh Restart & Run All; the shapes shown below the cell come from an earlier execution.
# Sketch: shrink the target == 0 rows to the number of target == 1 rows, then concatenate.
# NOTE(review): `replace=True` samples the majority rows WITH replacement, so the same row
# can be drawn more than once; `replace=False` would keep the downsampled rows distinct.
# from sklearn.utils import resample

# churn_data = data[data["target"] == 1]
# not_churn_data = data[data["target"] == 0]

# print(churn_data.shape)
# print(not_churn_data.shape)
# not_churn_downsample = resample(not_churn_data,
#              replace=True,
#              n_samples=len(churn_data),
#              random_state=42)
# downsampled_data = pd.concat([not_churn_downsample, churn_data])

(1913, 39)
(61784, 39)


In [19]:
# NOTE(review): disabled experiment -- train and evaluate on the upsampled data. The report
# shown below the cell comes from an earlier execution.
# NOTE(review): the split happens AFTER upsampling with replacement, so copies of the same
# minority row can land in both the training and the test split; resampling only the
# training split after splitting would avoid that.
# NOTE(review): `class_weights_dict` is not recomputed here although the upsampled classes
# are the same size -- presumably this still holds the weights from the imbalanced split;
# confirm before re-enabling.
# %%time

# features = upsampled_data.iloc[:, (upsampled_data.columns != 'target') & (upsampled_data.columns != 'client_id')] # all features 
# X_train, X_test, y_train, y_test = train_test_split(features, upsampled_data["target"] , test_size=0.15, stratify=upsampled_data["target"], random_state=1)

# model = BankModel()
# model.fit(X_train, y_train, class_weight=class_weights_dict)
# model.get_metrics(y_test, model.get_test_predict(X_test))

Report for test data 

               precision    recall  f1-score   support

           0       0.66      0.67      0.66      9268
           1       0.66      0.65      0.66      9268

    accuracy                           0.66     18536
   macro avg       0.66      0.66      0.66     18536
weighted avg       0.66      0.66      0.66     18536

CPU times: user 3.31 s, sys: 102 ms, total: 3.41 s
Wall time: 1.01 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# NOTE(review): disabled experiment -- train and evaluate on the downsampled data. The report
# shown below the cell comes from an earlier execution.
# NOTE(review): `class_weights_dict` is not recomputed here although the downsampled classes
# are the same size -- presumably this still holds the weights from the imbalanced split;
# confirm before re-enabling.
# NOTE(review): if re-enabled, this cell overwrites `X_train`, `X_test`, `y_train`, `y_test`
# and `model` from the class-weight section above.
# %%time

# features = downsampled_data.iloc[:, (downsampled_data.columns != 'target') & (downsampled_data.columns != 'client_id')] # all features 
# X_train, X_test, y_train, y_test = train_test_split(features, downsampled_data["target"] , test_size=0.15, stratify=downsampled_data["target"], random_state=1)

# model = BankModel()
# model.fit(X_train, y_train, class_weight=class_weights_dict)
# model.get_metrics(y_test, model.get_test_predict(X_test))

Report for test data 

               precision    recall  f1-score   support

           0       0.67      0.69      0.68       287
           1       0.68      0.65      0.67       287

    accuracy                           0.67       574
   macro avg       0.67      0.67      0.67       574
weighted avg       0.67      0.67      0.67       574

CPU times: user 217 ms, sys: 18.2 ms, total: 235 ms
Wall time: 78.3 ms
