In [16]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

In [3]:
df = pd.read_csv("cleaned_data.csv", index_col=0)

df = df[df["CODE_GENDER"] != "XNA"] # filter out gender = XNA bc only 4 entries
categorical = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"] # change categorical variables to 0, 1 encoding
for col in categorical:
  fst = df[col].iloc[0]
  df[col] = (df[col] == fst).astype(float)

df = df.drop(columns=["OWN_CAR_AGE"]) # drop this because we have the new categorical variables for this info

In [4]:
# drop entries with a nan
df = df.dropna(axis=0)

In [5]:
# figure out which columns only have one value
for i in df.columns:
    # if len(pd.unique(df[i])) <= 1:
    print(i, pd.unique(df[i]))

SK_ID_CURR [100002 100003 100004 ... 456253 456254 456255]
TARGET [1 0]
NAME_CONTRACT_TYPE [1. 0.]
CODE_GENDER [1. 0.]
FLAG_OWN_CAR [1. 0.]
FLAG_OWN_REALTY [1. 0.]
CNT_CHILDREN [ 0  1  2  3  4  7  5  6  8  9 11 12 10 19 14]
AMT_INCOME_TOTAL [202500.  270000.   67500.  ... 936000.   96768.  113062.5]
AMT_CREDIT [ 406597.5 1293502.5  135000.  ... 1149408.   743863.5 1391130. ]
AMT_ANNUITY [24700.5 35698.5  6750.  ... 59544.  58770.  77809.5]
AMT_GOODS_PRICE [ 351000.  1129500.   135000.   513000.   454500.  1395000.  1530000.
  913500.   652500.    67500.   697500.   247500.   387000.   157500.
  927000.   450000.   225000.   702000.   270000.   675000.   477000.
  360000.   180000.   679500.   553500.   540000.   855000.   238500.
  252000.  1350000.   666000.   202500.   517500.  1588500.  1080000.
  810000.    90000.   315000.   900000.  1035000.   463500.   459000.
 1125000.   256500.   585000.   643500.   472500.   337500.   720000.
  396000.  1089000.   319500.  1435500.   688500. 

In [6]:
df['car_owned_less_10'] = df['car_owned_less_10'].astype(float)
df['car_owned_10_to_20'] = df['car_owned_10_to_20'].astype(float)
df['car_owned_20_to_30'] = df['car_owned_20_to_30'].astype(float)
df['car_owned_30'] = df['car_owned_30'].astype(float)
df['no_car'] = df['no_car'].astype(float)

In [7]:
df = df.drop(columns=["FLAG_MOBIL", "Unknown"]) #these columns now only have one value so they are useless for predicting --> remove them

In [8]:
#pd.value_counts(df["TARGET"])
not ((df["TARGET"] < 0) & (df["TARGET"] > 1)).any()

True

In [9]:
cols = ['no_car', 'Group of people', 'CNT_CHILDREN', 'Maternity leave', 'Widow', 'Co-op apartment', 'IT staff', 'SUNDAY', 'Industry: type 8', 'Academic degree']
df_tmp = df.drop(cols, axis=1)
# df_tmp['constant'] = np.ones(df.shape[0])
# print(np.linalg.matrix_rank(df_tmp))
# for i in df_tmp.columns:
#     # if len(pd.unique(df[i])) <= 1:
#     x_tmp = df_tmp.drop([i], axis=1)
#     print(i, np.linalg.matrix_rank(x_tmp))

In [10]:
# logistic regression
Xtrain = df_tmp.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
Xtrain = sm.add_constant(Xtrain, prepend=False)
ytrain = df_tmp["TARGET"]

log_reg = sm.Logit(ytrain, Xtrain.astype(float)).fit()

print(log_reg.summary())

         Current function value: 0.256978
         Iterations: 35




                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:               264894
Model:                          Logit   Df Residuals:                   264744
Method:                           MLE   Df Model:                          149
Date:                Sun, 30 Apr 2023   Pseudo R-squ.:                 0.05610
Time:                        21:04:46   Log-Likelihood:                -68072.
converged:                      False   LL-Null:                       -72118.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
NAME_CONTRACT_TYPE                0.3562      0.033     10.930      0.000       0.292       0.420
CODE_GENDER                       0.3082      0.019     15.818      0.000 

In [12]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)

Xtrain = df_tmp.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
# Xtrain = sm.add_constant(Xtrain, prepend=False)
ytrain = df_tmp["TARGET"]

logreg.fit(Xtrain, ytrain)

In [13]:
#logreg.score(Xtrain, ytrain)
f1 = f1_score(ytrain, logreg.predict(Xtrain))
f1

0.175071737717393

In [15]:
print(classification_report(ytrain, logreg.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.94      0.61      0.74    244404
           1       0.10      0.54      0.18     20490

    accuracy                           0.61    264894
   macro avg       0.52      0.58      0.46    264894
weighted avg       0.88      0.61      0.70    264894



In [7]:
# linear model 
x = sm.add_constant(Xtrain, prepend=False)
mod = sm.OLS(ytrain, x.astype(float))
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     54.32
Date:                Thu, 27 Apr 2023   Prob (F-statistic):               0.00
Time:                        23:34:08   Log-Likelihood:                -22180.
No. Observations:              264894   AIC:                         4.466e+04
Df Residuals:                  264742   BIC:                         4.626e+04
Df Model:                         151                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
NAME_CONTRACT_

In [87]:
X = Xtrain
y = ytrain
seed = 100
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

# xgb_classifier.fit(X, y, sample_weight=classes_weights)


model = XGBClassifier(base_score=np.sum(y_train == 0)/len(y_train), max_depth=10)
model.fit(X_train, y_train, sample_weight=classes_weights)

In [88]:
y_pred = model.predict(X_test)

In [89]:
accuracy = accuracy_score(y_train, model.predict(X_train))
print("Train Accuracy: %.2f%%" % (accuracy * 100.0))

Train Accuracy: 90.06%


In [90]:
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: %.2f%%" % (accuracy * 100.0))

Test Accuracy: 80.65%


In [92]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,Higher education,0.052926
1,car_owned_less_10,0.025108
2,REGION_RATING_CLIENT_W_CITY,0.020899
3,CODE_GENDER,0.019918
4,NAME_CONTRACT_TYPE,0.015859
...,...,...
144,Pensioner,0.000000
145,Unemployed,0.000000
146,Industry: type 13,0.000000
147,Industry: type 6,0.000000


In [93]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89     61171
           1       0.15      0.31      0.20      5053

    accuracy                           0.81     66224
   macro avg       0.54      0.58      0.54     66224
weighted avg       0.88      0.81      0.84     66224



In [98]:
f1_score(y_train, model.predict(X_train), sample_weight=classes_weights)

0.9369179784775752

In [99]:
classes_weights_test = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_test
)
f1 = f1_score(y_test, y_pred, sample_weight=classes_weights_test)
f1

0.42801593279823114

In [95]:
y_pred_adjusted = []
for pred in model.predict_proba(X_test):
    if pred[1] >= 0.1:
        y_pred_adjusted.append(1)
    else:
        y_pred_adjusted.append(0)
print(classification_report(y_test, y_pred_adjusted))
f1_2 = f1_score(y_test, y_pred_adjusted)
print(f1_2)

              precision    recall  f1-score   support

           0       0.96      0.31      0.47     61171
           1       0.09      0.84      0.16      5053

    accuracy                           0.35     66224
   macro avg       0.53      0.57      0.32     66224
weighted avg       0.89      0.35      0.45     66224

0.16464740602671638


In [79]:
model.best_iteration

99

In [101]:
df_tmp.to_csv('clean2_data.csv')