In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

In [2]:
df = pd.read_csv("../data/cleaned_data_ext.csv", index_col=0)
df_orig = df

df = df[df["CODE_GENDER"] != "XNA"] # filter out gender = XNA bc only 4 entries
categorical = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"] # change categorical variables to 0, 1 encoding
for col in categorical:
  fst = df[col].iloc[0]
  df[col] = (df[col] == fst).astype(float)

df = df.drop(columns=["OWN_CAR_AGE"]) # drop this because we have the new categorical variables for this info

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = (df[col] == fst).astype(float)


In [3]:
#impute external scores with mean
ext_one_mean = df["EXT_SOURCE_1"].mean()
ext_two_mean = df["EXT_SOURCE_2"].mean()
ext_three_mean = df["EXT_SOURCE_3"].mean()

df["EXT_SOURCE_1"].fillna(value=ext_one_mean, inplace=True)
df["EXT_SOURCE_2"].fillna(value=ext_two_mean, inplace=True)
df["EXT_SOURCE_3"].fillna(value=ext_three_mean, inplace=True)

In [4]:
# drop entries with a nan
df = df.dropna(axis=0)

In [5]:
# # figure out which columns only have one value
# for i in df.columns:
#     # if len(pd.unique(df[i])) <= 1:
#     print(i, pd.unique(df[i]))

In [6]:
df['car_owned_less_10'] = df['car_owned_less_10'].astype(float)
df['car_owned_10_to_20'] = df['car_owned_10_to_20'].astype(float)
df['car_owned_20_to_30'] = df['car_owned_20_to_30'].astype(float)
df['car_owned_30'] = df['car_owned_30'].astype(float)
df['no_car'] = df['no_car'].astype(float)

In [7]:
df = df.drop(columns=["FLAG_MOBIL", "Unknown"]) #these columns now only have one value so they are useless for predicting --> remove them

In [8]:
#pd.value_counts(df["TARGET"])
not ((df["TARGET"] < 0) & (df["TARGET"] > 1)).any()

True

In [9]:
cols = ['no_car', 'Group of people', 'CNT_CHILDREN', 'Maternity leave', 'Widow', 'Co-op apartment', 'IT staff', 'SUNDAY', 'Industry: type 8', 'Academic degree']
df_tmp = df.drop(cols, axis=1)
# df_tmp['constant'] = np.ones(df.shape[0])
# print(np.linalg.matrix_rank(df_tmp))
# for i in df_tmp.columns:
#     # if len(pd.unique(df[i])) <= 1:
#     x_tmp = df_tmp.drop([i], axis=1)
#     print(i, np.linalg.matrix_rank(x_tmp))

In [10]:
df_tmp

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,Trade: type 4,Trade: type 5,Trade: type 6,Trade: type 7,Transport: type 1,Transport: type 2,Transport: type 3,Transport: type 4,University,XNA
0,100002,1,1.0,1.0,1.0,1.0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,1.0,0.0,1.0,0.0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,0.0,1.0,0.0,1.0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,1.0,1.0,1.0,1.0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0,0,0,0,0,0
5,100008,0,1.0,1.0,1.0,1.0,99000.0,490495.5,27517.5,454500.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307503,456247,0,1.0,0.0,1.0,1.0,112500.0,345510.0,17770.5,247500.0,...,0,0,0,0,0,0,0,0,0,0
307505,456249,0,1.0,0.0,1.0,1.0,112500.0,225000.0,22050.0,225000.0,...,0,0,0,0,0,0,0,0,0,1
307508,456253,0,1.0,0.0,1.0,1.0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,0,0,0,0,0,0
307509,456254,1,1.0,0.0,1.0,1.0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,Trade: type 4,Trade: type 5,Trade: type 6,Trade: type 7,Transport: type 1,Transport: type 2,Transport: type 3,Transport: type 4,University,XNA
0,100002,1,1.0,1.0,1.0,1.0,0,202500.0,406597.5,24700.5,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,1.0,0.0,1.0,0.0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,0.0,1.0,0.0,1.0,0,67500.0,135000.0,6750.0,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,1.0,1.0,1.0,1.0,0,121500.0,513000.0,21865.5,...,0,0,0,0,0,0,0,0,0,0
5,100008,0,1.0,1.0,1.0,1.0,0,99000.0,490495.5,27517.5,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# normalize columns that are not one hot
for i in df_tmp.columns[1:]:
    if set(pd.unique(df_tmp[i])) != {0, 1}: #0 or 1
      df_tmp[i] = (df_tmp[i] - df_tmp[i].min()) / (df_tmp[i].max() - df_tmp[i].min())
df_tmp.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,Trade: type 4,Trade: type 5,Trade: type 6,Trade: type 7,Transport: type 1,Transport: type 2,Transport: type 3,Transport: type 4,University,XNA
0,100002,1,1.0,1.0,1.0,1.0,0.001508,0.090287,0.090032,0.077441,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,1.0,0.0,1.0,0.0,0.002085,0.311736,0.132924,0.271605,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,0.0,1.0,0.0,1.0,0.000354,0.022472,0.020025,0.023569,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,1.0,1.0,1.0,1.0,0.000816,0.116854,0.078975,0.117845,...,0,0,0,0,0,0,0,0,0,0
5,100008,0,1.0,1.0,1.0,1.0,0.000623,0.111235,0.101018,0.103255,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_tmp.to_csv('cleaned_data_ext_norm.csv')

In [27]:
# logistic regression
Xtrain = df_tmp.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
Xtrain = sm.add_constant(Xtrain, prepend=False)
ytrain = df_tmp["TARGET"]

log_reg = sm.Logit(ytrain, Xtrain.astype(float)).fit()

print(log_reg.summary())

         Current function value: 0.240917
         Iterations: 35




                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:               264894
Model:                          Logit   Df Residuals:                   264741
Method:                           MLE   Df Model:                          152
Date:                Tue, 02 May 2023   Pseudo R-squ.:                  0.1151
Time:                        22:39:05   Log-Likelihood:                -63817.
converged:                      False   LL-Null:                       -72118.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
NAME_CONTRACT_TYPE                0.3050      0.033      9.168      0.000       0.240       0.370
CODE_GENDER                       0.2787      0.020     13.974      0.000 

In [28]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)

Xtrain = df_tmp.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
# Xtrain = sm.add_constant(Xtrain, prepend=False)
ytrain = df_tmp["TARGET"]

logreg.fit(Xtrain, ytrain)

LogisticRegression(class_weight='balanced', max_iter=10000)

In [29]:
#logreg.score(Xtrain, ytrain)
f1 = f1_score(ytrain, logreg.predict(Xtrain))
f1

0.17506004708642817

In [30]:
print(classification_report(ytrain, logreg.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.94      0.61      0.74    244404
           1       0.10      0.54      0.18     20490

    accuracy                           0.61    264894
   macro avg       0.52      0.58      0.46    264894
weighted avg       0.88      0.61      0.70    264894



In [31]:
# linear model 
x = sm.add_constant(Xtrain, prepend=False)
mod = sm.OLS(ytrain, x.astype(float))
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.064
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     119.5
Date:                Tue, 02 May 2023   Prob (F-statistic):               0.00
Time:                        22:39:34   Log-Likelihood:                -17434.
No. Observations:              264894   AIC:                         3.517e+04
Df Residuals:                  264741   BIC:                         3.678e+04
Df Model:                         152                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
NAME_CONTRACT_

In [32]:
X = Xtrain
y = ytrain
seed = 100
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

# xgb_classifier.fit(X, y, sample_weight=classes_weights)


model = XGBClassifier(base_score=np.sum(y_train == 0)/len(y_train), max_depth=10)
model.fit(X_train, y_train, sample_weight=classes_weights)

XGBClassifier(base_score=0.9222982835858459, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [33]:
y_pred = model.predict(X_test)

In [34]:
accuracy = accuracy_score(y_train, model.predict(X_train))
print("Train Accuracy: %.2f%%" % (accuracy * 100.0))

Train Accuracy: 91.46%


In [35]:
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: %.2f%%" % (accuracy * 100.0))

Test Accuracy: 83.10%


In [36]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,EXT_SOURCE_3,0.022662
1,CODE_GENDER,0.018162
2,EXT_SOURCE_2,0.016051
3,Higher education,0.014845
4,Emergency,0.014568
...,...,...
147,Pensioner,0.000000
148,Student,0.000000
149,Unemployed,0.000000
150,Industry: type 13,0.000000


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.87      0.90     61171
           1       0.20      0.39      0.26      5053

    accuracy                           0.83     66224
   macro avg       0.57      0.63      0.58     66224
weighted avg       0.89      0.83      0.86     66224



In [38]:
f1_score(y_train, model.predict(X_train), sample_weight=classes_weights)

0.9447229189235699

In [39]:
classes_weights_test = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_test
)
f1 = f1_score(y_test, y_pred, sample_weight=classes_weights_test)
f1

0.5127225402960816

In [40]:
y_pred_adjusted = []
for pred in model.predict_proba(X_test):
    if pred[1] >= 0.1:
        y_pred_adjusted.append(1)
    else:
        y_pred_adjusted.append(0)
print(classification_report(y_test, y_pred_adjusted))
f1_2 = f1_score(y_test, y_pred_adjusted)
print(f1_2)

              precision    recall  f1-score   support

           0       0.97      0.41      0.57     61171
           1       0.10      0.83      0.18      5053

    accuracy                           0.44     66224
   macro avg       0.54      0.62      0.38     66224
weighted avg       0.90      0.44      0.54     66224

0.18493511048754824


In [41]:
model.best_iteration

99

In [42]:
df_tmp.to_csv('clean2_data.csv')