In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("cleaned_data.csv", index_col=0)

df = df[df["CODE_GENDER"] != "XNA"] # filter out gender = XNA bc only 4 entries
categorical = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"] # change categorical variables to 0, 1 encoding
for col in categorical:
  fst = df[col].iloc[0]
  df[col] = (df[col] == fst).astype(float)

df = df.drop(columns=["OWN_CAR_AGE"]) # drop this because we have the new categorical variables for this info

In [3]:
# take average of external scores, only 172 users have no scores
df["avg_ex_score"] = df[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
df["avg_ex_score"]

0         0.161787
1         0.466757
2         0.642739
3         0.650442
4         0.322738
            ...   
307506    0.413601
307507    0.115992
307508    0.499536
307509    0.587593
307510    0.518984
Name: avg_ex_score, Length: 307507, dtype: float64

In [4]:
df = df.drop(columns=["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"])

In [5]:
# drop entries with a nan
df = df.dropna(axis=0)

In [6]:
# figure out which columns only have one value
for i in df.columns:
    if len(pd.unique(df[i])) <= 1:
        print(i, pd.unique(df[i]))

FLAG_MOBIL [1]
Unknown [0]


In [7]:
df = df.drop(columns=["FLAG_MOBIL", "Unknown"]) #these columns now only have one value so they are useless for predicting --> remove them

In [8]:
# logistic regression
Xtrain = df.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
ytrain = df["TARGET"]

log_reg = sm.Logit(ytrain, Xtrain.astype(float)).fit()

print(log_reg.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix

In [9]:
# linear model 
x = sm.add_constant(Xtrain, prepend=False)
mod = sm.OLS(ytrain, x.astype(float))
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     115.1
Date:                Fri, 28 Apr 2023   Prob (F-statistic):               0.00
Time:                        15:44:45   Log-Likelihood:                -17743.
No. Observations:              264806   AIC:                         3.579e+04
Df Residuals:                  264653   BIC:                         3.740e+04
Df Model:                         152                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
NAME_CONTRACT_

In [10]:
X = Xtrain
y = ytrain
seed = 100
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [11]:
y_pred = model.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 92.30%


In [13]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,avg_ex_score,0.052305
1,NAME_CONTRACT_TYPE,0.016993
2,State servant,0.014364
3,Higher education,0.013812
4,car_owned_less_10,0.013052
...,...,...
155,Services,0.000000
156,Telecom,0.000000
157,Trade: type 5,0.000000
158,University,0.000000


In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     61120
           1       0.46      0.02      0.04      5082

    accuracy                           0.92     66202
   macro avg       0.69      0.51      0.50     66202
weighted avg       0.89      0.92      0.89     66202



In [15]:
f1 = f1_score(y_test, y_pred)
f1

0.041361158112427146

In [16]:
y_pred_adjusted = []
for pred in model.predict_proba(X_test):
    if pred[1] >= 0.1:
        y_pred_adjusted.append(1)
    else:
        y_pred_adjusted.append(0)
print(classification_report(y_test, y_pred_adjusted))
f1_2 = f1_score(y_test, y_pred_adjusted)
print(f1_2)

              precision    recall  f1-score   support

           0       0.96      0.79      0.86     61120
           1       0.18      0.56      0.27      5082

    accuracy                           0.77     66202
   macro avg       0.57      0.67      0.57     66202
weighted avg       0.90      0.77      0.82     66202

0.27017744705208935
