<a href="https://colab.research.google.com/github/brunosavoca/ml-finalproject-um/blob/main/MAS_651_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score


In [None]:
# Load the two input tables from CSV files referenced relative to the working directory.
app = pd.read_csv('card_application_record.csv')
credit = pd.read_csv('card_credit_record.csv')

In [None]:
# Join the application and credit tables on their shared ID column
# (default inner join: only IDs present in both tables are kept).
df = app.merge(credit, on='ID')

In [None]:
# (rows, columns) of the merged frame
df.shape

In [None]:
# ID was the join key for the merge and is not needed afterwards; remove it
df = df.drop(columns=['ID'])

In [None]:
# preview the first 10 rows
df.head(10)

In [None]:
 # checking for NaN: number of missing values in each column
 df.isna().sum()

In [None]:
# OCCUPATION_TYPE has too many NaN values to be significant, so discard the column
df = df.drop(columns=['OCCUPATION_TYPE'])


In [None]:
 # re-check the missing-value counts after dropping OCCUPATION_TYPE
 df.isna().sum()

In [None]:
# list the remaining column names
df.columns


In [None]:
# frequency of each raw STATUS value
df['STATUS'].value_counts()

In [None]:
# Redefine df with only the columns used in the rest of the analysis.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
df = df[['CODE_GENDER', 'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'STATUS']].copy()


In [None]:
# preview the last 10 rows of the reduced frame
df.tail(10)

In [None]:
# frequency of each education category
df['NAME_EDUCATION_TYPE'].value_counts()

In [None]:
# Numeric code for each education category.
# NOTE(review): 'Academic degree' receives the lowest code (1) — confirm this
# ordering is intended, because EDUCATION_NUM enters the score as a magnitude.
edu_map = {'Academic degree': 1, 'Lower secondary': 2, 'Secondary / secondary special': 3, 'Incomplete higher': 4, 'Higher education': 5}

# Series.map performs an explicit per-value lookup and yields a numeric column
# directly (no reliance on replace()'s deprecated implicit downcasting).
# A category missing from edu_map becomes NaN instead of staying a string.
df['EDUCATION_NUM'] = df['NAME_EDUCATION_TYPE'].map(edu_map)

In [None]:
# frequency of each gender code
df['CODE_GENDER'].value_counts()

In [None]:
# Binary code for gender: female = 1, male = 0.
gender_map = {'F': 1, 'M': 0}

# Series.map performs an explicit per-value lookup and yields a numeric column
# directly; a code missing from gender_map becomes NaN instead of staying a string.
df['GENDER_NUM'] = df['CODE_GENDER'].map(gender_map)

In [None]:
# frequency of each STATUS value before 'C' and 'X' are recoded
df['STATUS'].value_counts()

In [None]:
# Numeric codes for the two non-numeric STATUS values.
status_map = {'C': 6, 'X': 7}

# replace() rewrites only 'C' and 'X'; every other STATUS value is left as it is
# here and is converted by the astype(int) cast in a later cell.
df['STATUS'] = df['STATUS'].replace(status_map)

In [None]:
# Keep only the numeric columns. .copy() makes the result an independent frame,
# so the dtype casts assigned in later cells do not trigger SettingWithCopyWarning.
df = df[['GENDER_NUM','AMT_INCOME_TOTAL', 'EDUCATION_NUM', 'STATUS']].copy()

In [None]:
# preview the numeric-only frame
df.head()

## Defining threshold

In [None]:
# columns available for the scoring step
df.columns

In [None]:
# Cast every column to integer, one column at a time.
for int_column in ['GENDER_NUM', 'AMT_INCOME_TOTAL', 'EDUCATION_NUM', 'STATUS']:
    df[int_column] = df[int_column].astype(int)

In [None]:
# frequency of each STATUS code after the integer cast
df['STATUS'].value_counts()

In [None]:
# convert the 'AMT_INCOME_TOTAL' and 'EDUCATION_NUM' columns to float
for float_column in ('AMT_INCOME_TOTAL', 'EDUCATION_NUM'):
    df[float_column] = df[float_column].astype(float)

# Reference values each input is measured against when scoring.
income_threshold = 70000
edu_threshold = 4
status_threshold = 5


def score_row(row):
    """Return the mean of three threshold-relative components for one record.

    Parameters
    ----------
    row : mapping-like
        Must provide 'AMT_INCOME_TOTAL', 'EDUCATION_NUM' and 'STATUS'.

    Returns
    -------
    The average of:
      * income relative to ``income_threshold`` (positive above it),
      * education code relative to ``edu_threshold`` (positive above it),
      * status relative to ``status_threshold`` (positive BELOW it).

    NOTE(review): STATUS values greater than ``status_threshold`` give a
    negative status component; the notebook recodes 'C' and 'X' to 6 and 7,
    so those rows are penalised — confirm this is intended.
    """
    components = (
        (row['AMT_INCOME_TOTAL'] - income_threshold) / income_threshold,
        (row['EDUCATION_NUM'] - edu_threshold) / edu_threshold,
        (status_threshold - row['STATUS']) / status_threshold,
    )
    relative_income, relative_education, relative_status = components
    return (relative_income + relative_education + relative_status) / 3
# Compute SCORE for every row in one vectorised call: score_row only uses
# column look-ups and arithmetic, so passing the whole frame performs the same
# per-row calculation without a Python-level call per row (df.apply(axis=1)).
df['SCORE'] = score_row(df)

In [None]:
# preview the first rows with the new SCORE column
df.head(3)

In [None]:
# summary statistics of every column, including SCORE
df.describe()

*Scores* above 0.8 represent 'good clients', while scores of 0.8 or below represent 'bad clients'.

In [None]:
# Cut-off on SCORE: strictly above it counts as approved.
threshold_initial = 0.8
# Vectorised comparison instead of a per-element lambda:
# 1 where SCORE > threshold_initial, otherwise 0.
df['APPROVED'] = (df['SCORE'] > threshold_initial).astype(int)

In [None]:
# 1 good client (approved), 0 bad client (not approved)
sns.countplot(x='APPROVED',data=df, palette='BuGn')
# Save BEFORE plt.show(): once the figure has been shown it is finalised, so a
# savefig() issued afterwards writes an empty image.
plt.savefig('count_plot')
plt.show()


In [None]:
# 0 = Male, 1 = Female; 0 = not approved, 1 = approved
# Counts per (gender, approval) pair, then each row rescaled to shares of its total.
table = pd.crosstab(df['GENDER_NUM'], df['APPROVED'])
row_totals = table.sum(axis=1).astype(float)
shares = table.div(row_totals, axis=0)

ax = shares.plot(kind='bar', stacked=True, color=['teal', 'turquoise'])
ax.set_title('Stacked Bar Chart of Gender vs Approval Status')
ax.set_xlabel('Gender')
ax.set_ylabel('Approval Status')
ax.figure.savefig('Gender_Approval')

In [None]:
# 'Academic degree': 1, 'Lower secondary': 2, 'Secondary / secondary special': 3, 'Incomplete higher': 4, 'Higher education': 5; 0 = not approved, 1 = approved
# Counts per (education, approval) pair, then each row rescaled to shares of its total.
table = pd.crosstab(df['EDUCATION_NUM'], df['APPROVED'])
row_totals = table.sum(axis=1).astype(float)
shares = table.div(row_totals, axis=0)

ax = shares.plot(kind='bar', stacked=True, color=['teal', 'turquoise'])
ax.set_title('Stacked Bar Chart of Education Level vs Approval Status')
ax.set_xlabel('Education Level')
ax.set_ylabel('Approval Status')
ax.figure.savefig('Education_Approval')

In [None]:
# 0: 1-29 days past due, 1: 30-59 days past due, 2: 60-89 days overdue, 3: 90-119 days overdue, 4: 120-149 days overdue, 
# 5: Overdue or bad debts, write-offs for more than 150 days, 6: paid off that month, 7: No loan for the month

# Counts per (status, approval) pair, then each row rescaled to shares of its total.
table = pd.crosstab(df['STATUS'], df['APPROVED'])
row_totals = table.sum(axis=1).astype(float)
shares = table.div(row_totals, axis=0)

ax = shares.plot(kind='bar', stacked=True, color=['teal', 'turquoise'])
ax.set_title('Stacked Bar Chart of Status vs Approval Status')
ax.set_xlabel('Status')
ax.set_ylabel('Approval Status')
ax.figure.savefig('Status_Approval')

## MODEL SELECTION

**Logistic**

In [None]:
# APPROVED is the target; every other column is a feature.
# NOTE(review): the features still include SCORE, the column APPROVED was
# thresholded from — confirm that is intended.
target_column = 'APPROVED'
X = df.drop(columns=[target_column])
y = df[target_column]

# 80/20 train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# predicted class labels for the held-out test split
y_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

**XGB**


In [None]:
import numpy as np

# Build the PRE_APPROVED label from SCORE (SCORE itself is left unchanged):
# a row is pre-approved when its score exceeds `threshold` AND it passes a
# random draw that succeeds about 35% of the time.
threshold = 1 # threshold value can be adjusted based on your data

# Seeded generator so the labels, and every metric computed from them, are the
# same on each Restart & Run All. The unseeded np.random.rand() call produced a
# different label column on every run.
rng = np.random.default_rng(40)
above_threshold = df['SCORE'] > threshold
random_pass = rng.random(len(df)) > 0.65
df['PRE_APPROVED'] = (above_threshold & random_pass).astype(int)


In [None]:
# preview the first rows with the new PRE_APPROVED column
df.head(3)

In [None]:
# create dummy variables
# NOTE(review): encoded_data is only previewed here; the cells that follow keep using df.
encoded_data = pd.get_dummies(df)
encoded_data.head()

In [None]:
# PRE_APPROVED is the label; every other column is a feature.
# NOTE(review): the features still include SCORE (which PRE_APPROVED is derived
# from) and APPROVED — confirm that is intended.
label_column = 'PRE_APPROVED'
y = df[label_column]
X = df.drop(columns=[label_column])
# 80/20 train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)


In [None]:
import xgboost as xgb

# Early stopping and the evaluation metrics are estimator settings: current
# xgboost releases no longer accept them as fit() arguments, so they are
# passed to the constructor instead.
model = xgb.XGBClassifier(max_depth=4,
                        subsample=0.9,
                        objective='binary:logistic',
                        n_estimators=200,
                        learning_rate = 0.1,
                        eval_metric=["error", "logloss"],
                        early_stopping_rounds=10)

# Metrics are reported for both sets; early stopping monitors the last entry of
# eval_set (the test split) and the last metric listed (logloss).
eval_set = [(X_train, y_train), (X_test, y_test)]

model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

# Switch early stopping off again so that a later plain model.fit(X, y) call
# without an eval_set remains valid.
model.set_params(early_stopping_rounds=None)

In [None]:

# make predictions for test data
y_pred = model.predict(X_test)
# round each prediction to the nearest integer and collect the results in a list
predictions = [round(value) for value in y_pred]

In [None]:
# share of test rows whose rounded prediction matches the true label
# (this reuses, and so overwrites, the `accuracy` variable)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Bar chart comparing the accuracy of the two classifiers.
models = ['Logistic Regression', 'XGBoost Classifier']
# NOTE(review): these values are typed in by hand, not read from the `accuracy`
# results computed in the cells above — re-check them whenever the models are re-run.
accuracies = [0.8655, 0.9385]

# one bar colour per model
c = ['Green', 'Lightgreen']

plt.bar(models, accuracies, color = c)
plt.title('Accuracy Results')

In [None]:
# fit the model on the training data
model.fit(X_train, y_train)

# importance score of each feature, in the column order of X_train
importance = model.feature_importances_
features = X_train.columns

# Keep the ranking in its own variable: reusing the name `df` here would
# replace the modelling dataset and break any earlier cell that is re-run.
feature_importance = (
    pd.DataFrame({'features': features, 'importance': importance})
    .sort_values(by='importance', ascending=False)
)

# print the top 10 most influential variables
print(feature_importance.head(10))

In [None]:
!pip install lazypredict


In [None]:
# Import libraries
import pandas as pd
from tabulate import tabulate
from lazypredict.Supervised import LazyRegressor


# Initialize and fit the LazyRegressor model
#reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
#models, predictions = reg. fit(X_train, X_test, y_train, y_test)
# Print models
#print(tabulate(models, headers='keys', tablefmt='psql'))