In [None]:
%pip install scikit-learn

## Importing Necessary Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

## Reading Dataset

In [None]:
# Path is relative, so the CSV is resolved against the kernel's working directory.
DATASET_PATH = 'Visa_For_Lisa_Loan_Modelling.csv'
data = pd.read_csv(DATASET_PATH)

## Dataset

In [None]:
# Remove the ID column by reassignment instead of in place; errors='ignore'
# keeps this cell re-runnable after the column has already been dropped
# (the in-place version raised KeyError on a second run).
data = data.drop(columns='ID', errors='ignore')

In [None]:
# Bare expression: the DataFrame is rendered as this cell's output.
data

## Statistics of dataset

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
data.info()
# Leave the summary statistics as the last expression so the notebook renders
# them as a table instead of plain text.
data.describe()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

## Visualizations

In [None]:
# Pairwise column correlations drawn as an annotated 800x800 heatmap.
correlation_matrix = data.corr()
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    width=800,
    height=800,
)
fig.update_layout(title='Correlation').show()

In [None]:
# When `y` is supplied, px.histogram aggregates it with a sum inside each x
# bin, so the bars are the total Mortgage per age rather than row counts.
# The y-axis title is set to match what is actually plotted.
fig = px.histogram(data, x='Age', y='Mortgage', barmode='group', text_auto=True)
fig.update_layout(
    title='Distribution of mortgage according to ages',
    height=400,
    width=1200,
    xaxis_title='Ages',
    yaxis_title='Total Mortgage',
)
fig.show()

In [None]:
# Split Education into three equal-width intervals and count the rows that
# fall in each interval, ordered from the lowest interval to the highest.
education_bins = pd.cut(data['Education'], bins=3)
counts = education_bins.value_counts().sort_index()
# NOTE(review): the names below are attached positionally to the ascending
# intervals -- confirm this order against the dataset's Education coding.
legend_names = ['Graduate', 'High School', 'Professional Degree']
fig = px.pie(
    values=counts.values,
    names=legend_names,
    title='Education Distribution',
)
fig.show()

In [None]:
# Two equal-width intervals over Personal Loan, counted in ascending order;
# the 'No'/'Yes' names are applied to those two intervals by position.
loan = pd.cut(data['Personal Loan'], bins=2)
counts = loan.value_counts().sort_index()
legend_names = ['No', 'Yes']
fig = px.bar(
    x=legend_names,
    y=counts.values,
    color=legend_names,
    text_auto=True,
    title='Presence of Personal Loans',
)
fig.update_layout(xaxis_title='Personal Loan', yaxis_title='Counts').show()

In [None]:
# Count rows flagged 1 and rows flagged 0 in CreditCard, then plot the split.
credit_card_yes = (data['CreditCard'] == 1).sum()
credit_card_not = (data['CreditCard'] == 0).sum()
fig = px.pie(
    values=[credit_card_yes, credit_card_not],
    names=['Yes', 'No'],
    title='Distribution of Credit Card Usage',
)
fig.show()

In [None]:
# Two equal-width intervals over Securities Account, counted in ascending
# order; the 'No'/'Yes' names are applied to those intervals by position.
securities_bins = pd.cut(data['Securities Account'], bins=2)
counts = securities_bins.value_counts().sort_index()
legend_names = ['No', 'Yes']
fig = px.bar(x=counts.values, y=legend_names, title='Distribution of Securities Account', color=legend_names, text_auto=True, orientation='h')
# The bars are horizontal: the counts run along x and the categories along y,
# so the axis titles must follow that orientation (they were swapped before).
fig.update_layout(xaxis_title='Counts', yaxis_title='Securities Account')
fig.show()

In [None]:
# Sunburst of the 30 largest CCAvg values inside each Family category.
#
# Everything is derived from ONE row-aligned frame. The previous version took
# `values` from the 120 largest CCAvg rows but `path`/`color` from the first
# 120 unsorted rows, so each value was attached to an unrelated row, and it
# also overwrote Family with a 120-row slice (NaN everywhere else).
family_labels = {
    1: 'Family(1 person)',
    2: 'Family(2 people)',
    3: 'Family(3 people)',
    4: 'Family(4 people)',
}
top_ccavg_per_family = (
    data.assign(Family=data['Family'].replace(family_labels).astype(str))
    .sort_values('CCAvg', ascending=False)
    .groupby('Family')
    .head(30)  # keeps the first 30 rows of each family in the sorted order
    .assign(CCAvg_label=lambda frame: frame['CCAvg'].astype(str))
)

fig = px.sunburst(
    top_ccavg_per_family,
    path=['Family', 'CCAvg_label'],
    values='CCAvg',
    color='CCAvg_label',
)
fig.update_layout(title_text='Chart of Top 30 CCAvg Values for Each Family Category')
fig.show()

In [None]:
# Mean Income per (CreditCard, Experience band).
bins = [0, 5, 10, 20, 30, float('inf')]
labels = ['Less than 5', 'Between 5 and 10', 'Between 10 and 20', 'Between 20 and 30', 'More than 30']
data2 = data.copy()
# right=False makes every band left-closed: [0, 5), [5, 10), ..., [30, inf).
# With the default right-closed bands, Experience == 0 fell outside (0, 5] and
# was silently dropped, and a value of exactly 5 was filed under 'Less than 5'.
# Values below 0, if the data contains any, still fall outside every band.
data2['Experience'] = pd.cut(data['Experience'], bins=bins, labels=labels, right=False)
data2 = data2.groupby(['CreditCard', 'Experience'])['Income'].mean().reset_index()

# Each (band, CreditCard) pair is a single row here, so the histogram's
# per-bin aggregation of y leaves the pre-computed mean unchanged.
fig = px.histogram(data2, x='Experience', y='Income', color='CreditCard', barmode='group', text_auto=True)
fig.update_layout(title='Statistics of clients by Experience and Credit Card', yaxis_title='Income')
fig.show()

In [None]:
# Work on a copy whose ZIP Code column holds strings.
data3 = data.assign(**{'ZIP Code': data['ZIP Code'].astype(str)})
# Rows per ZIP code, largest first, limited to the 30 most frequent codes.
zip_counts_sorted = (
    data3.groupby('ZIP Code')
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
    .head(30)
)
fig = px.bar(
    zip_counts_sorted,
    x='ZIP Code',
    y='Count',
    color='ZIP Code',
    text_auto=True,
)
fig.update_layout(title='Number of clients by Zip Code location').show()

## Machine Learning

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for testing with a fixed seed so the split is repeatable.
target_column = 'Personal Loan'
y = data[target_column]
X = data.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def fit_model(model, cv=10):
    """Fit ``model`` on the training split and print an evaluation report.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    produced by the train/test split cell.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place.
    cv : int, default 10
        Value forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    None
        The report is printed and the confusion matrix is drawn with
        matplotlib; nothing is returned.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=cv))

    # Percent formatting rounds to two decimals. The earlier int(x * 100)
    # truncated (0.989 was shown as 98%) and round(x, 3) * 100 leaked float
    # artefacts such as 5.800000000000001%.
    print('Model name: ', str(model))
    print(f'Accuracy Score: {accuracy:.2%}')
    print(f'Mean Squared Error: {mse:.2%}')
    print(f'Cross Validation Score: {cv_score:.2%}')
    print('Confusion Matrix:')

    # Use the fitted estimator's class labels for the matrix rows/columns and
    # tick labels when available; None falls back to the library defaults.
    class_labels = getattr(model, 'classes_', None)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot()
    plt.show()

## Logistic Regression

In [None]:
# Logistic regression is fitted by an iterative solver that is sensitive to
# feature scale, so standardise the features first. Keeping the scaler inside
# the pipeline means it is fitted on training rows only, including within each
# cross-validation fold run by fit_model.
model = make_pipeline(StandardScaler(), LogisticRegression())
fit_model(model)

## Decision Tree Classifier

In [None]:
# Fix the seed so the fitted tree, and therefore the printed scores, are the
# same on every run; 42 matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=42)
fit_model(model)

## Random Forest Classifier

In [None]:
# Random forests draw bootstrap samples and feature subsets at random; seed the
# estimator so the reported scores are reproducible across runs (42 matches the
# seed used for the train/test split).
model = RandomForestClassifier(random_state=42)
fit_model(model)

## SVC

In [None]:
# Support vector machines are not scale invariant, so standardise the features
# before the classifier. The scaler lives in the pipeline so it is fitted on
# training rows only, including within each cross-validation fold.
model = make_pipeline(StandardScaler(), SVC())
fit_model(model)

## KNeighborsClassifier

In [None]:
# k-nearest neighbours ranks points by distance, so a feature with a large
# numeric range would dominate unless the features are standardised first.
# The scaler lives in the pipeline so it is fitted on training rows only.
model = make_pipeline(StandardScaler(), KNeighborsClassifier())
fit_model(model)

## Linear Discriminant Analysis

In [None]:
# Linear discriminant analysis with library defaults, evaluated by fit_model.
model = LinearDiscriminantAnalysis()
fit_model(model=model)

## GaussianNB

In [None]:
# Gaussian naive Bayes with library defaults, evaluated by fit_model.
model = GaussianNB()
fit_model(model=model)