In [25]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 

In [152]:
def load_dataset(url):
    return pd.read_csv(url)

df = load_dataset('Visa_For_Lisa_Loan_Modelling.csv')

In [27]:
def summarize_dataset(df):
    print(f"Dataset's shape: {df.shape} \n\nDataset's summary:")
    df.info()
    print(f"First 10 rows of the dataset \n{df.head(10)}")
summarize_dataset(df)

Dataset's shape: (5000, 14) 

Dataset's summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
First 1

In [28]:
def clean_dataset(df):
    try:
        df.drop('ID', axis=1, inplace=True)
    except:
        pass
    df['Experience'] = df['Experience'].abs()

    return df
df = clean_dataset(df)

In [29]:
def corr(df: pd.core.frame.DataFrame):
    # corrilation
    fig = px.imshow(df.corr().abs(), text_auto=True)
    fig.update_layout(height=900, width=900)
    return fig.show()

In [30]:
def exp_card(df):
    bins = [0, 5, 10, 20, 30, float('inf')]
    labels = ['Less than 5', 'Between 5 and 10', 'Between 10 and 20', 'Between 20 and 30', 'More 30']
    df2 = df.copy()
    df2['Experience'] = pd.cut(df['Experience'], bins=bins, labels=labels)
    df2 = df2.groupby(['CreditCard', 'Experience'])['Income'].mean().reset_index()
    
    fig = px.histogram(df2, x='Experience', y='Income', color='CreditCard', barmode='group', text_auto=True)
    fig.update_layout(title='Statistics of clients by Experience and Credit Card', yaxis_title='Income')
    return fig.show()

In [31]:
def zip(df):
    df2 = df['ZIP Code'].value_counts().reset_index().head(30)
    fig = px.bar(df2, x=df2['ZIP Code'].astype(str), y='count', color='count', text_auto=True)
    fig.update_layout(title='Number of clients by Zip Code location', height=600, width=1200)
    return fig.show()

In [78]:
def edu_online(df):
    fig = px.pie(df, names=df['Education'].replace({1: 'Bacherlor', 2: 'Master', 3: 'Phd'}), values='Online', title='Customers which has degree and uses online services of the bank')
    fig.update_layout(legend_title_text='Degrees')
    return fig.show()

In [122]:
bins = [18, 40, 60, float('inf')]
labels = ['Young-aged', 'Middle-aged', 'Elderly']
df3 = df.copy()
df3['Age'] = pd.cut(df3['Age'], bins=bins, labels=labels)
def age_card(df):
    fig = px.histogram(df3, x='Age', color='CreditCard', barmode='group', text_auto=True)
    return fig.show()

In [140]:
def sunburst():
    df2 = df3.groupby('Age')['Experience'].mean().astype(int).to_frame()
    df2 = pd.concat([df2, df3.groupby('Age')['CreditCard'].sum()], axis=1)
    df4 = df3.groupby('Age')['CreditCard'].value_counts().to_frame()
    
    df5 = df3.groupby(['Age', 'CreditCard']).mean()
    df5['Count'] = df4['count']
    df5['Experience'] = df5['Experience'].astype(int)
    fig = px.sunburst(df5.reset_index(), path=['Age', 'Experience', 'CreditCard'], values='Count')
    return fig.show()

In [141]:
def card_count(df):
    df2 = df.groupby('CreditCard')['Mortgage'].sum()
    df2.rename(index={0: 'No', 1: 'Yes'}, inplace=True)
    fig = px.bar(df2, x=df2.index, y=df2.values, color=df2.values, text_auto=True)
    fig.update_layout(height=400, width=400, yaxis_title='Total mortgage amount', title='The statistics of CreditCard status')
    return fig.show()

In [142]:
def visualize(df):
    corr(df)
    card_count(df)
    exp_card(df)
    zip(df)
    edu_online(df)
    age_card(df)
    sunburst()
visualize(df)

# Prediction part

In [143]:
X = df.drop('Personal Loan', axis=1)
y = df['Personal Loan']

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32, test_size=0.4)

In [145]:
def fit_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Model name:', str(model))
    print(f'Accurance score: {int(metrics.accuracy_score(y_test, y_pred) * 100)}%')
    print(f"Mean squared error: {round(metrics.mean_squared_error(y_test, y_pred), 2)}")
    print(f"Cross Validation Score: {int(np.mean(cross_val_score(model, X_train, y_train, cv=10)) * 100)}%")
    print(f"Confusion Matrix: \n{metrics.confusion_matrix(y_test, y_pred)}")

In [146]:
model = LogisticRegression()
fit_model(model)

Model name: LogisticRegression()
Accurance score: 93%
Mean squared error: 0.06
Cross Validation Score: 93%
Confusion Matrix: 
[[1768   28]
 [  93  111]]


In [147]:
model = DecisionTreeClassifier(max_depth=50)
fit_model(model)

Model name: DecisionTreeClassifier(max_depth=50)
Accurance score: 98%
Mean squared error: 0.02
Cross Validation Score: 97%
Confusion Matrix: 
[[1784   12]
 [  20  184]]


In [148]:
model = RandomForestClassifier()
fit_model(model)

Model name: RandomForestClassifier()
Accurance score: 98%
Mean squared error: 0.01
Cross Validation Score: 98%
Confusion Matrix: 
[[1796    0]
 [  24  180]]


In [149]:
model = SVC()
fit_model(model)

Model name: SVC()
Accurance score: 89%
Mean squared error: 0.1
Cross Validation Score: 90%
Confusion Matrix: 
[[1796    0]
 [ 204    0]]


In [150]:
model = LinearDiscriminantAnalysis()
fit_model(model)

Model name: LinearDiscriminantAnalysis()
Accurance score: 93%
Mean squared error: 0.07
Cross Validation Score: 94%
Confusion Matrix: 
[[1752   44]
 [  92  112]]


In [151]:
model = KNeighborsClassifier(n_neighbors=5)
fit_model(model)

Model name: KNeighborsClassifier()
Accurance score: 89%
Mean squared error: 0.11
Cross Validation Score: 90%
Confusion Matrix: 
[[1747   49]
 [ 162   42]]
