In [2]:
import pandas as pd
import numpy as np
import warnings
import plotly.express as px
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

In [3]:
def load_dataset(url):
    return pd.read_csv(url)

dataset = load_dataset('Visa_For_Lisa_Loan_Modelling.csv')

In [4]:
def summarize_dataset(df):
    print(f"Dataset shape: ")
    df.shape
    print("\n\nDataset's summary:")
    df.info()
    print(f"First 10 rows of the dataset \n{df.head(10)}")
summarize_dataset(dataset)

Dataset shape: 


Dataset's summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
First 10 rows of th

In [5]:
def clean_dataset(df):
    try:
        df.drop('ID', axis=1, inplace=True)
    except:
        pass
    # df['Experience'] = df['Experience'].abs()

    return df
dataset = clean_dataset(dataset)
dataset

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [6]:
def correlation_function(df):
    fig = px.imshow(df.corr().abs(), text_auto=True)
    fig.update_layout(height=800, width=1000)
    return fig.show()


In [7]:
correlation_function(dataset)

In [8]:
def securities_accounts(dataset):
    fig = px.pie(dataset['Securities Account'].value_counts(),
                 names=dataset['Securities Account'].value_counts().index,
                 values=dataset['Securities Account'].value_counts().values,
                 title='Statistics of clients with Securities Account\t1 - Yes\t 0 - No',
                 labels={'labels':'Securities Account', 'values':'Count'},
                 color_discrete_sequence=px.colors.qualitative.Set1,
                 width=600
                 )
    fig.show()

In [9]:
securities_accounts(dataset)

In [11]:
def experience_income_relative_personal_lone(dataset):
    fig = px.box(dataset, x='Experience', y='Income', color='Personal Loan', facet_col='Personal Loan',title = 'Customer Experience and Income related to Personal Loan')
    fig.show()

In [12]:
experience_income_relative_personal_lone(dataset)

In [13]:
def CCAvg_Income_relative_personal_loan(dataset):
    fig = px.scatter(dataset, y = 'CCAvg', x = 'Income', color='Personal Loan',facet_col='Personal Loan',
                     title = 'Customer CCAvg and Income related to Personal Loan' , width = 1070)
    fig.show()

In [14]:
CCAvg_Income_relative_personal_loan(dataset)

In [15]:
def MortgageAge_personal_loan(dataset):

    fig = px.scatter(dataset, y='Mortgage', x='Age',facet_col='Personal Loan' ,color='Personal Loan',
                     title = 'Mortgage and Age related to Personal Loan')
    fig.show()


In [16]:
MortgageAge_personal_loan(dataset)

In [17]:
def experience_edu(dataset):
    names = dataset['Education'].replace({1: 'Bachelor', 2: 'Master', 3: 'Phd'})
    edu_counts = names.value_counts().reset_index()
    edu_counts.columns = ['Education', 'Count']
    fig = px.bar(edu_counts, x='Education', y='Count', color='Education',title = 'The level of education of the clients', width=800,text_auto=True)
    fig.show()

In [18]:
experience_edu(dataset)

Machine Learning Part

In [89]:
X = dataset.drop('CD Account', axis=1)
Y = dataset['CD Account']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [194]:
def print_fit_model(model):
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    print('Model name:', str(model))
    print(f'Accurance score: {int(metrics.accuracy_score(Y_test, y_pred) * 100)}%')
    print(f"Mean squared error: {round(metrics.mean_squared_error(Y_test, y_pred), 2)}")
    print(f"Cross Validation Score: {int(np.mean(cross_val_score(model, X_train, Y_train, cv=5)) * 100)}%")
    confusion_m = metrics.confusion_matrix(Y_test, y_pred)
    fig = px.imshow(confusion_m , text_auto=True,width=400,height=400,title='Confussion matrix')
    fig.show()
    

In [195]:
model = LogisticRegression()
print_fit_model(model)

Model name: LogisticRegression()
Accurance score: 93%
Mean squared error: 0.06
Cross Validation Score: 94%


In [196]:
model = MLPClassifier(hidden_layer_sizes=(10, 10), activation='relu')
print_fit_model(model)

Model name: MLPClassifier(hidden_layer_sizes=(10, 10))
Accurance score: 93%
Mean squared error: 0.06
Cross Validation Score: 90%


In [197]:
model = GaussianNB()
print_fit_model(model)

Model name: GaussianNB()
Accurance score: 93%
Mean squared error: 0.06
Cross Validation Score: 93%


In [198]:
model = ExtraTreesClassifier(n_estimators=170, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', bootstrap=False, random_state=42)
print_fit_model(model)

Model name: ExtraTreesClassifier(min_samples_leaf=2, n_estimators=170, random_state=42)
Accurance score: 97%
Mean squared error: 0.02
Cross Validation Score: 97%


In [199]:
model = GradientBoostingClassifier(n_estimators=150)
print_fit_model(model)

Model name: GradientBoostingClassifier(n_estimators=150)
Accurance score: 97%
Mean squared error: 0.02
Cross Validation Score: 97%


In [200]:
model = RandomForestClassifier(n_estimators=100,        
    criterion='gini',       
    max_depth=None,          
    min_samples_split=2,    
    min_samples_leaf=1,      
    max_features='sqrt',              
    random_state=42 )
print_fit_model(model)

Model name: RandomForestClassifier(random_state=42)
Accurance score: 97%
Mean squared error: 0.02
Cross Validation Score: 97%


In [201]:
model = DecisionTreeClassifier(criterion="gini", max_depth=None, min_samples_split=20, min_samples_leaf=40, max_features=None, random_state=52)
print_fit_model(model)

Model name: DecisionTreeClassifier(min_samples_leaf=40, min_samples_split=20,
                       random_state=52)
Accurance score: 97%
Mean squared error: 0.03
Cross Validation Score: 95%


In [202]:
model = KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',metric_params=None, n_jobs=10,
n_neighbors=6, p=20,weights='uniform')
print_fit_model(model)

Model name: KNeighborsClassifier(n_jobs=10, n_neighbors=6, p=20)
Accurance score: 93%
Mean squared error: 0.06
Cross Validation Score: 93%
