In [1]:
!pip install pandas matplotlib seaborn scikit-learn



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display

In [2]:
# Read the loan-modelling CSV directly from its public URL into `df`.
# This needs network access: pd.read_csv fetches the file over HTTPS.
url = "https://storage.googleapis.com/qwasar-public/track-ds/Visa_For_Lisa_Loan_Modelling.csv"
df = pd.read_csv(url)


URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
def clean_dataset(dataset):
    """Remove the ``ID`` column from ``dataset`` in place.

    A frame that has no ``ID`` column (for example because this function has
    already been run on it) is left untouched, so the call is safe to repeat.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    # errors='ignore' only tolerates a missing label; any other problem
    # (e.g. passing something that is not a DataFrame) is no longer hidden
    # the way the previous bare ``except: pass`` hid it.
    dataset.drop(columns='ID', inplace=True, errors='ignore')

def summarize_data(dataset, sample_size=10):
    """Print and display a quick overview of ``dataset``.

    Shows, in order: the shape, a reproducible random sample of rows, the
    output of ``describe()``, and a table with the NaN count of every column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to summarise.
    sample_size : int, default 10
        Maximum number of rows to sample. It is capped at the number of rows
        available so that small frames do not raise.

    Returns
    -------
    None
    """
    print(f"Dataset shape: {dataset.shape}")

    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds (without replacement), so cap the request.
    rows_to_show = min(sample_size, len(dataset))
    print(f"\nSample {rows_to_show} rows from dataset:")
    display(dataset.sample(rows_to_show, random_state=31))

    print("\nDescribe the dataset:")
    display(dataset.describe())

    print("\nNaN values count:")
    # Compute the per-column NaN totals once and reuse them for both columns.
    nan_counts = dataset.isna().sum()
    display(pd.DataFrame({"Column": nan_counts.index, "Count": nan_counts.values}))

def plot_income_distribution(dataset, hue=None):
    """Draw stacked histogram/KDE plots for ``Income`` and then ``CCAvg``.

    Each column gets its own figure, which is titled and shown immediately.
    A blank line is printed between the two figures.

    Parameters
    ----------
    dataset : DataFrame-like
        Passed to seaborn as ``data``; must provide ``Income`` and ``CCAvg``.
    hue : optional
        Forwarded unchanged as the ``hue`` argument of ``sns.displot``.
    """
    # (column to plot, figure title), in display order.
    figures = (
        ('Income', "Income Distribution"),
        ('CCAvg', "CCAvg Distribution"),
    )

    for position, (column, title) in enumerate(figures):
        if position > 0:
            # Visual separator between consecutive figures.
            print()

        sns.displot(
            data=dataset,
            x=column,
            hue=hue,
            kde=True,
            multiple='stack',
            height=6,
            aspect=2,
        )
        plt.title(title)
        plt.show()

def plot_credit_card_usage(dataset, hue=None):
    """Plot the distribution of the ``Education`` column with named tick labels.

    Parameters
    ----------
    dataset : DataFrame-like
        Passed to seaborn as ``data``; must provide an ``Education`` column.
    hue : optional
        Forwarded unchanged as the ``hue`` argument of ``sns.displot``.
    """
    sns.displot(data=dataset, x='Education', hue=hue, kde=True, multiple='stack', height=6, aspect=2)

    # Tick positions 1, 2, 3 are relabelled with education level names
    # (assumes Education is coded 1/2/3 - TODO confirm against the data dictionary).
    # The last label previously read 'advancedl' (typo).
    plt.xticks([1, 2, 3], ['undergraduate', 'graduate', 'advanced'], rotation=90)

    # NOTE(review): the title talks about credit cards although the x-axis is
    # Education - confirm which of the two was intended.
    plt.title("Bank users use credit cards")

    plt.show()

def show_correlation_heatmap(dataset):
    """Show an annotated heatmap of ``dataset.corr()`` on a 12x12 figure.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    """
    plt.figure(figsize=(12, 12))

    correlations = dataset.corr()
    heatmap_options = {
        'square': True,       # equal-sized cells
        'annot': True,        # write the coefficient inside each cell
        'cmap': "coolwarm",
        'linewidths': 0.1,
    }
    sns.heatmap(data=correlations, **heatmap_options)

    plt.title("Dataset correlation")
    plt.show()

def _counts_with_labels(series, names_by_code):
    """Return the value counts of ``series`` ordered by code, plus matching labels."""
    # value_counts() orders by frequency; sort_index() pins the order to the
    # category code so that every slice is paired with the label of its own code.
    counts = series.value_counts().sort_index()
    labels = [names_by_code.get(code, str(code)) for code in counts.index]
    return counts, labels


def plot_customer_insights(dataset):
    """Draw three pie charts: loan approval, education level and credit card usage.

    Labels are looked up from the category code of each slice rather than
    from the slice position. Previously the labels were hard-coded in code
    order while ``value_counts()`` returned the slices in frequency order, so
    a chart was mislabelled whenever those two orders differed (for example
    when Education code 3 is more frequent than code 2).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the ``Personal Loan``, ``Education`` and ``CreditCard``
        columns. Assumes Personal Loan/CreditCard are coded 0/1 and Education
        1/2/3 - TODO confirm against the data dictionary. Codes without a
        known label are shown with the code itself as label.
    """
    _, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(30, 9))

    palette_color = sns.color_palette('bright')

    loan_counts, loan_labels = _counts_with_labels(
        dataset['Personal Loan'], {0: 'Not Approved', 1: 'Approved'}
    )
    # Keep red for "not approved" and green for "approved" whatever the slice order.
    loan_colors = [{0: 'red', 1: 'green'}.get(code, 'grey') for code in loan_counts.index]
    ax1.set_title("Personal Loan Approval")
    ax1.pie(loan_counts, labels=loan_labels, colors=loan_colors, autopct='%.0f%%', shadow=True)

    education_counts, education_labels = _counts_with_labels(
        dataset['Education'], {1: 'Undergraduate', 2: 'Graduate', 3: 'Advanced/Professional'}
    )
    ax2.set_title("Customers Education Level")
    ax2.pie(education_counts, labels=education_labels, colors=palette_color, autopct='%.0f%%', shadow=True)

    card_counts, card_labels = _counts_with_labels(
        dataset['CreditCard'], {0: 'No Usage', 1: 'Usage'}
    )
    ax3.set_title("Credit Card Usage")
    ax3.pie(card_counts, labels=card_labels, colors=palette_color, autopct='%.0f%%', shadow=True)

    plt.show()

def plot_income_scatter(dataset, hue=None):
    """Show three side-by-side scatter plots relating ``Income`` to other columns.

    Panels, left to right: Experience vs Income, Age vs Income and
    Mortgage (x) vs Income (y).

    Parameters
    ----------
    dataset : DataFrame-like
        Passed to seaborn as ``data``; must provide ``Experience``, ``Age``,
        ``Mortgage`` and ``Income``.
    hue : optional
        Forwarded unchanged as the ``hue`` argument of ``sns.scatterplot``.
    """
    # (panel title, x column, y column), one entry per panel.
    panels = (
        ("Experience vs Income", 'Experience', 'Income'),
        ("Age vs Income", 'Age', 'Income'),
        ("Income vs Mortgage", 'Mortgage', 'Income'),
    )

    _, axes = plt.subplots(1, len(panels), figsize=(20, 7))

    for axis, (title, x_column, y_column) in zip(axes, panels):
        axis.set_title(title)
        sns.scatterplot(data=dataset, x=x_column, y=y_column, hue=hue, ax=axis)

    plt.show()

def plot_pair_relationships(dataset, hue=None, columns=None):
    """Show a KDE pair plot for a selection of columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame.
    hue : column label, optional
        Column used to colour the plot. It is added to the selection when it
        is missing from ``columns``, because seaborn looks it up in the data
        it is given.
    columns : sequence of column labels, optional
        Columns to include. ``None`` (the default) plots every column; before,
        the default raised ``KeyError`` because ``dataset[None]`` was evaluated.
    """
    if columns is None:
        plot_data = dataset
    else:
        selected = list(columns)
        if hue is not None and hue not in selected:
            selected.append(hue)
        plot_data = dataset[selected]

    sns.pairplot(plot_data, hue=hue, kind='kde')
    # A pair plot is a grid of axes: plt.title would only label the last
    # sub-plot, so put the caption on the figure instead.
    plt.suptitle("Some are contradictory", y=1.02)
    plt.show()

def transform_personal_loan(dataset):
    """Return the ``Personal Loan`` column as text labels.

    The column is converted to strings; ``'1'`` becomes ``'Given'`` and
    ``'0'`` becomes ``'Not Given'``. Any other value is kept as its string
    form. ``dataset`` itself is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``Personal Loan`` column.

    Returns
    -------
    pandas.Series
        The relabelled column, suitable as a plotting ``hue``.
    """
    label_by_code = {'1': 'Given', '0': 'Not Given'}
    loan_as_text = dataset['Personal Loan'].astype(str)
    return loan_as_text.replace(label_by_code)

def visualize_histograms(dataset, figsize=(20, 16), bins=100):
    """Show the histograms produced by ``dataset.hist``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose ``hist`` method is called.
    figsize : tuple, default (20, 16)
        Forwarded to ``dataset.hist`` as ``figsize``.
    bins : int, default 100
        Forwarded to ``dataset.hist`` as ``bins``.
    """
    histogram_options = {'figsize': figsize, 'bins': bins}
    dataset.hist(**histogram_options)
    plt.show()


In [None]:
# Exploratory analysis driver: runs the helper functions defined above on `df`.
# clean_dataset drops the 'ID' column from `df` in place, so every later step
# (and every later cell) sees the frame without it.
clean_dataset(df)
summarize_data(df)
# Text version of 'Personal Loan' ('Given' / 'Not Given'), used as plot hue below.
transformed_personal_loan = transform_personal_loan(df)
plot_income_distribution(df, hue=transformed_personal_loan)
plot_credit_card_usage(df, hue=transformed_personal_loan)
show_correlation_heatmap(df)
plot_customer_insights(df)
plot_income_scatter(df, hue=transformed_personal_loan)
# Columns shown in the pair plot; 'Personal Loan' is included because it is the hue column.
real_columns = ['Income', 'CCAvg', 'CD Account', 'Education', 'Mortgage', 'Personal Loan']
# NOTE(review): test_columns is not referenced anywhere in the visible cells.
test_columns = ['Income', 'Personal Loan']
plot_pair_relationships(df, hue='Personal Loan', columns=real_columns)
visualize_histograms(df)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import classification_report, accuracy_score, classification_report, mean_squared_error, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
class ModelPerformance:
    """Holds one model together with the evaluation metrics recorded for it.

    All metric attributes start as ``None``; they are meant to be assigned
    from outside before ``display_metrics`` is called.
    """

    def __init__(self, model):
        """Store ``model`` and its string form, and blank out every metric."""
        self.model, self.model_name = model, str(model)

        # Metrics are filled in later by whoever evaluates the model.
        for metric in (
            "accuracy_score",
            "cross_validation_scores",
            "mean_cross_validation_score",
            "mean_squared_error",
            "confusion_matrix",
        ):
            setattr(self, metric, None)

    def display_metrics(self):
        """Print the model name and its metrics to standard output.

        Accuracy and mean cross-validation score are printed as whole
        percentages (truncated by ``int``). Only the top-left 2x2 cells of
        the confusion matrix are printed.
        """
        print(f"Model: {self.model_name}")

        accuracy_percent = int(self.accuracy_score * 100)
        print(f"Accuracy Score: {accuracy_percent}%")

        cross_validation_percent = int(self.mean_cross_validation_score * 100)
        print(f"Cross-Validation Score: {cross_validation_percent}%")

        print(f"Mean Squared Error: {self.mean_squared_error}")

        cells = self.confusion_matrix
        top_row = f"{cells[0][0]} {cells[0][1]} "
        bottom_row = f"{cells[1][0]}  {cells[1][1]}"
        print(f"Confusion Matrix:\n{top_row}\n{bottom_row}\n\n")

class MachineLearningModel:
    """Fit several estimators on one train/test split and compare their scores.

    Parameters
    ----------
    models : iterable
        Estimator objects exposing ``fit`` and ``predict``.
    x, y : array-like
        Features and target; split once in the constructor.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split (and therefore comparable scores) on every run; ``None`` keeps
        the previous unseeded behaviour.
    """

    def __init__(self, models, x=None, y=None, test_size=0.2, random_state=None):
        self.models = models
        self.fitted_models = []
        self.performances = []
        self.X = x
        self.Y = y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )

    def train_models(self):
        """Fit every model on the training split and return the fitted models.

        The list is rebuilt on each call, so re-running (e.g. re-executing a
        notebook cell) no longer accumulates duplicate entries.
        """
        self.fitted_models = []
        for model in self.models:
            model.fit(self.X_train, self.y_train)
            self.fitted_models.append(model)

        return self.fitted_models

    def evaluate_models(self):
        """Score every fitted model and return the ``ModelPerformance`` list.

        For each model this records test-split accuracy, 10-fold
        cross-validation scores on the full ``X``/``Y`` (and their mean), the
        mean squared error of the test predictions and the confusion matrix.
        The list is rebuilt on each call so repeated evaluation does not
        produce duplicates.
        """
        self.performances = []
        for model in self.fitted_models:
            y_predict = model.predict(self.X_test)

            performance = ModelPerformance(model)
            performance.accuracy_score = accuracy_score(self.y_test, y_predict)
            performance.cross_validation_scores = cross_val_score(model, self.X, self.Y, cv=10)
            performance.mean_cross_validation_score = np.mean(performance.cross_validation_scores)
            performance.mean_squared_error = mean_squared_error(self.y_test, y_predict)
            performance.confusion_matrix = confusion_matrix(self.y_test, y_predict)

            self.performances.append(performance)
        return self.performances

    def display_model_metrics(self):
        """Print the metrics of every evaluated model."""
        for performance in self.performances:
            performance.display_metrics()

    def get_performance(self, model_name):
        """Return the first performance whose ``model_name`` matches, else ``None``."""
        for performance in self.performances:
            if performance.model_name == model_name:
                return performance
        return None

    def get_top_models(self, n):
        """Return the ``n`` performances with the highest test accuracy, best first."""
        sorted_models = sorted(self.performances, key=lambda x: x.accuracy_score, reverse=True)
        return sorted_models[:n]


In [None]:
# Candidate estimators to compare; each one is fitted and scored by MachineLearningModel.
regresssions_and_classifiers = [
    LogisticRegression(max_iter=3000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    SVC(),
    KNeighborsClassifier()
]

# Feature columns and target column taken from the loaded dataset.
x_df_data = df[['Income', 'CCAvg', 'CD Account', 'Education', 'Mortgage']]
y_df_data = df['Personal Loan']

# Split the data (default test_size=0.2), fit every estimator, score them and print the metrics.
ml_model = MachineLearningModel(regresssions_and_classifiers, x_df_data, y_df_data)
ml_model.train_models()
ml_model.evaluate_models()
ml_model.display_model_metrics()


In [None]:
import re

import joblib

# Persist the two models with the highest test accuracy, one .joblib file each.
for performance in ml_model.get_top_models(2):
    model = performance.model
    # model_name is str(model), e.g. "RandomForestClassifier()"; keep the class name only.
    class_name = str(performance.model_name).split('(')[0]
    # Remove a trailing "Classifier"/"Regression" suffix. str.strip() treats its
    # argument as a set of characters, not a suffix, so the previous code turned
    # "DecisionTreeClassifier" into "DecisionT" and "SVC" into "SV".
    short_name = re.sub(r'(?:Classifier|Regression)$', '', class_name) or class_name
    filename = short_name + '.joblib'

    # joblib.dump opens and writes the target path itself; wrapping it in a
    # separate open() only created a second, unused handle on the same file.
    joblib.dump(model, filename)
    print(model, "saved as", filename)

In [None]:
import glob

loaded_models = []

# Load every .joblib file found in the working directory.
# sorted() makes the load order deterministic (glob returns files in arbitrary order).
# NOTE: joblib.load unpickles its input and can execute arbitrary code - only
# load files that this notebook (or another trusted source) produced.
for model_path in sorted(glob.glob('./*.joblib')):
    # Read-only binary mode: loading does not need write access to the file.
    with open(model_path, 'rb') as file:
        loaded_models.append(joblib.load(file))

for model in loaded_models:
    print(model)
