In [32]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from plotly.subplots import make_subplots
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from ydata_profiling import ProfileReport


In [3]:
# Location of the survey CSV. The default is the original author's local copy;
# set the FINANCIAL_INCLUSION_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'FINANCIAL_INCLUSION_CSV',
    r'C:\Users\Hp\Desktop\dataset\Streamlit\Financial_inclusion_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Print column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Build an automated profiling report for the raw frame; ending the cell with
# the report object makes it the cell's displayed output.
report_Financial_inclusion = ProfileReport(df, title = 'Financial_inclusion_datas')
report_Financial_inclusion

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Alias for plotly's make_subplots (the function itself is not called here).
# NOTE(review): no other cell shown here uses `plot`; it looks like a leftover —
# confirm before removing.
plot = make_subplots

In [None]:
def Plots(df, n_cols=4):
    """Draw one box plot per column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one box trace per column.
    n_cols : int, default 4
        Number of subplot columns in the grid.

    Returns
    -------
    None
        The figure is displayed with ``figure.show()``.

    Notes
    -----
    The number of grid rows is derived from the number of columns in ``df``.
    The previous fixed 4x4 grid raised an error from ``add_trace`` as soon as
    the frame had more than 16 columns.
    """
    n_plots = len(df.columns)
    # Ceiling division; keep at least one row so an empty frame still
    # produces a valid (blank) figure instead of an invalid 0-row grid.
    n_rows = max(1, (n_plots + n_cols - 1) // n_cols)
    figure = make_subplots(rows=n_rows, cols=n_cols)
    for i, column in enumerate(df.columns):
        # Fill the grid left-to-right, top-to-bottom (plotly positions are 1-based).
        row = (i // n_cols) + 1
        col = (i % n_cols) + 1
        figure.add_trace(go.Box(y=df[column], name=column), row=row, col=col)
    figure.update_layout(width=1000, showlegend=False)
    figure.show()

# Box plots for the columns of the raw data.
Plots(df)
 



In [19]:
# Encoder that maps each distinct label of a column to an integer code.
le = LabelEncoder()

In [27]:
# Work on a copy so the raw frame `df` is not modified by the encoding.
df_encoded = df.copy()

In [28]:
# Columns that are replaced in place by integer label codes.
categorical_columns = [
    'country',
    'uniqueid',
    'bank_account',
    'location_type',
    'cellphone_access',
    'gender_of_respondent',
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type',
]

# The single shared encoder is refit on every column in turn, so after this
# cell `le` only holds the classes of the last column ('job_type').
for column_name in categorical_columns:
    df_encoded[column_name] = le.fit_transform(df_encoded[column_name])

In [None]:
# Check dtypes and non-null counts of the label-encoded frame.
df_encoded.info()

In [None]:
# Annotated heatmap of the pairwise correlations between the encoded columns.
correlation = sns.heatmap(df_encoded.corr(), annot=True, cmap='coolwarm')
correlation

In [None]:
# Z-scores for each feature, computed column by column on the encoded frame.
df_z_scores = df_encoded.apply(stats.zscore)

# Identify outliers (Z-score > 3 or < -3), one boolean per cell.
outliers = (df_z_scores > 3) | (df_z_scores < -3)
# Collapse to one flag per row: True when any column of that row is an outlier.
outlier_rows = outliers.any(axis=1)

# Store the outlying rows. They are taken from df_encoded (not df_z_scores) so
# the values stay in their original units rather than being standardised scores.
df_outliers = df_encoded[outlier_rows]
# Drop the outlying rows from the data with the ~ operator. Previously this
# sliced df_z_scores, so the "cleaned" frame held z-scores instead of the data.
df_cleaned = df_encoded[~outlier_rows]

df_cleaned.info()

In [None]:
def Plots(df_cleaned, n_cols=4):
    """Draw one box plot per column of ``df_cleaned`` on a grid of subplots.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Frame whose columns are plotted, one box trace per column.
    n_cols : int, default 4
        Number of subplot columns in the grid.

    Returns
    -------
    None
        The figure is displayed with ``figure.show()``.

    Notes
    -----
    The number of grid rows is derived from the number of columns in the
    frame. The previous fixed 4x4 grid raised an error from ``add_trace`` as
    soon as the frame had more than 16 columns.
    """
    n_plots = len(df_cleaned.columns)
    # Ceiling division; keep at least one row so an empty frame still
    # produces a valid (blank) figure instead of an invalid 0-row grid.
    n_rows = max(1, (n_plots + n_cols - 1) // n_cols)
    figure = make_subplots(rows=n_rows, cols=n_cols)
    for i, column in enumerate(df_cleaned.columns):
        # Fill the grid left-to-right, top-to-bottom (plotly positions are 1-based).
        row = (i // n_cols) + 1
        col = (i % n_cols) + 1
        figure.add_trace(go.Box(y=df_cleaned[column], name=column), row=row, col=col)
    figure.update_layout(width=1000, showlegend=False)
    figure.show()

# Box plots again, this time for the frame with the outlier rows removed.
Plots(df_cleaned)
 

In [None]:
#  histogram :
def create_histograms(df_cleaned, columns, nbins=10, title='Financial inclusion african countries'):
    """Draw one histogram per requested column on a 3-column grid of subplots.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Frame the histograms are computed from.
    columns : iterable of str
        Names of the columns to plot; each name is also used as its subplot title.
    nbins : int, default 10
        Passed to ``px.histogram`` as the bin-count setting for every histogram.
    title : str
        Title of the whole figure.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Materialise once so any iterable (e.g. a pandas Index) can be passed and
    # reused for both the subplot titles and the loop.
    columns = list(columns)
    # Ceiling division by 3; at least one row so an empty selection still
    # gives a valid (blank) figure.
    rows = max(1, (len(columns) + 2) // 3)
    fig = make_subplots(rows=rows, cols=3, subplot_titles=columns)
    for i, column in enumerate(columns):
        # Use the frame that was passed in. The previous version read the
        # global `df`, so the argument was silently ignored.
        fig_hist = px.histogram(df_cleaned, x=column, nbins=nbins)
        row = (i // 3) + 1
        col = (i % 3) + 1
        # px.histogram returns a whole figure; move its single trace into the grid.
        fig.add_trace(fig_hist['data'][0], row=row, col=col)
    fig.update_layout(height=rows*300, title_text=title, showlegend=False)

    fig.show()

# One histogram per column name found in df_cleaned, with nbins set to 5.
create_histograms(df_cleaned, columns=df_cleaned.columns, nbins=5)

**Data exploration: train and test a machine learning classifier**

In [None]:
# Build features and target from df_encoded: every column except
# 'bank_account' is a feature, and 'bank_account' is the target.
# NOTE(review): X still contains the label-encoded 'uniqueid' column, which
# looks like a row identifier rather than a predictive feature — confirm
# whether it should be dropped.
X = df_encoded.drop(columns=['bank_account'])
y = df_encoded['bank_account']
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

**KNN**

In [None]:
# k-nearest-neighbours classifier using 5 neighbours and Euclidean distance.
# NOTE(review): the features are not scaled in the cells shown here, so columns
# with larger numeric ranges dominate the distance — confirm this is intended.
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train,y_train)

In [50]:
# Predicted classes for the held-out test rows.
y_pred = knn.predict(X_test)

In [None]:
# Score the KNN predictions against the held-out labels.
conf_matrix_knn = confusion_matrix(y_test, y_pred)
accuracy_knn = accuracy_score(y_test, y_pred)
precision_knn = precision_score(y_test, y_pred)
recall_knn = recall_score(y_test, y_pred)
f1_knn = f1_score(y_test, y_pred)

# One metric per output line, followed by the raw confusion matrix.
print(
    f'accuracy : {accuracy_knn}',
    f'Precision : {precision_knn}',
    f'recall : {recall_knn}',
    f'f1 : {f1_knn}',
    f'Confusion Matrix {conf_matrix_knn}',
    sep='\n',
)

In [None]:
# KNN confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
sns.heatmap(conf_matrix_knn, annot=True)

**Decision Tree**

In [None]:
# Same features, target and split settings as the KNN section
# (test_size=0.2, random_state=42), stored under separate names for the tree.
X1 = df_encoded.drop(columns=['bank_account'])
y1 = df_encoded['bank_account']
X1_train, X1_test, y1_train,y1_test = train_test_split(X1,y1, test_size=0.2, random_state=42)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

In [None]:
# Decision tree limited to depth 10. random_state is fixed (same seed as the
# train/test split) because scikit-learn randomly permutes candidate features
# at each split, so an unseeded tree can differ between runs.
decision_tree = DecisionTreeClassifier(max_depth=10, random_state=42)
decision_tree.fit(X1_train,y1_train)

In [54]:
# Predicted classes from the decision tree for the held-out test rows.
y1_pred = decision_tree.predict(X1_test)

In [None]:
# Score the decision-tree predictions against the held-out labels.
accuracy1_tree = accuracy_score(y1_test, y1_pred)
precision_tree = precision_score(y1_test, y1_pred)
recall_tree = recall_score(y1_test, y1_pred)
f1_tree = f1_score(y1_test, y1_pred)
conf_matrix_tree = confusion_matrix(y1_test, y1_pred)
# Labels match the KNN section; the value from f1_score was previously
# printed under the label 'f2', and accuracy under 'accuracy1'.
print(f'accuracy : {accuracy1_tree}')
print(f'Precision : {precision_tree}')
print(f'recall : {recall_tree}')
print(f'f1 : {f1_tree}')
print(f'Confusion Matrix {conf_matrix_tree}')

In [None]:
# Decision-tree confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
sns.heatmap(conf_matrix_tree, annot=True)

**Random Forest**

In [None]:
# Same features, target and split settings as the earlier sections
# (test_size=0.2, random_state=42), stored under separate names for the forest.
X2= df_encoded.drop(columns=['bank_account'])
y2 = df_encoded['bank_account']
X2_train, X2_test, y2_train,y2_test = train_test_split(X2,y2, test_size=0.2, random_state=42)
X2_train.shape, X2_test.shape, y2_train.shape, y2_test.shape

In [None]:
# Forest of 10 trees. random_state is fixed (same seed as the train/test split)
# so the bootstrap samples and per-split feature subsets, and therefore the
# reported metrics, are identical on every run.
random_forest = RandomForestClassifier(n_estimators=10, random_state=42)
random_forest.fit(X2_train, y2_train)

In [65]:
# Predicted classes from the random forest for the held-out test rows.
y2_pred = random_forest.predict(X2_test)

In [None]:
# Score the random-forest predictions against the held-out labels.
accuracy_forest = accuracy_score(y2_test, y2_pred)
precision_forest = precision_score(y2_test, y2_pred)
recall_forest = recall_score(y2_test, y2_pred)
f1_forest = f1_score(y2_test, y2_pred)
conf_matrix_forest = confusion_matrix(y2_test, y2_pred)
print(f'accuracy : {accuracy_forest}')
print(f'Precision : {precision_forest}')
print(f'recall : {recall_forest}')
# The value comes from f1_score; it was previously printed under the label 'f2'.
print(f'f1 : {f1_forest}')
print(f'Confusion Matrix {conf_matrix_forest}')

In [None]:
# Random-forest confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
sns.heatmap(conf_matrix_forest, annot=True)

**Grid Search**

In [None]:
# Same features, target and split settings as the earlier sections
# (test_size=0.2, random_state=42), stored under separate names for the grid search.
X3= df_encoded.drop(columns=['bank_account'])
y3 = df_encoded['bank_account']
X3_train, X3_test, y3_train,y3_test = train_test_split(X3,y3, test_size=0.2, random_state=42)
X3_train.shape, X3_test.shape, y3_train.shape, y3_test.shape

In [69]:
# Base estimator for the search. It is seeded (same seed as the train/test
# split) so every candidate is scored reproducibly and the selected
# parameters do not change between runs.
model_grid_search = DecisionTreeClassifier(random_state=42)
# 3 criteria x 4 depths x 3 split thresholds = 36 candidate settings.
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth' : [None,10,20,30],
              'min_samples_split' : [2,5,10]}
# Each candidate is scored by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(estimator= model_grid_search, param_grid=param_grid, cv=5, scoring='accuracy')

In [70]:
# Run the cross-validated grid search on the training split.
# NOTE(review): fit() returns the fitted GridSearchCV object, not predictions,
# so `y_pred3` is a misleading name; test-set predictions would come from
# grid_search.predict(X3_test).
y_pred3 = grid_search.fit(X3_train, y3_train)

In [None]:
# Report the best hyper-parameter combination and its mean cross-validated accuracy.
# NOTE(review): the held-out X3_test / y3_test split is never scored in the
# cells shown here.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)