In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split

# Reading Data

In [None]:
# Load the training CSV of the loan-prediction dataset into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/analytics-vidhya-loan-prediction/train.csv',
)

# Exploring Data

In [None]:
# Preview the first five rows to check column names and value formats.
df.head()

In [None]:
# Summarise the frame: per-column dtype and non-null count, which is where
# columns with missing values first become visible.
df.info()

- As a first observation, the dataset has missing values in several columns: __Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, and Credit_History__.

# Data Cleaning

In [None]:
# Work on a copy so the raw frame `df` is left untouched by the cleaning steps.
df_clean = df.copy()

In [None]:
# Remove Loan_ID (presumably a row identifier with no predictive value).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: the in-place version raised KeyError when run a second time.
df_clean = df_clean.drop(columns='Loan_ID', errors='ignore')

## Check Missing Values

In [None]:
# Number of missing values in each column.
df_clean.isna().sum()

In [None]:
# Share of missing values in each column, expressed as a percentage.
df_clean.isna().mean().mul(100)


### Check Duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row.
df_clean.duplicated().sum()

# Data Visualisation  

- Univariate analysis examines one variable at a time. The aim is to find patterns that are specific to that single variable.

In [None]:
# Names of all object-dtype columns (still includes the target at this point).
cat_cols = list(df_clean.select_dtypes(include='object').columns)
cat_cols

In [None]:
# Print the frequency table of every categorical column, each followed by a rule.
for column in cat_cols:
    print(
        df_clean[column].value_counts(),
        '------------------------------------------------------------',
        sep='\n',
    )

In [None]:
# One tall figure with a single column of panels: one histogram per categorical column.
n_panels = len(cat_cols)
plt.figure(figsize=(50, 100))
for position, column in enumerate(cat_cols, start=1):
    plt.subplot(n_panels, 1, position)
    sns.histplot(x=column, data=df_clean)
    # Large fonts compensate for the very large canvas.
    plt.xticks(fontsize=40)
    plt.yticks(fontsize=40)
    plt.xlabel(column, fontsize=80)

In [None]:
# Names of every column whose dtype is not object.
num_cols = list(df_clean.select_dtypes(exclude='O').columns)
num_cols

In [None]:
# Stack one histogram per non-object column in a single tall figure.
total_rows = len(num_cols)
plt.figure(figsize=(50, 100))
for row_number, feature in enumerate(num_cols, start=1):
    plt.subplot(total_rows, 1, row_number)
    sns.histplot(x=feature, data=df_clean)
    # Large fonts compensate for the very large canvas.
    plt.xticks(fontsize=40)
    plt.yticks(fontsize=40)
    plt.xlabel(feature, fontsize=80)

In [None]:
def plot_countplot(df_clean,cols):
    """Draw one count plot per column, with bars split by ``Loan_Status``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame containing every column named in ``cols`` plus a ``Loan_Status``
        column, which is used as the hue of each plot.
    cols : iterable of str
        Columns to plot; each gets its own subplot, three subplots per row.

    Notes
    -----
    The grid is 3x3 for up to nine columns, exactly as before. With more than
    nine columns it gains extra rows (and proportional figure height) instead
    of making ``plt.subplot`` raise for an out-of-range panel index.
    """
    cols = list(cols)
    grid_cols = 3
    # Ceiling division, but never fewer than the original three rows.
    grid_rows = max(3, -(-len(cols) // grid_cols))
    # 20x20 inches for the 3-row case; taller in proportion when rows are added.
    plt.figure(figsize=(20, 20 * grid_rows / 3))
    for position, col in enumerate(cols, start=1):
        plt.subplot(grid_rows, grid_cols, position)
        sns.countplot(x=col,data=df_clean,hue='Loan_Status')
        plt.title(f'{col}',size=10,loc='right')
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.xlabel(col, fontsize=40)
    plt.tight_layout()
    plt.show()

plot_countplot(df_clean,cat_cols)

# Data Splitting

In [None]:
# Separate the target column from the predictors.
y = df_clean['Loan_Status']
X = df_clean.drop(columns='Loan_Status')

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# ratios of the two parts should match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Object-dtype feature names: start from every object column, then take out the target.
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('Loan_Status')
categorical_cols

In [None]:
# Names of the int64 / float64 columns.
numerical_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

In [None]:
# Relative frequency of each target class in the training labels.
y_train.value_counts(normalize = True )


In [None]:
# Relative frequency of each target class in the test labels.
y_test.value_counts(normalize = True )


In [None]:
!pip install datasist


In [None]:
from datasist.structdata import detect_outliers 


In [None]:
# Flag outlying training rows on the three monetary columns and drop them from
# both the features and the labels so the two stay aligned. The test set is untouched.
# NOTE(review): presumably the second argument (0) means "flag a row if more than
# 0 of the listed columns are outlying" - confirm against the datasist docs.
# NOTE(review): LoanAmount may contain NaN at this stage; verify how
# detect_outliers treats missing values.
idx = detect_outliers(
    X_train,
    0,
    ['ApplicantIncome','CoapplicantIncome','LoanAmount'],
)
out = X_train.loc[idx]  # the removed rows, kept for inspection
X_train = X_train.drop(index=idx)
y_train = y_train.drop(index=idx)

In [None]:
# (rows, columns) of the training features after the outlier rows were dropped.
X_train.shape


In [None]:
# Length of the training labels; should match the row count of X_train.
y_train.shape


# Data Preprocessing

In [None]:
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Categorical branch: binary-encode each category column.
categorical_transformer = Pipeline(
    steps=[
        ('binary', BinaryEncoder()),
    ]
)

# Numerical branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group to its branch; numeric output columns come first.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            numerical_transformer,
            ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History'],
        ),
        (
            'cat',
            categorical_transformer,
            ['Gender','Married','Dependents','Education','Self_Employed','Property_Area'],
        ),
    ]
)

# Learn the statistics on the training rows only, then apply them unchanged to the test rows.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Baseline: logistic regression trained on the preprocessed training rows,
# scored on the held-out test rows.
lr = LogisticRegression(random_state=0)
lr.fit(X=X_train_preprocessed, y=y_train)
y_pred = lr.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Same model, but with class weights balanced to offset the uneven target classes.
lr2 = LogisticRegression(class_weight='balanced', random_state=0)
lr2.fit(X=X_train_preprocessed, y=y_train)
y_pred = lr2.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with a fixed seed, trained on the full training set.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=X_train_preprocessed, y=y_train)
y_pred = dt.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small random forest: 9 trees, depth capped at 23, fixed seed.
rf = RandomForestClassifier(
    n_estimators=9,
    max_depth=23,
    random_state=42,
)
rf.fit(X=X_train_preprocessed, y=y_train)
y_pred = rf.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample the training set (seeded) and print the resulting class counts.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X=X_train_preprocessed, y=y_train)
print(sorted(Counter(y_resampled).most_common()))

In [None]:
# Refit the logistic regression on the undersampled training data and evaluate it.
# Fix: predictions previously came from `dt`, so the report described the
# decision tree rather than the model that was just fitted.
lr.fit(X_resampled , y_resampled)
y_pred = lr.predict(X_test_preprocessed)
print(classification_report(y_test, y_pred))

In [None]:
# Refit the decision tree on the resampled training data and report test metrics.
dt.fit(X=X_resampled, y=y_resampled)
y_pred = dt.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Refit the random forest on the undersampled training data and evaluate it.
# Fix: predictions previously came from `dt`, so the forest that was just
# fitted was never actually scored.
rf.fit(X_resampled , y_resampled)
y_pred = rf.predict(X_test_preprocessed)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Search space: forest size 8-11 trees, maximum depth 15-24.
parameters = {
    'n_estimators':range(8,12),
    'max_depth': range(15,25),
}
# Seed both the forest and the sampler of candidate settings; without these the
# chosen candidates and the resulting best_params_ differ on every run.
rf = RandomForestClassifier(random_state=42)
# Try 3 random parameter combinations, each scored with 3-fold cross-validation.
RCV = RandomizedSearchCV(estimator=rf,
                   param_distributions=parameters,
                   n_iter=3,
                   cv=3,
                   random_state=42)

In [None]:
# Run the randomized search on the (non-resampled) preprocessed training data.
RCV.fit(X_train_preprocessed,y_train)


In [None]:
# Parameter combination with the best cross-validated score among those tried.
# NOTE(review): these values are only displayed; no later cell applies them to a model.
RCV.best_params_


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training set with SMOTE and print the resulting class counts.
# SMOTE synthesises samples at random; seeding it (0, matching the undersampler
# used earlier) makes the resampled set, and every metric derived from it, repeatable.
X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X_train_preprocessed, y_train)
print(sorted(Counter(y_resampled).items()))

In [None]:
# Refit the logistic regression on the SMOTE-resampled training data and evaluate it.
# Fix: predictions previously came from `dt`, so the report did not describe
# the model that was just fitted.
lr.fit(X_resampled , y_resampled)
y_pred = lr.predict(X_test_preprocessed)
print(classification_report(y_test, y_pred))

In [None]:
# Refit the decision tree on the current resampled training data and report test metrics.
dt.fit(X=X_resampled, y=y_resampled)
y_pred = dt.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Refit the random forest on the SMOTE-resampled training data and evaluate it.
# Fix: predictions previously came from `dt`, so the forest that was just
# fitted was never actually scored.
# NOTE(review): `rf` here is whichever estimator was most recently bound to
# that name - confirm it is the configuration you intend to evaluate.
rf.fit(X_resampled , y_resampled)
y_pred = rf.predict(X_test_preprocessed)
print(classification_report(y_test, y_pred))

In [None]:
# Imported under an alias so it does not shadow sklearn's Pipeline, which the
# preprocessing cell relies on.
from imblearn.pipeline import Pipeline as ImbPipeline

# Combined strategy: SMOTE first raises the minority class to half the size of
# the majority, then random undersampling trims the majority class.
# Both samplers are seeded so the resampled set is the same on every run.
over = SMOTE(sampling_strategy=0.5, random_state=0)
under = RandomUnderSampler(random_state=0)
steps = [('o', over), ('u', under)]
pipeline = ImbPipeline(steps=steps)
x_pipe, y_pipe = pipeline.fit_resample(X_train_preprocessed, y_train)
Counter(y_pipe)

In [None]:
# Refit the logistic regression on the over-then-under-sampled data and report test metrics.
lr.fit(X=x_pipe, y=y_pipe)
y_pred = lr.predict(X=X_test_preprocessed)
print(classification_report(y_true=y_test, y_pred=y_pred))