# Aishwarya's Machine Learning Project

This notebook covers the steps involved in training and evaluating a machine learning model. The sections include data loading, preprocessing, model building, training, and evaluation. The goal is to build a model that can predict the target variable with good accuracy.



In [194]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [195]:
# Load the dataset
raw_data = pd.read_csv('loan.csv')

# Display the first few rows of the dataset
raw_data.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


The dataset is loaded from a CSV file and the first few rows are displayed to understand the structure of the data.


In [196]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Data Preprocessing

In [197]:
# Replace '3+' with '3' in the 'Dependents' column
raw_data['Dependents'].replace('3+', '3', inplace=True)

# Convert the 'Dependents' column to float type
raw_data['Dependents'] = raw_data['Dependents'].astype(float)


In [198]:
raw_data['Loan_Amount_Term'].mean()

342.0

In [199]:
# Calculate the mode for 'Gender' and fill missing values with it
gender_mode = raw_data['Gender'].mode()[0]
raw_data['Gender'].fillna(gender_mode, inplace=True)

# Calculate the mode for 'Married' and fill missing values with it
married_mode = raw_data['Married'].mode()[0]
raw_data['Married'].fillna(married_mode, inplace=True)

# Calculate the mean for 'Dependents' and fill missing values with it
dependents_mean = int(raw_data['Dependents'].mean())
raw_data['Dependents'].fillna(dependents_mean, inplace=True)

# Calculate the mode for 'Self_Employed' and fill missing values with it
self_employed_mode = raw_data['Self_Employed'].mode()[0]
raw_data['Self_Employed'].fillna(self_employed_mode, inplace=True)

# Calculate the mean for 'LoanAmount' and fill missing values with it
loanamount_mean = raw_data['LoanAmount'].mean()
raw_data['LoanAmount'].fillna(loanamount_mean, inplace=True)

# Calculate the mean for 'Loan_Amount_Term' and fill missing values with it
loanamountterm_mean = raw_data['Loan_Amount_Term'].mean()
raw_data['Loan_Amount_Term'].fillna(loanamountterm_mean, inplace=True)

# Calculate the mean for 'Credit_History' and fill missing values with it
credit_history_mean = raw_data['Credit_History'].mean()
raw_data['Credit_History'].fillna(credit_history_mean, inplace=True)



In [200]:
raw_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [201]:
raw_data.drop('Loan_ID', axis = 1, inplace = True)

In [202]:
df = raw_data.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    float64
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(5), int64(1), object(6)
memory usage: 57.7+ KB


In [203]:
# Map 'Y' to 1 and 'N' to 0 in the 'Loan_Status' column
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

# Extract the modified 'Loan_Status' column as the target variable y
y = df['Loan_Status']

# Drop the 'Loan_Status' column from the DataFrame df
df.drop('Loan_Status', axis=1, inplace=True)

# Display information about the DataFrame df
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    float64
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(5), int64(1), object(5)
memory usage: 52.9+ KB


In [204]:
# Initialize empty lists to store column names based on data type
categorical_cols = []
numerical_cols = []

# Iterate through each column in the DataFrame
for col in df.columns:
    # Check if the data type of the column is 'object' (typically strings in Pandas)
    if df[col].dtype == 'object':
        categorical_cols.append(col)  # Append column name to categorical_cols list if it's categorical
    else:
        numerical_cols.append(col)  # Append column name to numerical_cols list if it's numerical

# Print out the lists of column names
print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


Categorical Columns: ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
Numerical Columns: ['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [205]:
encoded_df = pd.get_dummies(df[categorical_cols], drop_first=True, dtype = int)

# Step 3: Scale numerical data using StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols)

# Concatenate the encoded categorical data and scaled numerical data
X = pd.concat([encoded_df, scaled_df], axis=1)
X

Unnamed: 0,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,1,-0.737806,0.072991,-0.554487,0.000000,0.279851,0.451640
1,1,1,0,0,0,0,0.253470,-0.134412,-0.038732,-0.219273,0.279851,0.451640
2,1,1,0,1,0,1,-0.737806,-0.393747,-0.554487,-0.957641,0.279851,0.451640
3,1,1,1,0,0,1,-0.737806,-0.462062,0.251980,-0.314547,0.279851,0.451640
4,1,0,0,0,0,1,-0.737806,0.097728,-0.554487,-0.064454,0.279851,0.451640
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,-0.737806,-0.410130,-0.554487,-0.898095,0.279851,0.451640
610,1,1,0,0,0,0,2.236021,-0.212557,-0.554487,-1.267279,-2.518655,0.451640
611,1,1,0,0,0,1,0.253470,0.437174,-0.472404,1.269371,0.279851,0.451640
612,1,1,0,0,0,1,1.244745,0.357064,-0.554487,0.483367,0.279851,0.451640


In [206]:
X = X.values

In [207]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the data into train and test sets

# Build and Train the model

In [208]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [209]:
history = model.fit(
    X_train, y_train,
    epochs=50,  # Example number of epochs
    batch_size=32,  # Example batch size
    validation_data=(X_test, y_test)
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [210]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.5523
Test Accuracy: 0.7724
