## Assignment 1
### Name:   Bannuru Rohit Kumar Reddy
### Roll Number:    21CS30011

In [None]:
# import all the necessary libraries here
import pandas as pd

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [None]:
# Read the dataset (path is relative to this notebook) and show its (rows, columns)
dataset_path = '../../dataset/cross-validation.csv'
df = pd.read_csv(dataset_path)
print(df.shape)

#### Analysing the data 

In [None]:
# Basic overview of the data: first rows, column dtypes, then summary statistics
for overview in (df.head(), df.dtypes, df.describe()):
    print(overview)

##### Check for Missing Values

In [None]:
# Number of missing entries in each column
df.isna().sum()

###### Since the number of missing values is not too large, we will drop every row that has a missing value

In [None]:
# Discard every row that has at least one missing value, then show the new shape
df = df.dropna(axis=0, how='any')
print(df.shape)

##### Separating the data into the input features and the output feature
###### Note: We remove the column 'Loan_ID' from the set of input features, as the ID has no effect on the prediction of the loan status

In [None]:
# Target column
y_df = df['Loan_Status']

# Input features: one copy that still carries 'Loan_ID', and one without it
X_df_withloanID = df.drop(columns=['Loan_Status'])
X_df = X_df_withloanID.drop(columns=['Loan_ID'])

# Shapes of the feature matrix and of the target, one per line
print(X_df.shape, y_df.shape, sep='\n')

##### Dealing with categorical data 
###### We will first have to convert the categorical data into numerical columns before we can train our model, as the model can only take numbers as input

In [None]:
# Encode the output variable: 'N' -> 0 and 'Y' -> 1.
# A single mapping followed by an explicit astype(int) guarantees an integer
# target regardless of how pandas handles dtype downcasting in `replace`;
# any label that is neither 'N'/'Y' nor already numeric makes astype fail loudly.
y_df = y_df.replace({'N': 0, 'Y': 1}).astype(int)

# Encode the input feature variables.
# Columns to be encoded into integers: Gender, Married, Education, Self_Employed, Property_Area

# Gender: 'Male' -> 0, any other value -> 1
X_df['Gender'] = (X_df['Gender'] != 'Male').astype(int)

# Married: 'Yes' -> 1, any other value -> 0
X_df['Married'] = (X_df['Married'] == 'Yes').astype(int)

# Education: 'Graduate' -> 0, any other value -> 1
X_df['Education'] = (X_df['Education'] != 'Graduate').astype(int)

# Self_Employed: 'Yes' -> 0, any other value -> 1
X_df['Self_Employed'] = (X_df['Self_Employed'] != 'Yes').astype(int)

# Property_Area: one-hot encode into one indicator column per category
X_final = pd.get_dummies(X_df, columns=['Property_Area'])

# Dependents: convert the open-ended category '3+' to 3 so the column is numeric
X_final['Dependents'] = X_final['Dependents'].replace('3+', 3).astype(float)


##### Applying the StandardScaler to the numerical data

In [None]:
# Columns that are standardised; the 0/1 encoded and one-hot columns are left untouched
numeric_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Dependents',
]

# Fit the scaler on these columns and overwrite them with their scaled values
scaler = StandardScaler()
X_final[numeric_columns] = scaler.fit_transform(X_final[numeric_columns])
X_final.head()

In [None]:
import numpy as np

# Manual k-fold cross-validation of a logistic-regression classifier on
# (X_final, y_df). Rows are shuffled once, split into `num_folds` contiguous
# folds, and each fold is used exactly once as the test set.

# Create an array of shuffled row positions
data_indexes = np.arange(len(X_final))
np.random.shuffle(data_indexes)

# Define the number of folds for cross-validation
num_folds = 5

# Size of each fold; the last fold additionally takes the remainder rows
fold_size = len(data_indexes) // num_folds

# Per-fold evaluation metrics
fold_accuracies = []
fold_precisions = []
fold_recalls = []

# Perform k-fold cross-validation
for fold_num in range(num_folds):
    # Determine the current fold's start and end positions
    start_idx = fold_num * fold_size
    end_idx = (fold_num + 1) * fold_size if fold_num < (num_folds - 1) else len(data_indexes)

    # Rows of the current fold form the test set ...
    current_fold_indexes = data_indexes[start_idx:end_idx]

    # ... and everything before and after the fold forms the training set.
    # Concatenating the two slices keeps the shuffled order and avoids an
    # O(n^2) membership test against the fold array.
    training_indexes = np.concatenate((data_indexes[:start_idx], data_indexes[end_idx:]))

    X_train, y_train = X_final.iloc[training_indexes], y_df.iloc[training_indexes]
    X_test, y_test = X_final.iloc[current_fold_indexes], y_df.iloc[current_fold_indexes]

    # Train a fresh model for every fold so no state carries over between folds
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)
    y_true = y_test.to_numpy()

    # Confusion-matrix counts for the positive class (label 1)
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    predicted_positives = np.sum(y_pred == 1)
    actual_positives = np.sum(y_true == 1)

    # Accuracy, precision and recall; a fold without predicted (or actual)
    # positives scores 0.0 instead of dividing by zero and yielding NaN
    fold_accuracy = np.mean(y_pred == y_true)
    fold_precision = true_positives / predicted_positives if predicted_positives else 0.0
    fold_recall = true_positives / actual_positives if actual_positives else 0.0

    # Append evaluation metrics to the respective lists
    fold_accuracies.append(fold_accuracy)
    fold_precisions.append(fold_precision)
    fold_recalls.append(fold_recall)

# Calculate the mean evaluation metrics across all folds
mean_fold_accuracy = np.mean(fold_accuracies)
mean_fold_precision = np.mean(fold_precisions)
mean_fold_recall = np.mean(fold_recalls)

# Print the mean evaluation metrics
print(f"Mean Accuracy: {mean_fold_accuracy:.4f}")
print(f"Mean Precision: {mean_fold_precision:.4f}")
print(f"Mean Recall: {mean_fold_recall:.4f}")
