# **Importing libraries**

In [453]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier


# **Loading Dataset**

In [454]:
df=pd.read_csv('/content/Assignment_Train.csv')
test=pd.read_csv('/content/Assignment_Test.csv')

In [455]:
df.head()

Unnamed: 0,DEALER ID,APPLICATION LOGIN DATE,HDB BRANCH NAME,HDB BRANCH STATE,FIRST NAME,MIDDLE NAME,LAST NAME,mobile,AADHAR VERIFIED,Cibil Score,...,Phone Social Premium.shaadi,Phone Social Premium.skype,Phone Social Premium.toi,Phone Social Premium.whatsapp,Phone Social Premium.yatra,Phone Social Premium.zoho,phone_digitalage,phone_nameMatchScore,phone_phoneFootprintStrengthOverall,Application Status
0,106989,07/20/2022,DELHI-SF,DELHI,SUNIL,,CHANDER,9210574080,NO,726.0,...,0.0,0.0,1.0,,,0.0,5324.0,67.222222,High,APPROVED
1,108975,07/28/2022,PATNA-SF,BIHAR,AMRIT,,KUMAR,8877987018,NO,,...,0.0,0.0,0.0,,,0.0,1998.0,100.0,High,APPROVED
2,111004,07/15/2022,DARJEELING-SF,WEST BENGAL,ANIMESH,,THAPA,8910862135,NO,737.0,...,0.0,0.0,0.0,,,0.0,-1.0,-1.0,Low,APPROVED
3,192020,07/04/22,SAHARANPUR-SF,UTTAR PRADESH,ADITYA,,SINGH,9758428017,NO,713.0,...,0.0,0.0,1.0,,,0.0,1998.0,72.777778,High,APPROVED
4,55095,07/15/2022,MODASA-SF,GUJARAT,PARMAR,HARESHBHAI,AMRUTBHAI,9687028486,NO,669.0,...,0.0,0.0,1.0,,,0.0,1998.0,68.095238,High,DECLINED


In [456]:
df.columns

Index(['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME',
       'HDB BRANCH STATE', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile',
       'AADHAR VERIFIED', 'Cibil Score', 'MOBILE VERIFICATION', 'DEALER NAME',
       'TOTAL ASSET COST', 'ASSET CTG', 'ASSET MODEL NO', 'APPLIED AMOUNT',
       'PRIMARY ASSET MAKE', 'Primary Asset Model No',
       'Personal Email Address', 'MARITAL STATUS', 'GENDER', 'DOB', 'AGE',
       'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME', 'EMPLOYER TYPE',
       'Pan Name', 'name', 'vpa', 'upi_name', 'Phone Social Premium.a23games',
       'Phone Social Premium.amazon', 'Phone Social Premium.byjus',
       'Phone Social Premium.flipkart', 'Phone Social Premium.housing',
       'Phone Social Premium.indiamart', 'Phone Social Premium.instagram',
       'Phone Social Premium.isWABusiness',
       'Phone Social Premium.jeevansaathi', 'Phone Social Premium.jiomart',
       'Phone Social Premium.microsoft', 'Phone Social Premium.my11',
       

# **Indentifying Columns With Missing Value**

In [458]:
# Identify columns with missing values
missing_values = df.isnull().sum()

# Filter and display columns with missing values
missing_columns = missing_values[missing_values > 0]
missing_columns

Unnamed: 0,0
HDB BRANCH NAME,1
HDB BRANCH STATE,854
MIDDLE NAME,7145
LAST NAME,681
Cibil Score,4297
DEALER NAME,4
TOTAL ASSET COST,5108
ASSET CTG,5108
MARITAL STATUS,4894
ADDRESS TYPE,3312


# **Handling Missing values**

In [459]:
def handle_missing_values(df):
    # Convert 'Cibil Score' to numeric, forcing errors to NaN
    df['Cibil Score'] = pd.to_numeric(df['Cibil Score'], errors='coerce')

    # Fill missing categorical values with 'Unknown'
    categorical_df = df.select_dtypes(include=['object']).columns
    df[categorical_df] = df[categorical_df].fillna('Unknown')

    # Fill missing boolean (0/1) values and convert to int
    bool_features = [col for col in df.columns if col.startswith('Phone Social Premium.')]
    for feature in bool_features:
        df[feature] = df[feature].fillna(0).astype(int)

    # Impute missing integer and float values with the mean
    imputer = SimpleImputer(strategy='mean')
    df[df.select_dtypes(include=['int64', 'float64']).columns] = imputer.fit_transform(df.select_dtypes(include=['int64', 'float64']))

    return df

In [460]:
df=handle_missing_values(df)
test=handle_missing_values(test)

# **Label-Encoding**

In [461]:
#Converting categorical values to integer values
label_encoder = LabelEncoder()
categorical_df = df.select_dtypes(include=['object']).columns
for i in categorical_df:
   df[i] = label_encoder.fit_transform(df[i])

categorical_df = test.select_dtypes(include=['object']).columns
for i in categorical_df:
   test[i] = label_encoder.fit_transform(test[i])

# **Compute correlation with 'Application Status**

In [494]:
# Compute correlation with 'Application Status'
corr_with_status = df.corr()['Application Status'].sort_values(ascending=False)
corr_with_status

Unnamed: 0,Application Status
Application Status,1.0
MARITAL STATUS,0.6439848
EMPLOY CONSTITUTION,0.6163364
EMPLOYER TYPE,0.5919047
ASSET CTG,0.5571758
ADDRESS TYPE,0.314244
EMPLOYER NAME,0.207745
DEALER ID,0.04677303
Pan Name,0.04611008
vpa,0.04037244


# **Dropping Columns Which Are Not Correlated With Target Variable**

In [469]:
columns_to_drop = [
    'APPLICATION LOGIN DATE','AADHAR VERIFIED','MOBILE VERIFICATION','Phone Social Premium.a23games','Phone Social Premium.my11',
       'Phone Social Premium.rummycircle','Phone Social Premium.yatra'
]

# Check existing columns
existing_columns = [col for col in columns_to_drop if col in df.columns]

# Drop only existing columns
df.drop(columns=existing_columns, inplace=True)

In [482]:
# Dropping target and redundant features from the training and test set
X = df.drop(columns=['Application Status','DEALER ID'])
y = df['Application Status']
X_test = test.drop(columns=['DEALER ID', 'UID','APPLICATION LOGIN DATE','AADHAR VERIFIED','MOBILE VERIFICATION','Phone Social Premium.a23games','Phone Social Premium.my11',
       'Phone Social Premium.rummycircle','Phone Social Premium.yatra'])

In [483]:
# Split the data into training and validation sets (70-30 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


# **Training**

In [484]:
# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier()
}

In [487]:
# Train and evaluate classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(f"{name}:\n")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(classification_report(y_val, y_pred))
    print("-" * 40)



Logistic Regression:

Accuracy: 0.6650
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1995
           1       0.00      0.00      0.00      1005

    accuracy                           0.67      3000
   macro avg       0.33      0.50      0.40      3000
weighted avg       0.44      0.67      0.53      3000

----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM:

Accuracy: 0.6650
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1995
           1       0.00      0.00      0.00      1005

    accuracy                           0.67      3000
   macro avg       0.33      0.50      0.40      3000
weighted avg       0.44      0.67      0.53      3000

----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest:

Accuracy: 0.8857
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1995
           1       0.82      0.84      0.83      1005

    accuracy                           0.89      3000
   macro avg       0.87      0.87      0.87      3000
weighted avg       0.89      0.89      0.89      3000

----------------------------------------
KNN:

Accuracy: 0.6203
              precision    recall  f1-score   support

           0       0.68      0.82      0.74      1995
           1       0.39      0.23      0.29      1005

    accuracy                           0.62      3000
   macro avg       0.53      0.52      0.51      3000
weighted avg       0.58      0.62      0.59      3000

----------------------------------------


# **Random Forest classifier performs the best**

In [491]:
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100,        # Number of trees
    max_depth=None,          # Depth of each tree
    random_state=42,         # For reproducibility
    class_weight='balanced'  # Handles imbalanced classes
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict class labels
y_pred = rf_model.predict(X_val)

# **Testing with Test Data**

In [492]:
prediction=rf_model.predict(X_test)
result = pd.DataFrame({'UID': test['UID'], 'Prediction': prediction})

In [493]:
result['Prediction'] = result['Prediction'].apply(lambda x: 'APPROVED' if x == 0 else 'DECLINED')
result.to_csv('predictions.csv', index=False)