<a href="https://colab.research.google.com/github/aishvarya252/Loan_prediction/blob/main/Model_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

### 1. Load the Training Data ###
train_data = pd.read_csv('Assignment_Train.csv')

# Define UID column for test data
UID_col = 'UID'

# Drop irrelevant columns in training data
irrelevant_cols_train = ['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME', 'HDB BRANCH STATE',
                         'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile', 'Personal Email Address',
                         'Pan Name', 'name', 'vpa', 'upi_name']
train_data = train_data.drop(columns=irrelevant_cols_train, errors='ignore')

### 2. Data Preprocessing ###
# Handle missing values

# Numerical columns: fill missing values using the median
num_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
imputer_num = SimpleImputer(strategy='median')
train_data[num_cols] = imputer_num.fit_transform(train_data[num_cols])

# Categorical columns: fill missing values using the most frequent value
cat_cols = train_data.select_dtypes(include=['object']).columns
cat_cols = cat_cols.drop('Application Status', errors='ignore')  # Don't include 'Application Status' here
imputer_cat = SimpleImputer(strategy='most_frequent')
train_data[cat_cols] = imputer_cat.fit_transform(train_data[cat_cols])

# Encode the target variable 'Application Status' using LabelEncoder
label_encoder = LabelEncoder()
train_data['Application Status'] = label_encoder.fit_transform(train_data['Application Status'])

# One-Hot Encode categorical variables for consistency
train_data = pd.get_dummies(train_data, columns=cat_cols, drop_first=True)

### 3. Feature Selection ###
# Separate the features (X) and the target variable (y)
X = train_data.drop(columns=['Application Status'], errors='ignore')
y = train_data['Application Status']

# Handle class imbalance by adjusting class weights
class_weights = compute_class_weight('balanced', classes=[0, 1], y=y)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### 4. Model Training ###
# Train a RandomForestClassifier with class weights
rf_model = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)
rf_model.fit(X_train, y_train)

### 5. Model Evaluation on Validation Set ###
y_pred_val = rf_model.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))
print("Classification Report:\n", classification_report(y_val, y_pred_val))




Validation Accuracy: 0.8515
Confusion Matrix:
 [[1075  252]
 [  45  628]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.81      0.88      1327
           1       0.71      0.93      0.81       673

    accuracy                           0.85      2000
   macro avg       0.84      0.87      0.84      2000
weighted avg       0.88      0.85      0.86      2000



In [2]:
# Check feature importance
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_model.feature_importances_})
print(feature_importances.sort_values(by='Importance', ascending=False))

                                              Feature  Importance
5494                        EMPLOYER NAME_agriculture    0.119744
1                                    TOTAL ASSET COST    0.097068
2708                                  ASSET CTG_MCECA    0.061872
3007                            ADDRESS TYPE_Parental    0.032882
5                                                 AGE    0.026251
...                                               ...         ...
4350   EMPLOYER NAME_MOHD SHAHBAZ ( CLOTHES SUPPLYER)    0.000000
4351             EMPLOYER NAME_MOHD SHARIK FRUIT SHOP    0.000000
4354                 EMPLOYER NAME_MOIN GENERAL STORE    0.000000
4355                 EMPLOYER NAME_MOLLA GARMENT SHOP    0.000000
3356  EMPLOYER NAME_BRG GOYAL PRAVATE LIMITED COMPANY    0.000000

[6713 rows x 2 columns]


In [3]:

# Optional: Use Cross-Validation to validate performance
cross_val_scores = cross_val_score(rf_model, X, y, cv=5)
print("Cross-Validation Scores:", cross_val_scores)

Cross-Validation Scores: [0.8425 0.849  0.8475 0.8365 0.85  ]


In [4]:


### 6. Load and Preprocess the Test Data ###
test_data = pd.read_csv('Assignment_Test.csv')

# Save the UID column separately for the final output
test_UID = test_data[UID_col]

# Drop irrelevant columns from the test data
test_data = test_data.drop(columns=irrelevant_cols_train, errors='ignore')

# Apply the imputers to the test data
num_cols_test = num_cols.intersection(test_data.columns)  # Ensure numerical columns exist in test data
cat_cols_test = [col for col in cat_cols if col in test_data.columns]  # Ensure categorical columns exist in test data

# Fill missing values for numerical and categorical columns
test_data[num_cols_test] = imputer_num.transform(test_data[num_cols_test])
test_data[cat_cols_test] = imputer_cat.transform(test_data[cat_cols_test])

# One-Hot Encode categorical variables in test data to match training data
test_data = pd.get_dummies(test_data, columns=cat_cols_test, drop_first=True)

# Ensure test data has the same columns as training data
missing_cols_in_test = set(X.columns) - set(test_data.columns)
for col in missing_cols_in_test:
    test_data[col] = 0  # Add missing columns with 0 values

X_test = test_data[X.columns]  # Ensure column order matches training data



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_data[col] = 0  # Add missing columns with 0 values
  test_

In [6]:
### 7. Make Predictions on Test Data ###
test_predictions = rf_model.predict(X_test)

In [7]:
# Convert predictions back to original label names
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

### 8. Save the Predictions to CSV File ###
output = pd.DataFrame({UID_col: test_UID, 'Prediction': test_predictions_labels})
output.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'")

Predictions saved to 'predictions.csv'


In [8]:
print(output.head())

                   UID Prediction
0  1844045271814558464   APPROVED
1  1840349097823778816   APPROVED
2  1488102613362294272   APPROVED
3   555529923942874624   DECLINED
4  1010213070486150912   APPROVED
