In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load datasets
train_data = pd.read_csv('Assignment_Train.csv')
test_data = pd.read_csv('Assignment_Test.csv')

In [3]:
# Drop unnecessary columns
drop_columns = ['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME', 'HDB BRANCH STATE',
                'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile', 'Personal Email Address',
                'Pan Name', 'name', 'vpa', 'upi_name', 'DEALER NAME', 'ASSET MODEL NO', 'PRIMARY ASSET MAKE', 'Primary Asset Model No']

In [4]:
train_data = train_data.drop(columns=drop_columns)
test_data = test_data.drop(columns=drop_columns)

In [5]:
# Separate numeric and categorical columns
numeric_columns = ['Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE', 'phone_digitalage']
categorical_columns = ['AADHAR VERIFIED', 'MOBILE VERIFICATION', 'MARITAL STATUS', 'GENDER']

In [6]:
# Ensure numeric columns are properly converted to numeric data types
for col in numeric_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

In [7]:
# Fill missing values
train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())
train_data[categorical_columns] = train_data[categorical_columns].fillna(train_data[categorical_columns].mode().iloc[0])
test_data[categorical_columns] = test_data[categorical_columns].fillna(test_data[categorical_columns].mode().iloc[0])

In [8]:
# Encode categorical columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    train_data[column] = label_encoder.fit_transform(train_data[column])
    test_data[column] = label_encoder.transform(test_data[column])

In [9]:
# Define features and target
X = train_data.drop(columns=['Application Status'])
y = train_data['Application Status'].map({'APPROVED': 1, 'REJECTED': 0})

In [10]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Define the preprocessor
numerical_features = ['Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [13]:
# Create and train the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

In [14]:
# Train the model
pipeline.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [15]:
print(y_train.isnull().sum())

2650


In [17]:
# Ensure X_train and y_train are aligned
train_data = pd.concat([X_train, y_train], axis=1)
train_data = train_data.dropna(subset=['target_column_name'])  # Replace 'target_column_name' with your target column name

KeyError: ['target_column_name']

In [18]:
print(X_train.columns)  # Print columns of X_train
print(y_train.name)     # Print the name of y_train (if it's a Series)

Index(['AADHAR VERIFIED', 'Cibil Score', 'MOBILE VERIFICATION',
       'TOTAL ASSET COST', 'ASSET CTG', 'APPLIED AMOUNT', 'MARITAL STATUS',
       'GENDER', 'DOB', 'AGE', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION',
       'EMPLOYER NAME', 'EMPLOYER TYPE', 'Phone Social Premium.a23games',
       'Phone Social Premium.amazon', 'Phone Social Premium.byjus',
       'Phone Social Premium.flipkart', 'Phone Social Premium.housing',
       'Phone Social Premium.indiamart', 'Phone Social Premium.instagram',
       'Phone Social Premium.isWABusiness',
       'Phone Social Premium.jeevansaathi', 'Phone Social Premium.jiomart',
       'Phone Social Premium.microsoft', 'Phone Social Premium.my11',
       'Phone Social Premium.paytm', 'Phone Social Premium.rummycircle',
       'Phone Social Premium.shaadi', 'Phone Social Premium.skype',
       'Phone Social Premium.toi', 'Phone Social Premium.whatsapp',
       'Phone Social Premium.yatra', 'Phone Social Premium.zoho',
       'phone_digitalage', 'phone_na

In [22]:
# Combine X_train and y_train into a single DataFrame
train_data = pd.concat([X_train, y_train], axis=1)

In [20]:
mode_value = y_train.mode()[0]
y_train.fillna(mode_value, inplace=True)

In [21]:
pipeline.fit(X_train, y_train)

In [24]:
# Predict on validation data
y_val_pred = pipeline.predict(X_val)

In [26]:
print(y_val.isna().sum())  # Check for missing values in y_val

673


In [28]:
y_val.fillna(y_val.mode()[0], inplace=True)  # Filling NaN with the most common value

In [29]:
from sklearn.metrics import accuracy_score, classification_report

In [37]:
# Truncate the longer array to match the shorter one
min_length = min(len(y_val), len(y_val_pred))

In [38]:
# Adjust both y_val and y_val_pred to have the same length
y_val = y_val[:min_length]
y_val_pred = y_val_pred[:min_length]

In [39]:
# Now calculate the accuracy
from sklearn.metrics import accuracy_score

In [40]:
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 1.0000


In [41]:
# Check a few values of y_val and y_val_pred to make sure they're different
print("True Labels:", y_val[:10])
print("Predicted Labels:", y_val_pred[:10])

True Labels: 4684    1.0
4742    1.0
6340    1.0
5202    1.0
6363    1.0
439     1.0
7487    1.0
5653    1.0
3999    1.0
6033    1.0
Name: Application Status, dtype: float64
Predicted Labels: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [42]:
print(y_val.value_counts())

Application Status
1.0    1327
Name: count, dtype: int64


In [43]:
print("Training Set Class Distribution:")
print(y_train.value_counts())

Training Set Class Distribution:
Application Status
1.0    8000
Name: count, dtype: int64


In [44]:
print("Validation Set Class Distribution:")
print(y_val.value_counts())

Validation Set Class Distribution:
Application Status
1.0    1327
Name: count, dtype: int64


In [45]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Prepare predictions on test data
test_predictions = pipeline.predict(test_data)
test_data['Application Status'] = test_predictions
test_data[['UID', 'Application Status']].to_csv('predictions.csv', index=False)

In [49]:
print("Size of y_val:", len(y_val))
print("Size of y_val_pred:", len(y_val_pred))

Size of y_val: 2000
Size of y_val_pred: 1327
