In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load datasets
train_data = pd.read_csv('Assignment_Train.csv')
test_data = pd.read_csv('Assignment_Test.csv')

In [3]:
# Drop unnecessary columns
drop_columns = ['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME', 'HDB BRANCH STATE',
                'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile', 'Personal Email Address',
                'Pan Name', 'name', 'vpa', 'upi_name', 'DEALER NAME', 'ASSET MODEL NO', 'PRIMARY ASSET MAKE', 'Primary Asset Model No']

In [4]:
train_data = train_data.drop(columns=drop_columns)
test_data = test_data.drop(columns=drop_columns)

In [5]:
# Separate numeric and categorical columns
numeric_columns = ['Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE', 'phone_digitalage']
categorical_columns = ['AADHAR VERIFIED', 'MOBILE VERIFICATION', 'MARITAL STATUS', 'GENDER']

In [6]:
# Ensure numeric columns are properly converted to numeric data types
for col in numeric_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

In [7]:
# Now fill missing values
train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())

In [8]:
# For categorical columns, fill with the mode
train_data[categorical_columns] = train_data[categorical_columns].fillna(train_data[categorical_columns].mode().iloc[0])
test_data[categorical_columns] = test_data[categorical_columns].fillna(test_data[categorical_columns].mode().iloc[0])

In [9]:
# Encode categorical columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    train_data[column] = label_encoder.fit_transform(train_data[column])
    test_data[column] = label_encoder.transform(test_data[column])

In [10]:
# Feature scaling
scaler = StandardScaler()
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])
test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])

In [11]:
# Split features and target
X = train_data.drop(columns=['Application Status'])
y = train_data['Application Status'].map({'APPROVED': 1, 'REJECTED': 0})

In [12]:
# Train/test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [14]:
# Apply one-hot encoding to categorical columns
train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns)

In [15]:
# Align the train and test datasets by keeping only common columns
X_train, X_test = train_data_encoded.align(test_data_encoded, join='inner', axis=1)

In [16]:
# Extract target variable (assuming 'Application Status' is the target)
y_train = train_data['Application Status']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Split the encoded training data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
# Define the column types
numerical_cols = ['Cibil Score', 'TOTAL ASSET COST', 'AGE', 'APPLIED AMOUNT']  # Example numerical columns
categorical_cols = ['AADHAR VERIFIED', 'MARITAL STATUS', 'GENDER', 'ADDRESS TYPE']  # Example categorical columns

In [24]:
# Create a preprocessor for numerical, categorical, and text data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical data
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # One-hot encode categorical data
    ]
)

In [25]:
# Create a pipeline to combine preprocessor and Logistic Regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [34]:
# Specify the actual required column names here
required_columns = ['AADHAR VERIFIED_0', 'GENDER_0', 'GENDER_1', 'TOTAL ASSET COST', 'Cibil Score', 'APPLIED AMOUNT', 'AGE']  # Add other necessary column names

In [35]:
# Check for missing columns
missing_columns = [col for col in required_columns if col not in X_train.columns]

In [36]:
# If any required columns are missing, print them
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("No missing columns")

No missing columns


In [44]:
import pandas as pd
df = pd.read_csv('Assignment_Train.csv')  
print(df.columns)

Index(['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME',
       'HDB BRANCH STATE', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile',
       'AADHAR VERIFIED', 'Cibil Score', 'MOBILE VERIFICATION', 'DEALER NAME',
       'TOTAL ASSET COST', 'ASSET CTG', 'ASSET MODEL NO', 'APPLIED AMOUNT',
       'PRIMARY ASSET MAKE', 'Primary Asset Model No',
       'Personal Email Address', 'MARITAL STATUS', 'GENDER', 'DOB', 'AGE',
       'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME', 'EMPLOYER TYPE',
       'Pan Name', 'name', 'vpa', 'upi_name', 'Phone Social Premium.a23games',
       'Phone Social Premium.amazon', 'Phone Social Premium.byjus',
       'Phone Social Premium.flipkart', 'Phone Social Premium.housing',
       'Phone Social Premium.indiamart', 'Phone Social Premium.instagram',
       'Phone Social Premium.isWABusiness',
       'Phone Social Premium.jeevansaathi', 'Phone Social Premium.jiomart',
       'Phone Social Premium.microsoft', 'Phone Social Premium.my11',
       

In [45]:
# Preview the first few rows of the DataFrame
print(df.head())

   DEALER ID APPLICATION LOGIN DATE HDB BRANCH NAME HDB BRANCH STATE  \
0     106989             07/20/2022        DELHI-SF            DELHI   
1     108975             07/28/2022        PATNA-SF            BIHAR   
2     111004             07/15/2022   DARJEELING-SF      WEST BENGAL   
3     192020               07/04/22   SAHARANPUR-SF    UTTAR PRADESH   
4      55095             07/15/2022       MODASA-SF          GUJARAT   

  FIRST NAME MIDDLE NAME  LAST NAME      mobile AADHAR VERIFIED Cibil Score  \
0      SUNIL         NaN    CHANDER  9210574080              NO         726   
1      AMRIT         NaN      KUMAR  8877987018              NO         NaN   
2    ANIMESH         NaN      THAPA  8910862135              NO         737   
3     ADITYA         NaN      SINGH  9758428017              NO         713   
4     PARMAR  HARESHBHAI  AMRUTBHAI  9687028486              NO         669   

   ...  Phone Social Premium.shaadi Phone Social Premium.skype  \
0  ...                    

In [46]:
# Example: Selecting columns with 'AADHAR VERIFIED'
import re

In [47]:
cols_to_use = [col for col in df.columns if re.search(r'AADHAR VERIFIED', col)]
print(cols_to_use)

['AADHAR VERIFIED']


In [48]:
# Clean column names
df.columns = df.columns.str.strip().str.lower()

In [49]:
# Then use the cleaned name
cols_to_use = [col for col in df.columns if 'aadhar verified' in col]

In [50]:
# Verify that all necessary columns are in the DataFrame
required_columns = ['AADHAR VERIFIED', 'Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")

Missing columns: ['AADHAR VERIFIED', 'Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE']


In [52]:
# Standardize column names to lowercase and remove extra spaces
df.columns = df.columns.str.lower().str.strip()

In [53]:
# Define the required columns in lowercase
required_columns = ['aadhar verified', 'cibil score', 'total asset cost', 'applied amount', 'age']

In [54]:
# Verify if the required columns are present
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All required columns are present.")

All required columns are present.


In [65]:
print(X_train.columns)

Index(['Cibil Score', 'TOTAL ASSET COST', 'ASSET CTG', 'APPLIED AMOUNT', 'DOB',
       'AGE', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME',
       'EMPLOYER TYPE', 'Phone Social Premium.a23games',
       'Phone Social Premium.amazon', 'Phone Social Premium.byjus',
       'Phone Social Premium.flipkart', 'Phone Social Premium.housing',
       'Phone Social Premium.indiamart', 'Phone Social Premium.instagram',
       'Phone Social Premium.isWABusiness',
       'Phone Social Premium.jeevansaathi', 'Phone Social Premium.jiomart',
       'Phone Social Premium.microsoft', 'Phone Social Premium.my11',
       'Phone Social Premium.paytm', 'Phone Social Premium.rummycircle',
       'Phone Social Premium.shaadi', 'Phone Social Premium.skype',
       'Phone Social Premium.toi', 'Phone Social Premium.whatsapp',
       'Phone Social Premium.yatra', 'Phone Social Premium.zoho',
       'phone_digitalage', 'phone_nameMatchScore',
       'phone_phoneFootprintStrengthOverall', 'AADHAR VERIFIED

In [66]:
X_train.columns = X_train.columns.str.strip()

In [67]:
print(set(X_train.columns) - set(X_test.columns))  # To see differences

set()


In [68]:
print('AADHAR VERIFIED' in X_train.columns)  # This should return True if the column exists

False


In [71]:
# Example of ColumnTransformer without the missing column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Adjust numerical_features to include existing columns
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # Adjust categorical_features similarly
    ]
)

In [77]:
numerical_features = ['Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE']  # Add columns that should be numerical
categorical_features = [col for col in X_train.columns if col not in numerical_features]  # Automatically detect categorical columns

In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [79]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [80]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  # or your chosen model

In [81]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())  # Replace with your model
])

In [82]:
# Fit the pipeline
pipeline.fit(X_train, y_train)

In [83]:
from sklearn.metrics import accuracy_score

In [84]:
# Train the pipeline
pipeline.fit(X_train, y_train)

In [85]:
# Predict on test data
y_pred = pipeline.predict(X_test)

In [146]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5330
