In [42]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('Assignment_Train.csv')
test_df = pd.read_csv('Assignment_Test.csv')
feature_dict = pd.read_excel('Assignment_FeatureDictionary.xlsx')

# Display the first few rows of each dataset
print("Training Data:")
print(train_df.head())

print("\nTest Data:")
print(test_df.head())

print("\nFeature Dictionary:")
print(feature_dict.head())


Training Data:
   DEALER ID APPLICATION LOGIN DATE HDB BRANCH NAME HDB BRANCH STATE  \
0     106989             07/20/2022        DELHI-SF            DELHI   
1     108975             07/28/2022        PATNA-SF            BIHAR   
2     111004             07/15/2022   DARJEELING-SF      WEST BENGAL   
3     192020               07/04/22   SAHARANPUR-SF    UTTAR PRADESH   
4      55095             07/15/2022       MODASA-SF          GUJARAT   

  FIRST NAME MIDDLE NAME  LAST NAME      mobile AADHAR VERIFIED Cibil Score  \
0      SUNIL         NaN    CHANDER  9210574080              NO         726   
1      AMRIT         NaN      KUMAR  8877987018              NO         NaN   
2    ANIMESH         NaN      THAPA  8910862135              NO         737   
3     ADITYA         NaN      SINGH  9758428017              NO         713   
4     PARMAR  HARESHBHAI  AMRUTBHAI  9687028486              NO         669   

   ...  Phone Social Premium.shaadi Phone Social Premium.skype  \
0  ...     

Data Preprocessing
a. Handle Missing Values

In [43]:
# Identify numerical and categorical columns
numerical_cols = train_df.select_dtypes(include=['number']).columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# Check for missing values
print("Numerical Columns with Missing Values:")
print(train_df[numerical_cols].isnull().sum())

print("\nCategorical Columns with Missing Values:")
print(train_df[categorical_cols].isnull().sum())


Numerical Columns with Missing Values:
DEALER ID                               0
mobile                                  0
TOTAL ASSET COST                     5108
ASSET MODEL NO                          0
APPLIED AMOUNT                          0
DOB                                     0
AGE                                     0
Phone Social Premium.a23games        9999
Phone Social Premium.amazon          1916
Phone Social Premium.byjus           1948
Phone Social Premium.flipkart        1832
Phone Social Premium.housing         1776
Phone Social Premium.indiamart       1775
Phone Social Premium.instagram       6630
Phone Social Premium.isWABusiness    8427
Phone Social Premium.jeevansaathi    1829
Phone Social Premium.jiomart         9590
Phone Social Premium.microsoft       1872
Phone Social Premium.my11            9998
Phone Social Premium.paytm           1757
Phone Social Premium.rummycircle     9999
Phone Social Premium.shaadi          1779
Phone Social Premium.skype           

Handle Missing Values
B. Numerical Columns

In [48]:
# Fill missing values in numerical columns with the mean
train_df[numerical_cols] = train_df[numerical_cols].fillna(train_df[numerical_cols].mean())
test_df[numerical_cols] = test_df[numerical_cols].fillna(test_df[numerical_cols].mean())


b. Categorical Columns



In [50]:
import pandas as pd
import numpy as np

# Load the training dataset
train_df = pd.read_csv('Assignment_Train.csv')

# Assign column names based on the list provided
train_df.columns = [
    'DEALER_ID', 'APPLICATION_LOGIN_DATE', 'HDB_BRANCH_NAME', 'HDB_BRANCH_STATE', 'FIRST_NAME',
    'MIDDLE_NAME', 'LAST_NAME', 'MOBILE', 'AADHAR_VERIFIED', 'CIBIL_SCORE', 'MOBILE_VERIFICATION',
    'DEALER_NAME', 'TOTAL_ASSET_COST', 'ASSET_CTG', 'ASSET_MODEL_NO', 'APPLIED_AMOUNT',
    'PRIMARY_ASSET_MAKE', 'PRIMARY_ASSET_MODEL_NO', 'PERSONAL_EMAIL_ADDRESS', 'MARITAL_STATUS',
    'GENDER', 'DOB', 'AGE', 'ADDRESS_TYPE', 'EMPLOY_CONSTITUTION', 'EMPLOYER_NAME', 'EMPLOYER_TYPE',
    'PAN_NAME', 'NAME_UPI', 'VPA', 'UPI_NAME', 'PHONE_SOCIAL_PREMIUM_A23GAMES',
    'PHONE_SOCIAL_PREMIUM_AMAZON', 'PHONE_SOCIAL_PREMIUM_BYJUS', 'PHONE_SOCIAL_PREMIUM_FLIPKART',
    'PHONE_SOCIAL_PREMIUM_HOUSING', 'PHONE_SOCIAL_PREMIUM_INDIAMART', 'PHONE_SOCIAL_PREMIUM_INSTAGRAM',
    'PHONE_SOCIAL_PREMIUM_ISWABUSINESS', 'PHONE_SOCIAL_PREMIUM_JEEVANSATHI',
    'PHONE_SOCIAL_PREMIUM_JIOMART', 'PHONE_SOCIAL_PREMIUM_MICROSOFT', 'PHONE_SOCIAL_PREMIUM_MY11',
    'PHONE_SOCIAL_PREMIUM_PAYTM', 'PHONE_SOCIAL_PREMIUM_RUMMYCIRCLE', 'PHONE_SOCIAL_PREMIUM_SHAADI',
    'PHONE_SOCIAL_PREMIUM_SKYPE', 'PHONE_SOCIAL_PREMIUM_TOI', 'PHONE_SOCIAL_PREMIUM_WHATSAPP',
    'PHONE_SOCIAL_PREMIUM_YATRA', 'PHONE_SOCIAL_PREMIUM_ZOHO', 'PHONE_DIGITALAGE',
    'PHONE_NAMEMATCHSCORE', 'PHONE_PHONEFOOTPRINTSTRENGTHOVERALL', 'APPLICATION_STATUS'
]

# Check the first few rows of the dataframe to ensure columns are assigned correctly
print(train_df.head())

# Convert `APPLICATION_LOGIN_DATE` to datetime
train_df['APPLICATION_LOGIN_DATE'] = pd.to_datetime(train_df['APPLICATION_LOGIN_DATE'], errors='coerce')

# Convert `DOB` to datetime and calculate age if not available
train_df['DOB'] = pd.to_datetime(train_df['DOB'], errors='coerce')

# Fill missing `AGE` values based on `DOB`
train_df['AGE'] = train_df['AGE'].fillna(train_df['DOB'].apply(lambda dob: pd.Timestamp.now().year - dob.year if pd.notnull(dob) else np.nan))

# Fill missing categorical data
train_df['AADHAR_VERIFIED'].fillna('NO', inplace=True)
train_df['MOBILE_VERIFICATION'].fillna('FALSE', inplace=True)
train_df['GENDER'].fillna('Unknown', inplace=True)
train_df['MARITAL_STATUS'].fillna('Unknown', inplace=True)

# Convert 'CIBIL_SCORE' to numeric, forcing non-numeric values to NaN
train_df['CIBIL_SCORE'] = pd.to_numeric(train_df['CIBIL_SCORE'], errors='coerce')

# Now you can calculate the mean of the numeric values and fill NaN
train_df['CIBIL_SCORE'] = train_df['CIBIL_SCORE'].fillna(train_df['CIBIL_SCORE'].mean())

# Binary encoding for categorical variables
train_df['AADHAR_VERIFIED'] = train_df['AADHAR_VERIFIED'].apply(lambda x: 1 if x == 'YES' else 0)
train_df['MOBILE_VERIFICATION'] = train_df['MOBILE_VERIFICATION'].apply(lambda x: 1 if x == 'TRUE' else 0)

# Encoding categorical columns (example using `GENDER`)
train_df = pd.get_dummies(train_df, columns=['GENDER', 'MARITAL_STATUS', 'ADDRESS_TYPE', 'EMPLOY_CONSTITUTION', 'EMPLOYER_TYPE'])

# Feature engineering
# Create feature from `APPLICATION_LOGIN_DATE`
train_df['APPLICATION_YEAR'] = train_df['APPLICATION_LOGIN_DATE'].dt.year
train_df['APPLICATION_MONTH'] = train_df['APPLICATION_LOGIN_DATE'].dt.month

# Drop columns that are not relevant or have high cardinality (e.g., names, email addresses, phone numbers)
train_df = train_df.drop(['FIRST_NAME', 'MIDDLE_NAME', 'LAST_NAME', 'MOBILE', 'PERSONAL_EMAIL_ADDRESS', 'PAN_NAME', 'NAME_UPI', 'VPA'], axis=1)

# Check for remaining missing values
missing_values = train_df.isnull().sum()
print("Missing Values: \n", missing_values[missing_values > 0])

# Separate target variable
X_train = train_df.drop(['APPLICATION_STATUS'], axis=1)
y_train = train_df['APPLICATION_STATUS']

# Encode the target variable (APPROVED = 1, DECLINED = 0)
y_train = y_train.apply(lambda x: 1 if x == 'APPROVED' else 0)

# Now the data is ready for model training


   DEALER_ID APPLICATION_LOGIN_DATE HDB_BRANCH_NAME HDB_BRANCH_STATE  \
0     106989             07/20/2022        DELHI-SF            DELHI   
1     108975             07/28/2022        PATNA-SF            BIHAR   
2     111004             07/15/2022   DARJEELING-SF      WEST BENGAL   
3     192020               07/04/22   SAHARANPUR-SF    UTTAR PRADESH   
4      55095             07/15/2022       MODASA-SF          GUJARAT   

  FIRST_NAME MIDDLE_NAME  LAST_NAME      MOBILE AADHAR_VERIFIED CIBIL_SCORE  \
0      SUNIL         NaN    CHANDER  9210574080              NO         726   
1      AMRIT         NaN      KUMAR  8877987018              NO         NaN   
2    ANIMESH         NaN      THAPA  8910862135              NO         737   
3     ADITYA         NaN      SINGH  9758428017              NO         713   
4     PARMAR  HARESHBHAI  AMRUTBHAI  9687028486              NO         669   

   ...  PHONE_SOCIAL_PREMIUM_SHAADI PHONE_SOCIAL_PREMIUM_SKYPE  \
0  ...                    

In [73]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load data
train_data = pd.read_csv('Assignment_Train.csv')
test_data = pd.read_csv('Assignment_Test.csv')

# Save 'UID' before dropping columns
test_uid = test_data['UID'] if 'UID' in test_data.columns else None

# Drop unnecessary columns
drop_cols = ['UID', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'Personal Email Address', 'DEALER NAME', 'Pan Name', 'name', 'vpa', 'upi_name']
train_data = train_data.drop(drop_cols, axis=1, errors='ignore')
test_data = test_data.drop(drop_cols, axis=1, errors='ignore')

# Restore 'UID' after preprocessing
if test_uid is not None:
    test_data['UID'] = test_uid

# Convert 'APPLICATION LOGIN DATE' to datetime and extract features
for data in [train_data, test_data]:
    if 'APPLICATION LOGIN DATE' in data.columns:
        data['APPLICATION LOGIN DATE'] = pd.to_datetime(data['APPLICATION LOGIN DATE'], format='%m/%d/%Y', errors='coerce')
        data['Login_Year'] = data['APPLICATION LOGIN DATE'].dt.year
        data['Login_Month'] = data['APPLICATION LOGIN DATE'].dt.month
        data['Login_Day'] = data['APPLICATION LOGIN DATE'].dt.day
        data.drop('APPLICATION LOGIN DATE', axis=1, inplace=True)

# Separate features and target
X_train = train_data.drop('Application Status', axis=1, errors='ignore')
y_train = train_data['Application Status']
X_test = test_data

# Encode target variable
y_train = y_train.astype('category').cat.codes

# Identify categorical columns
categorical_columns = ['HDB BRANCH NAME', 'AADHAR VERIFIED', 'MOBILE VERIFICATION', 'MARITAL STATUS', 'GENDER', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER TYPE']

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Impute missing numeric values
            ('scaler', StandardScaler())  # Standardize numeric features
        ]), X_train.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encode categorical features
        ]), categorical_columns)
    ]
)

# Create and fit the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_train_pred = pipeline.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))

# Make predictions on the test data
predictions = pipeline.predict(X_test)

# Create a DataFrame with UID and Prediction
if 'UID' in test_data.columns:
    submission = pd.DataFrame({
        'UID': test_data['UID'],  # Ensure UID is present in the test dataset
        'Prediction': predictions
    })
else:
    # Handle case where 'UID' is not available
    submission = pd.DataFrame({
        'Prediction': predictions
    })

# Save the predictions to a CSV file
submission.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'")



Training Accuracy: 0.7595
Training Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      6677
           1       0.64      0.63      0.64      3323

    accuracy                           0.76     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.76      0.76      0.76     10000

Predictions saved to 'predictions.csv'


In [75]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Load data
train_data = pd.read_csv('Assignment_Train.csv')
test_data = pd.read_csv('Assignment_Test.csv')

# Save 'UID' before dropping columns
test_uid = test_data['UID'] if 'UID' in test_data.columns else None

# Drop unnecessary columns
drop_cols = ['UID', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'Personal Email Address', 'DEALER NAME', 'Pan Name', 'name', 'vpa', 'upi_name']
train_data = train_data.drop(drop_cols, axis=1, errors='ignore')
test_data = test_data.drop(drop_cols, axis=1, errors='ignore')

# Restore 'UID' after preprocessing
if test_uid is not None:
    test_data['UID'] = test_uid

# Convert 'APPLICATION LOGIN DATE' to datetime and extract features
for data in [train_data, test_data]:
    if 'APPLICATION LOGIN DATE' in data.columns:
        data['APPLICATION LOGIN DATE'] = pd.to_datetime(data['APPLICATION LOGIN DATE'], format='%m/%d/%Y', errors='coerce')
        data['Login_Year'] = data['APPLICATION LOGIN DATE'].dt.year
        data['Login_Month'] = data['APPLICATION LOGIN DATE'].dt.month
        data['Login_Day'] = data['APPLICATION LOGIN DATE'].dt.day
        data.drop('APPLICATION LOGIN DATE', axis=1, inplace=True)

# Separate features and target
X_train = train_data.drop('Application Status', axis=1, errors='ignore')
y_train = train_data['Application Status']
X_test = test_data

# Encode target variable
y_train = y_train.astype('category').cat.codes

# Identify categorical columns
categorical_columns = ['HDB BRANCH NAME', 'AADHAR VERIFIED', 'MOBILE VERIFICATION', 'MARITAL STATUS', 'GENDER', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER TYPE']

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Impute missing numeric values
            ('scaler', StandardScaler())  # Standardize numeric features
        ]), X_train.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encode categorical features
        ]), categorical_columns)
    ]
)

# Create a pipeline with a RandomForestClassifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Evaluate the best model
y_train_pred = best_model.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))

# Make predictions on the test data
predictions = best_model.predict(X_test)

# Create a DataFrame with UID and Prediction
if 'UID' in test_data.columns:
    submission = pd.DataFrame({
        'UID': test_data['UID'],  # Ensure UID is present in the test dataset
        'Prediction': predictions
    })
else:
    # Handle case where 'UID' is not available
    submission = pd.DataFrame({
        'Prediction': predictions
    })

# Save the predictions to a CSV file
submission.to_csv('predictions1.csv', index=False)

print("Predictions saved to 'predictions1.csv'")


Fitting 5 folds for each of 12 candidates, totalling 60 fits




Training Accuracy: 0.9981
Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6677
           1       1.00      0.99      1.00      3323

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Predictions saved to 'predictions1.csv'
Fitting 5 folds for each of 12 candidates, totalling 60 fits




Training Accuracy: 0.9981
Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6677
           1       1.00      0.99      1.00      3323

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Predictions saved to 'predictions1.csv'
