In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the training data
train_df = pd.read_csv('Assignment_Train.csv')

# Prepare the features and target
features = ['AADHAR VERIFIED', 'Cibil Score', 'MOBILE VERIFICATION', 'TOTAL ASSET COST',
            'APPLIED AMOUNT', 'AGE', 'GENDER', 'MARITAL STATUS', 'ADDRESS TYPE',
            'EMPLOY CONSTITUTION', 'EMPLOYER TYPE', 'phone_digitalage',
            'phone_nameMatchScore', 'phone_phoneFootprintStrengthOverall']

X = train_df[features]
y = train_df['Application Status']

# Separate numeric and categorical columns
numeric_cols = ['Cibil Score', 'TOTAL ASSET COST', 'APPLIED AMOUNT', 'AGE',
                'phone_digitalage', 'phone_nameMatchScore', 'phone_phoneFootprintStrengthOverall']
categorical_cols = ['AADHAR VERIFIED', 'MOBILE VERIFICATION', 'GENDER', 'MARITAL STATUS',
                    'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER TYPE']

# Convert non-numeric values in numeric columns to NaN (e.g., strings like '-')
for col in numeric_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Remove columns that have only missing values
X = X.dropna(axis=1, how='all')

# Update numeric_cols to reflect columns that remain after dropping
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Handle missing values for numeric columns
numeric_imputer = SimpleImputer(strategy='median')
X[numeric_cols] = pd.DataFrame(numeric_imputer.fit_transform(X[numeric_cols]), columns=numeric_cols)

# Handle missing values for categorical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_cols] = pd.DataFrame(categorical_imputer.fit_transform(X[categorical_cols]), columns=categorical_cols)

# Drop rows with missing values in both X and y
missing_indices = X[X.isnull().any(axis=1)].index
X = X.drop(missing_indices)
y = y.drop(missing_indices)

# Encode categorical variables
le_dict = {}  # Dictionary to store the LabelEncoders for each column
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le  # Save the encoder for the test set

# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the training data
train_predictions = rf_model.predict(X)

# Calculate accuracy
accuracy = accuracy_score(y, train_predictions)
print(f"Training Accuracy: {accuracy:.2f}")

# Generate the confusion matrix
conf_matrix = confusion_matrix(y, train_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Detailed classification report (Precision, Recall, F1-Score)
class_report = classification_report(y, train_predictions)
print("\nClassification Report:")
print(class_report)


# Load the test data
test_df = pd.read_csv('Assignment_Test.csv')

# Prepare the test features
X_test = test_df[features]

# Convert non-numeric values in numeric columns to NaN for test data (e.g., strings like '-')
for col in numeric_cols:
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

# Remove columns that have only missing values in the test set as well
X_test = X_test.dropna(axis=1, how='all')

# Synchronize columns between training and test data
X_test = X_test[X.columns]

# Handle missing values in test data
X_test[numeric_cols] = pd.DataFrame(numeric_imputer.transform(X_test[numeric_cols]), columns=numeric_cols)
X_test[categorical_cols] = pd.DataFrame(categorical_imputer.transform(X_test[categorical_cols]), columns=categorical_cols)

# Encode categorical variables in test data
for col in categorical_cols:
    # Handle unseen labels by assigning -1 or a fallback value for new categories
    X_test[col] = X_test[col].map(lambda s: le_dict[col].transform([s])[0] if s in le_dict[col].classes_ else -1)

# Make predictions
predictions = rf_model.predict(X_test)

# Create the predictions DataFrame
predictions_df = pd.DataFrame({
    'UID': test_df['UID'],
    'Prediction': predictions
})

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

print("Predictions have been saved to 'predictions.csv'")

# Print some information about the predictions
print("\nPrediction Summary:")
print(predictions_df['Prediction'].value_counts(normalize=True))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.to_numeric(X[col], errors='coerce')


Training Accuracy: 1.00

Confusion Matrix:
[[6671    6]
 [   3 3320]]

Classification Report:
              precision    recall  f1-score   support

    APPROVED       1.00      1.00      1.00      6677
    DECLINED       1.00      1.00      1.00      3323

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = pd.to_numeric(X_test[col], errors='coerce')


Predictions have been saved to 'predictions.csv'

Prediction Summary:
Prediction
APPROVED    0.681
DECLINED    0.319
Name: proportion, dtype: float64
