In [17]:
import pandas as pd
# Lift the column-width cap first so the long Description text renders untruncated.
pd.set_option('display.max_colwidth', None)

# Load the data dictionary and display it as the cell output.
data_descriptions = pd.read_csv('data_descriptions.csv')
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,LoanID,Identifier,string,A unique identifier for each loan.
1,Age,Feature,integer,The age of the borrower.
2,Income,Feature,integer,The annual income of the borrower.
3,LoanAmount,Feature,integer,The amount of money being borrowed.
4,CreditScore,Feature,integer,"The credit score of the borrower, indicating their creditworthiness."
5,MonthsEmployed,Feature,integer,The number of months the borrower has been employed.
6,NumCreditLines,Feature,integer,The number of credit lines the borrower has open.
7,InterestRate,Feature,float,The interest rate for the loan.
8,LoanTerm,Feature,integer,The term length of the loan in months.
9,DTIRatio,Feature,float,"The Debt-to-Income ratio, indicating the borrower's debt compared to their income."


In [1]:
# Import required packages

# Data packages
import pandas as pd
import numpy as np

# Machine Learning / Classification packages
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

# Visualization Packages
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [12]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Load the labelled training data, report its (rows, columns) shape,
# then preview the first five rows as the cell output.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
print('train_df Shape:', train_df.shape)
train_df.head(n=5)

train_df Shape: (255347, 18)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [28]:
# Load the test data, report its (rows, columns) shape,
# then preview the first five rows as the cell output.
test_df = pd.read_csv(filepath_or_buffer="test.csv")
print('test_df Shape:', test_df.shape)
test_df.head(n=5)

test_df Shape: (109435, 17)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,7RYZGMKJIR,32,131645,43797,802,23,2,6.1,24,0.13,High School,Full-time,Divorced,Yes,No,Other,No
1,JDL5RH07AM,61,134312,18402,369,87,2,12.99,60,0.59,High School,Self-employed,Single,No,No,Business,Yes
2,STAL716Y79,55,115809,151774,563,3,3,5.51,48,0.82,Bachelor's,Full-time,Single,Yes,Yes,Other,Yes
3,SO0KKJ3IQB,58,94970,55789,337,24,1,23.93,36,0.77,Bachelor's,Unemployed,Divorced,No,No,Business,No
4,T99CWTYDCP,63,71727,189798,451,52,3,22.05,48,0.44,PhD,Unemployed,Single,Yes,No,Auto,No


## Make predictions

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Train a RandomForest default classifier on train.csv.
#
# Leaves behind for later cells: `encoder` (fitted OneHotEncoder),
# `classifier` (fitted model), `columns_to_encode`, and the hold-out split
# (X_train, X_test, y_train, y_test).

# Read the dataset (re-read here so the cell does not depend on earlier cells)
train_df = pd.read_csv("train.csv")

# Preserve the 'LoanID' column before dropping it; it is an identifier, not a feature
loan_ids = train_df['LoanID']
train_df = train_df.drop(['LoanID'], axis=1)

# Categorical columns to one-hot encode
columns_to_encode = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner'
]

# Fit the encoder on the training data; drop='first' removes one level per
# column so the dummy columns are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = encoder.fit_transform(train_df[columns_to_encode])

# Column names of the form '<column>_<category>'
category_names = encoder.get_feature_names_out(input_features=columns_to_encode)

# Reuse train_df's index so the concat below lines up row-for-row; pd.concat
# with axis=1 aligns on index labels, not on position.
encoded_df = pd.DataFrame(encoded_columns, columns=category_names, index=train_df.index)

# Replace the raw categorical columns with their one-hot encoded versions
train_df = pd.concat([train_df.drop(columns=columns_to_encode), encoded_df], axis=1)

# Split the data into features (X) and the target variable (y)
X = train_df.drop('Default', axis=1)
y = train_df['Default']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the classifier (seeded for reproducibility)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Accuracy of the hard class predictions on the hold-out set
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# The submission consists of predicted probabilities, so also score the ranking
# quality of the probabilities themselves. Accuracy alone can look good on a
# target where one class dominates.
# Column 1 of predict_proba is the probability of the second entry of
# classifier.classes_ (Default == 1 for a 0/1 target).
y_proba = classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC: {roc_auc:.2f}")


Accuracy: 0.89


In [31]:
import pandas as pd

# Score test.csv with the already-fitted `encoder` and `classifier` and write
# the submission file. This cell requires the training cell to have been run
# first, because it reuses both fitted objects.

# Number of rows the submission is required to contain
EXPECTED_TEST_ROWS = 109435

# Load the test dataset
test_df = pd.read_csv("test.csv")

# Keep the identifiers for the submission, then drop them from the features
loan_ids_test = test_df['LoanID']
test_df = test_df.drop(['LoanID'], axis=1)

# Categorical columns to one-hot encode (same columns as in the training data)
columns_to_encode = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner'
]

# Transform only (no re-fit) so the dummy columns match the training layout
encoded_columns = encoder.transform(test_df[columns_to_encode])

# Column names of the form '<column>_<category>'
category_names = encoder.get_feature_names_out(input_features=columns_to_encode)

# Reuse test_df's index so the concat below lines up row-for-row; pd.concat
# with axis=1 aligns on index labels, not on position.
encoded_df = pd.DataFrame(encoded_columns, columns=category_names, index=test_df.index)

# Replace the raw categorical columns with their one-hot encoded versions
test_df = pd.concat([test_df.drop(columns=columns_to_encode), encoded_df], axis=1)

# Make sure the model sees exactly the columns it was fitted on, in the same
# order; fail loudly if the test file is missing any of them.
expected_features = list(classifier.feature_names_in_)
missing_features = [name for name in expected_features if name not in test_df.columns]
if missing_features:
    raise ValueError(f"Test data is missing feature columns: {missing_features}")
test_df = test_df[expected_features]

# Probability of the positive class (second entry of classifier.classes_)
predicted_probabilities = classifier.predict_proba(test_df)[:, 1]

# Create a DataFrame for predictions with 'LoanID' and 'predicted_probability' columns
prediction_df = pd.DataFrame({
    'LoanID': loan_ids_test,
    'predicted_probability': predicted_probabilities
})

# Ensure the 'prediction_df' DataFrame contains exactly the expected number of rows
if prediction_df.shape[0] != EXPECTED_TEST_ROWS:
    raise ValueError("The 'prediction_df' DataFrame does not contain the expected number of rows.")

# Save the 'prediction_df' DataFrame to a CSV file for submission
prediction_df.to_csv('predictions.csv', index=False)
print(prediction_df.shape)
prediction_df.head(10)


(109435, 2)


Unnamed: 0,LoanID,predicted_probability
0,7RYZGMKJIR,0.07
1,JDL5RH07AM,0.07
2,STAL716Y79,0.05
3,SO0KKJ3IQB,0.23
4,T99CWTYDCP,0.06
5,0SNHFWV4UP,0.12
6,S6ITP6LGYS,0.0
7,A6I7U12IRJ,0.09
8,8W6KY50JU4,0.12
9,THFQ08OLMU,0.11


In [None]:

# Combine the loan identifiers with the predicted default probabilities.
# 'LoanID' was dropped from test_df before scoring, so the identifiers come
# from the saved `loan_ids_test` Series; the probabilities are the
# `predicted_probabilities` array produced by classifier.predict_proba.
prediction_df = pd.DataFrame({'LoanID': loan_ids_test.to_numpy(),
                             'predicted_probability': predicted_probabilities})

In [None]:

# Sanity-check the submission frame: print its (rows, columns) shape,
# then show the first ten rows as the cell output.
print(prediction_df.shape)
prediction_df.iloc[:10]