In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [32]:
# Loading the dataset

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [33]:
# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business

In [34]:
# Drop unnecessary columns

train_df = train_df.drop(columns=['LoanID'])
test_df = test_df.drop(columns=['LoanID'])
train_df = train_df.drop(columns=['LoanTerm'])
test_df = test_df.drop(columns=['LoanTerm'])
train_df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,18,137576,209136,846,26,2,10.47,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,47,57194,5970,748,30,2,19.72,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,26,84328,95065,453,7,2,24.25,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,53,49795,229582,533,107,3,14.44,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,49,115450,22072,840,0,4,24.48,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes,0
204273,67,62958,189499,460,77,3,9.29,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes,0
204274,62,34372,59645,524,94,3,9.72,0.24,PhD,Full-time,Single,Yes,No,Auto,No,0
204275,44,146262,198454,489,7,4,4.31,0.30,High School,Self-employed,Married,Yes,No,Home,No,0


In [35]:
# Encode categorical columns using LabelEncoder

categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [36]:
# Handle missing values by filling with the mean for numerical columns

imputer = SimpleImputer(strategy='mean')
# train_features = train_df.drop(columns=['Default'])  # Separate target variable
train_df.drop(columns=['Default'])
# train_target = train_df['Default']
# train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [29]:
# Scale numerical features to ensure each feature has a mean of 0 and variance of 1

# scaler = StandardScaler()
# train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
# test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
train_df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,18,137576,209136,846,26,2,10.47,0.81,1,2,2,1,0,1,0,0
1,47,57194,5970,748,30,2,19.72,0.73,1,3,0,0,1,2,0,0
2,26,84328,95065,453,7,2,24.25,0.45,2,2,1,0,0,4,1,0
3,53,49795,229582,533,107,3,14.44,0.17,0,2,2,1,0,0,1,1
4,49,115450,22072,840,0,4,24.48,0.11,0,1,2,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,0.87,0,1,0,0,0,3,1,0
204273,67,62958,189499,460,77,3,9.29,0.11,0,2,2,0,0,1,1,0
204274,62,34372,59645,524,94,3,9.72,0.24,3,0,2,1,0,0,0,0
204275,44,146262,198454,489,7,4,4.31,0.30,1,2,1,1,0,3,0,0


In [26]:
# Step 5: Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, test_size=0.2, random_state=42)

print("\nTraining and Validation Split:")
print("X_train:\n", X_train.head())
print("y_train:\n", y_train.head())
print("X_val:\n", X_val.head())
print("y_val:\n", y_val.head())


Training and Validation Split:
X_train:
              Age    Income  LoanAmount  CreditScore  MonthsEmployed  \
137918  0.100743 -0.245898   -1.471959    -0.365538        0.360552   
32623  -0.166010 -0.438212   -1.574662    -1.548846        0.620325   
20894  -0.632827  1.657855   -0.763257     0.207233        0.216233   
116646 -1.233021  0.387343    0.536470    -0.535481        1.601694   
3325   -1.433086  1.563688    0.980744    -0.944604       -1.284684   

        NumCreditLines  InterestRate  DTIRatio  Education  EmploymentType  \
137918       -1.344869      1.663921  1.253376  -0.442742        1.343560   
32623         1.341148      0.830593  0.820313  -0.442742        1.343560   
20894        -1.344869     -0.796883  0.690394   0.450557       -1.342074   
116646        1.341148      1.150061 -1.258390   0.450557       -1.342074   
3325          0.445809     -0.097671  0.473862   0.450557        0.448348   

        MaritalStatus  HasMortgage  HasDependents  LoanPurpose  HasC

In [27]:
# Initialize and train the Random Forest model

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model on the validation set

y_val_pred = rf_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))

# Make predictions on the test set

test_predictions = rf_model.predict(test_df)

# Prepare the submission file
# Ensure we include 'LoanID' to match with sample_submission.csv structure.
# Assuming test_df had the LoanID column originally, we’ll add it back here for the submission.
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions  # Predicted defaults
})

# Save the predictions to a CSV file

submission_df.to_csv('random_forest_predictions.csv', index=False)


Validation Accuracy: 0.8848639122772665


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocessing: Drop unnecessary columns
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])
train_target = train_df['Default']
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split data into train and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target
)

# Set up a pipeline with PolynomialFeatures and LinearRegression
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures()),
    ('linear_regression', LinearRegression())
])

# Define parameter grid
param_grid = {
    'poly_features__degree': [1, 2, 3],  # Test different polynomial degrees
    'linear_regression__fit_intercept': [True, False]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model from grid search and predict on validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Best Parameters:", grid_search.best_params_)
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

# Optionally, round predictions for binary classification and calculate accuracy
y_val_pred_rounded = np.round(y_val_pred)
accuracy = accuracy_score(y_val, y_val_pred_rounded)
print("Accuracy on validation set:", accuracy)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END linear_regression__fit_intercept=True, poly_features__degree=2; total time=   4.5s
[CV] END linear_regression__fit_intercept=True, poly_features__degree=3; total time=10.1min
[CV] END linear_regression__fit_intercept=False, poly_features__degree=3; total time=  34.4s
[CV] END linear_regression__fit_intercept=True, poly_features__degree=1; total time=   0.1s
[CV] END linear_regression__fit_intercept=True, poly_features__degree=2; total time=   5.2s
[CV] END linear_regression__fit_intercept=False, poly_features__degree=1; total time=   0.1s
[CV] END linear_regression__fit_intercept=False, poly_features__degree=2; total time=   3.6s
[CV] END linear_regression__fit_intercept=False, poly_features__degree=3; total time=10.1min
[CV] END linear_regression__fit_intercept=False, poly_features__degree=3; total time=  35.2s


KeyboardInterrupt: 

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split the training data into train and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=42
)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = rf_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))

# Make predictions on the test set
test_predictions = rf_model.predict(test_df)

# Prepare the submission file
# Ensure we include 'LoanID' to match with sample_submission.csv structure.
# Assuming test_df had the LoanID column originally, we’ll add it back here for the submission.
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions  # Predicted defaults
})

# Save the predictions to a CSV file
submission_df.to_csv('random_forest_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, accuracy_score

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split the training data into train and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=42
)

# Set up a pipeline with PolynomialFeatures and LinearRegression
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures()),
    ('linear_regression', LinearRegression())
])

# Define parameter grid to tune the polynomial degree and linear regression settings
param_grid = {
    'poly_features__degree': [1, 2, 3],  # Test different polynomial degrees
    'linear_regression__fit_intercept': [True, False]
}

# Set up GridSearchCV to search for the best polynomial degree and fit_intercept setting
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model from grid search and predict on validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Best Parameters:", grid_search.best_params_)
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

# Optionally, round predictions for binary classification and calculate accuracy
y_val_pred_rounded = np.round(y_val_pred)
accuracy = accuracy_score(y_val, y_val_pred_rounded)
print("Accuracy on validation set:", accuracy)

# Make predictions on the test set
test_predictions = best_model.predict(test_df)

# Prepare the submission file
# Ensure we include 'LoanID' to match with sample_submission.csv structure.
# Assuming test_df had the LoanID column originally, we’ll add it back here for the submission.
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions  # Predicted defaults
})

# Save the predictions to a CSV file
submission_df.to_csv('polynomial_regression_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business

KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split the training data into train and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=202
)

# Choose polynomial degree (e.g., 2 or 3)
poly_degree = 3

# Apply PolynomialFeatures
poly = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train_poly, y_train)

# Evaluate the model on the validation set
y_val_pred = lr_model.predict(X_val_poly)

# Calculate RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

# Optionally, round predictions for binary classification and calculate accuracy
y_val_pred_rounded = np.round(y_val_pred)
accuracy = accuracy_score(y_val, y_val_pred_rounded)
print("Accuracy on validation set:", accuracy)

# Make predictions on the test set using the best model
X_test_poly = poly.transform(test_df)  # Apply polynomial transformation to test set
test_predictions = lr_model.predict(X_test_poly)

# Round predictions to 0 or 1 for binary classification
test_predictions_rounded = np.round(test_predictions)

# Prepare the submission file
# Ensure we include 'LoanID' to match with sample_submission.csv structure.
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions_rounded  # Predicted defaults rounded to 0 or 1
})

# Save the predictions to a CSV file
submission_df.to_csv('polynomial_regression_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business



Root Mean Squared Error (RMSE) on validation set: 0.3031700479516249
Accuracy on validation set: 0.8856226747601331


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Choose polynomial degree (e.g., 2 or 3)
poly_degree = 3
poly = PolynomialFeatures(degree=poly_degree)

# Initialize variables to track the best model
best_accuracy = 0
best_random_state = None

# Loop through different random_state values
for state in [42, 86, 101, 202, 303, 404]:
    # Split the training data into train and validation sets with stratification
    X_train, X_val, y_train, y_val = train_test_split(
        train_features, train_target, test_size=0.2, stratify=train_target, random_state=state
    )

    # Apply PolynomialFeatures
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # Initialize and train the Linear Regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train_poly, y_train)

    # Evaluate the model on the validation set
    y_val_pred = lr_model.predict(X_val_poly)

    # Calculate RMSE (Root Mean Squared Error)
    rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"RMSE for random_state={state}: {rmse}")

    # Round predictions for binary classification and calculate accuracy
    y_val_pred_rounded = np.round(y_val_pred)
    accuracy = accuracy_score(y_val, y_val_pred_rounded)
    print(f"Accuracy for random_state={state}: {accuracy}")

    # Track the best accuracy and corresponding random_state
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_random_state = state

# Output the best accuracy and corresponding random_state
print(f"\nBest Accuracy: {best_accuracy} with random_state={best_random_state}")

# Final model using the best random_state
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=best_random_state
)

# Apply PolynomialFeatures to final train and validation data
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

# Train the model
lr_model = LinearRegression()
lr_model.fit(X_train_poly, y_train)

# Make predictions on the test set
X_test_poly = poly.transform(test_df)
test_predictions = lr_model.predict(X_test_poly)

# Round predictions to 0 or 1 for binary classification
test_predictions_rounded = np.round(test_predictions)

# Prepare the submission file
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions_rounded  # Predicted defaults rounded to 0 or 1
})

# Save the predictions to a CSV file
submission_df.to_csv('polynomial_regression_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business



RMSE for random_state=42: 0.30286117957401065
Accuracy for random_state=42: 0.8856716271783827




RMSE for random_state=86: 0.3045662815704034
Accuracy for random_state=86: 0.8854268650871353




RMSE for random_state=101: 0.3032215155691703
Accuracy for random_state=101: 0.886185627570002




RMSE for random_state=202: 0.3031700479516249
Accuracy for random_state=202: 0.8856226747601331




RMSE for random_state=303: 0.3028660730931155
Accuracy for random_state=303: 0.886724104170746




RMSE for random_state=404: 0.30393243931376557
Accuracy for random_state=404: 0.8861121989426277

Best Accuracy: 0.886724104170746 with random_state=303


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Choose polynomial degree (e.g., 2 or 3)
poly_degree = 3
poly = PolynomialFeatures(degree=poly_degree)

# Initialize variables to track the best model
best_accuracy = 0
best_random_state = None

# Loop through different random_state values in the range [1, 100]
for state in range(1, 101):  # random_state from 1 to 100
    # Split the training data into train and validation sets with stratification
    X_train, X_val, y_train, y_val = train_test_split(
        train_features, train_target, test_size=0.2, stratify=train_target, random_state=state
    )

    # Apply PolynomialFeatures
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # Initialize and train the Linear Regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train_poly, y_train)

    # Evaluate the model on the validation set
    y_val_pred = lr_model.predict(X_val_poly)

    # Calculate RMSE (Root Mean Squared Error)
    rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"RMSE for random_state={state}: {rmse}")

    # Round predictions for binary classification and calculate accuracy
    y_val_pred_rounded = np.round(y_val_pred)
    accuracy = accuracy_score(y_val, y_val_pred_rounded)
    print(f"Accuracy for random_state={state}: {accuracy}")

    # Track the best accuracy and corresponding random_state
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_random_state = state

# Output the best accuracy and corresponding random_state
print(f"\nBest Accuracy: {best_accuracy} with random_state={best_random_state}")

# Final model using the best random_state
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=best_random_state
)

# Apply PolynomialFeatures to final train and validation data
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

# Train the model
lr_model = LinearRegression()
lr_model.fit(X_train_poly, y_train)

# Make predictions on the test set
X_test_poly = poly.transform(test_df)
test_predictions = lr_model.predict(X_test_poly)

# Round predictions to 0 or 1 for binary classification
test_predictions_rounded = np.round(test_predictions)

# Prepare the submission file
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions_rounded  # Predicted defaults rounded to 0 or 1
})

# Save the predictions to a CSV file
submission_df.to_csv('polynomial_regression_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business



RMSE for random_state=1: 0.30381115639154344
Accuracy for random_state=1: 0.8861611513608773




RMSE for random_state=2: 0.30260364290777186
Accuracy for random_state=2: 0.8864059134521245




RMSE for random_state=3: 0.30360406501501197
Accuracy for random_state=3: 0.8860387703152536




RMSE for random_state=4: 0.3032437651623321
Accuracy for random_state=4: 0.8862101037791267




RMSE for random_state=5: 0.30271371620992943
Accuracy for random_state=5: 0.8864793420794987




RMSE for random_state=6: 0.3037566587881436
Accuracy for random_state=6: 0.8860142941061289




RMSE for random_state=7: 0.3031646883937854
Accuracy for random_state=7: 0.8857450558057568




RMSE for random_state=8: 0.30356643848641884
Accuracy for random_state=8: 0.8855737223418837




RMSE for random_state=9: 0.30342434385823186
Accuracy for random_state=9: 0.8865038182886235




RMSE for random_state=10: 0.30257912198755216
Accuracy for random_state=10: 0.8859163892696299




RMSE for random_state=11: 0.3036885096462807
Accuracy for random_state=11: 0.885182102995888




RMSE for random_state=12: 0.30353862511933627
Accuracy for random_state=12: 0.885549246132759




RMSE for random_state=13: 0.30364288597752
Accuracy for random_state=13: 0.8858919130605052




RMSE for random_state=14: 0.30268085967195674
Accuracy for random_state=14: 0.8857205795966321




RMSE for random_state=15: 0.3030577483162537
Accuracy for random_state=15: 0.8862101037791267




RMSE for random_state=16: 0.3051226499453516
Accuracy for random_state=16: 0.8849618171137654




RMSE for random_state=17: 0.3028439654183168
Accuracy for random_state=17: 0.8866506755433718




RMSE for random_state=18: 0.3025913599883755
Accuracy for random_state=18: 0.886185627570002




RMSE for random_state=19: 0.30314371086187614
Accuracy for random_state=19: 0.8854268650871353




RMSE for random_state=20: 0.302872901093512
Accuracy for random_state=20: 0.8857940082240062




RMSE for random_state=21: 0.30300724218569614
Accuracy for random_state=21: 0.885818484433131




RMSE for random_state=22: 0.30381332200042066
Accuracy for random_state=22: 0.8859653416878793




RMSE for random_state=23: 0.3024468210141621
Accuracy for random_state=23: 0.8862101037791267




RMSE for random_state=24: 0.3034605138440652
Accuracy for random_state=24: 0.8861611513608773




RMSE for random_state=25: 0.3026547493527821
Accuracy for random_state=25: 0.886185627570002




RMSE for random_state=26: 0.30408294917374745
Accuracy for random_state=26: 0.8854268650871353




RMSE for random_state=27: 0.30365990807228765
Accuracy for random_state=27: 0.8859163892696299




RMSE for random_state=28: 0.30259488078525404
Accuracy for random_state=28: 0.8851331505776385




RMSE for random_state=29: 0.3027932667039123
Accuracy for random_state=29: 0.8859653416878793




RMSE for random_state=30: 0.30408507681554203
Accuracy for random_state=30: 0.885182102995888




RMSE for random_state=31: 0.3034820987140187
Accuracy for random_state=31: 0.8858919130605052




RMSE for random_state=32: 0.3032169170820118
Accuracy for random_state=32: 0.8861611513608773




RMSE for random_state=33: 0.3017371016342896
Accuracy for random_state=33: 0.886185627570002




RMSE for random_state=34: 0.30459658810154977
Accuracy for random_state=34: 0.8850597219502644




RMSE for random_state=35: 0.3055572834955118
Accuracy for random_state=35: 0.8852310554141375




RMSE for random_state=36: 0.3028195633166479
Accuracy for random_state=36: 0.8861611513608773




RMSE for random_state=37: 0.3039683778014902
Accuracy for random_state=37: 0.8852555316232622




RMSE for random_state=38: 0.3023203450471374
Accuracy for random_state=38: 0.8855737223418837




RMSE for random_state=39: 0.3025915365509458
Accuracy for random_state=39: 0.8855002937145094




RMSE for random_state=40: 0.30390613868114796
Accuracy for random_state=40: 0.8857205795966321




RMSE for random_state=41: 0.3030391236477725
Accuracy for random_state=41: 0.8857695320148815




RMSE for random_state=42: 0.30286117957401065
Accuracy for random_state=42: 0.8856716271783827




RMSE for random_state=43: 0.30293614743859437
Accuracy for random_state=43: 0.8864793420794987




RMSE for random_state=44: 0.30358513858336666
Accuracy for random_state=44: 0.8853779126688859




RMSE for random_state=45: 0.3038717998702496
Accuracy for random_state=45: 0.886087722733503




RMSE for random_state=46: 0.30391269732915877
Accuracy for random_state=46: 0.884912864695516




RMSE for random_state=47: 0.30297275284115077
Accuracy for random_state=47: 0.8865772469159976




RMSE for random_state=48: 0.3038828195230317
Accuracy for random_state=48: 0.8859163892696299




RMSE for random_state=49: 0.30325835808941226
Accuracy for random_state=49: 0.8862101037791267




RMSE for random_state=50: 0.3022139241412492
Accuracy for random_state=50: 0.8866751517524966




RMSE for random_state=51: 0.30424167431382776
Accuracy for random_state=51: 0.884912864695516




RMSE for random_state=52: 0.30434461868006846
Accuracy for random_state=52: 0.8857695320148815




RMSE for random_state=53: 0.30326159688224397
Accuracy for random_state=53: 0.8859898178970042




RMSE for random_state=54: 0.3036137927335713
Accuracy for random_state=54: 0.8854268650871353




RMSE for random_state=55: 0.30314474166260585
Accuracy for random_state=55: 0.8868954376346192




RMSE for random_state=56: 0.3037941187508204
Accuracy for random_state=56: 0.8854023888780106




RMSE for random_state=57: 0.30315617521711513
Accuracy for random_state=57: 0.8859163892696299




RMSE for random_state=58: 0.3027487924610622
Accuracy for random_state=58: 0.8857695320148815




RMSE for random_state=59: 0.30359318247950057
Accuracy for random_state=59: 0.8860142941061289




RMSE for random_state=60: 0.3031235517965626
Accuracy for random_state=60: 0.8858919130605052




RMSE for random_state=61: 0.30449397732117806
Accuracy for random_state=61: 0.8854023888780106




RMSE for random_state=62: 0.3023435106337982
Accuracy for random_state=62: 0.8859163892696299




RMSE for random_state=63: 0.3031561761086677
Accuracy for random_state=63: 0.8860387703152536




RMSE for random_state=64: 0.30304936463112614
Accuracy for random_state=64: 0.8862835324065009




RMSE for random_state=65: 0.3032047534143067
Accuracy for random_state=65: 0.8856226747601331




RMSE for random_state=66: 0.30221381276865383
Accuracy for random_state=66: 0.8862590561973761




RMSE for random_state=67: 0.30272529280328786
Accuracy for random_state=67: 0.8862101037791267




RMSE for random_state=68: 0.30312179064983297
Accuracy for random_state=68: 0.88545134129626




RMSE for random_state=69: 0.3034069639442309
Accuracy for random_state=69: 0.885549246132759




RMSE for random_state=70: 0.30210878610582975
Accuracy for random_state=70: 0.886185627570002




RMSE for random_state=71: 0.302037218830459
Accuracy for random_state=71: 0.8863324848247504




RMSE for random_state=72: 0.30270356086154104
Accuracy for random_state=72: 0.8861611513608773




RMSE for random_state=73: 0.3028614905288666
Accuracy for random_state=73: 0.8859408654787546




RMSE for random_state=74: 0.3039137119717592
Accuracy for random_state=74: 0.8853044840415116




RMSE for random_state=75: 0.3026493185515297
Accuracy for random_state=75: 0.8863324848247504




RMSE for random_state=76: 0.30370296620247006
Accuracy for random_state=76: 0.8856471509692578




RMSE for random_state=77: 0.3024736953601648
Accuracy for random_state=77: 0.8857205795966321




RMSE for random_state=78: 0.30344566820354113
Accuracy for random_state=78: 0.8857695320148815




RMSE for random_state=79: 0.3025219416317353
Accuracy for random_state=79: 0.8860387703152536




RMSE for random_state=80: 0.30211909759421524
Accuracy for random_state=80: 0.8854023888780106




RMSE for random_state=81: 0.30274159846097703
Accuracy for random_state=81: 0.8863080086156256




RMSE for random_state=82: 0.302427516562595
Accuracy for random_state=82: 0.8864793420794987




RMSE for random_state=83: 0.3041869886081976
Accuracy for random_state=83: 0.884814959859017




RMSE for random_state=84: 0.30350897706828533
Accuracy for random_state=84: 0.885182102995888




RMSE for random_state=85: 0.3044224911461305
Accuracy for random_state=85: 0.8856961033875074




RMSE for random_state=86: 0.3045662815704034
Accuracy for random_state=86: 0.8854268650871353




RMSE for random_state=87: 0.30326528330488944
Accuracy for random_state=87: 0.8864303896612493




RMSE for random_state=88: 0.3038945663136781
Accuracy for random_state=88: 0.8855002937145094




RMSE for random_state=89: 0.3026608627879154
Accuracy for random_state=89: 0.8857695320148815




RMSE for random_state=90: 0.30247755790598657
Accuracy for random_state=90: 0.8863324848247504




RMSE for random_state=91: 0.3020652236795091
Accuracy for random_state=91: 0.886993342471118




RMSE for random_state=92: 0.30328993440377605
Accuracy for random_state=92: 0.8857450558057568




RMSE for random_state=93: 0.30286747740255393
Accuracy for random_state=93: 0.8865282944977482




RMSE for random_state=94: 0.30366644902041356
Accuracy for random_state=94: 0.8859898178970042




RMSE for random_state=95: 0.30213696156139963
Accuracy for random_state=95: 0.887091247307617




RMSE for random_state=96: 0.30427272420614443
Accuracy for random_state=96: 0.8857205795966321




RMSE for random_state=97: 0.3022501656229225
Accuracy for random_state=97: 0.8864303896612493




RMSE for random_state=98: 0.30284477127178405
Accuracy for random_state=98: 0.8861366751517525




RMSE for random_state=99: 0.3045750882881358
Accuracy for random_state=99: 0.8855981985510084




RMSE for random_state=100: 0.3028430993159531
Accuracy for random_state=100: 0.8862101037791267

Best Accuracy: 0.887091247307617 with random_state=95


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Loading the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Displaying initial data (head)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns (including target column 'Default' from the feature set)
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
train_features = train_df.drop(columns=['Default'])  # Drop 'Default' from features
train_target = train_df['Default']  # 'Default' is the target variable
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Fixed random_state for reproducibility
random_state = 42

# Split the training data into train and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_target, test_size=0.2, stratify=train_target, random_state=random_state
)

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=random_state)
gb_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = gb_model.predict(X_val)

# Calculate RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation RMSE: {rmse}")

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")

# Make predictions on the test set
test_predictions = gb_model.predict(test_df)

# Prepare the submission file
submission_df = pd.DataFrame({
    'LoanID': pd.read_csv('test.csv')['LoanID'],  # Reload LoanID from the test CSV
    'Default': test_predictions  # Predicted defaults
})

# Save the predictions to a CSV file
submission_df.to_csv('gradient_boosting_predictions.csv', index=False)


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Display initial data (for inspection)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Separate features and target in training data
train_features = train_df.drop(columns=['Default'])  # Features
train_target = train_df['Default']  # Target variable

# Handle missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, test_size=0.2, random_state=42)

# Initialize and train the Gradient Boosting model
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Prepare predictions on the test set
predictions = model.predict(test_df)

# Save the predictions in submission format
submission = sample_submission.copy()
submission['Default'] = predictions
submission.to_csv('submission.csv', index=False)

print("Submission file created at submission.csv")


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Display initial data (for inspection)
print("Initial Training Data:\n", train_df.head())
print("Initial Test Data:\n", test_df.head())

# Drop unnecessary columns
train_df = train_df.drop(columns=['LoanID', 'LoanTerm'])
test_df = test_df.drop(columns=['LoanID', 'LoanTerm'])

# Encode categorical columns using LabelEncoder
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Separate features and target in training data
train_features = train_df.drop(columns=['Default'])  # Features
train_target = train_df['Default']  # Target variable

# Handle missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Scale features
scaler = StandardScaler()
train_features = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, test_size=0.2, random_state=42)

# Initialize and train the Gradient Boosting model with default hyperparameters
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model on the validation set
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Prepare predictions on the test set
predictions = model.predict(test_df)

# Save the predictions in submission format
submission = sample_submission.copy()
submission['Default'] = predictions
submission.to_csv('submission.csv', index=False)

print("Submission file created at submission.csv")


Initial Training Data:
        LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  DRIRC89L0T   18  137576      209136          846              26   
1  TS0FIUNHNU   47   57194        5970          748              30   
2  I0YR284A1V   26   84328       95065          453               7   
3  WB1T7NQV8A   53   49795      229582          533             107   
4  J6GU9M4G1Z   49  115450       22072          840               0   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               2         10.47        60      0.81  High School   
1               2         19.72        36      0.73  High School   
2               2         24.25        12      0.45     Master's   
3               3         14.44        60      0.17   Bachelor's   
4               4         24.48        12      0.11   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0  Self-employed        Single         Yes            No    Business