In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import copy
from torchmetrics.functional import r2_score as torch_r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
!pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



# NOTE: This is the notebook for the best model (Kaggle score: 0.88243) analysis. Please see the other notebooks for the analysis on the other following components :
## 1. Feature Engineering (Kaggle score: 0.87587)
### Notebook: IT5006_Grp17_Kaggle_Feature_Engineering_XGBoost_Regressor
### - Reason: Reducing the feature set can decrease the model’s accuracy, because features that carry statistically significant information about the dependent variable may be weakened or removed.

## 2. Outliers Management (Capping Outliers) (Kaggle score: 0.87679)
### Notebook:  IT5006_Grp17_Kaggle_Capping_Outliers_XGBoost_Regressor
### - Reason: While it reduces variance of the dataset, it also increases the bias of the model. With more values centered within a certain range, the model is more likely to predict similar values.

## 3. Feature Engineering + Capping Outliers (Kaggle score: 0.87180)
### Notebook:IT5006_Grp17_Kaggle_Feature_Engineering_and_Capping_Outliers_XGBoost_Regressor
### - Reason: The drawbacks described in 1. and 2. compound, so this combination achieves a lower score than either 1. or 2. alone.

## 4. RandomForest Regressor (Kaggle score: 0.86810)
### Notebook: IT5006_Grp17_Kaggle_RandomForestRegressor
### - Reason: XGBoost Regressor’s iterative ‘boosting’ technique allows it to correct the previous tree’s errors unlike RandomForestRegressor’s ‘bagging’ technique, which trains each tree independently.

## 5. Neural Network (Kaggle score: 0.84498)
### Notebook: IT5006_Grp17_Kaggle_NeuralNetwork
### - Reason: Compared to XGBoost Regressor/RandomForestRegressor, a Neural Network requires a much larger number of data points to give accurate predictions.

# Importing the data

In [8]:
# Load the training data; the target column "Completion_rate" is taken from this frame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [10]:

# Predictor columns fed to the model, grouped by theme for readability.
features = [
    # Cost and staffing
    'Tuition_in_state',
    'Tuition_out_state',
    'Faculty_salary',
    'Pell_grant_rate',
    # Admission test scores
    'SAT_average',
    'ACT_50thPercentile',
    # Student demographics
    'pct_White',
    'pct_Black',
    'pct_Hispanic',
    'pct_Asian',
    # Parental education
    'Parents_middlesch',
    'Parents_highsch',
    'Parents_college',
]

# Design matrix and regression target.
X = df[features]
y = df["Completion_rate"]

#Comment out when Sending to Kaggle
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data Preprocessing

In [11]:
# Kaggle-submission mode: standardise using statistics from the full training set.
# Comment this cell out when training/evaluating on a local train/test split.
scaler = StandardScaler().fit(X)
X_train = scaler.transform(X)

In [None]:
#Comment out when sending to Kaggle
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [13]:
# Real (Kaggle) test features: drop the 'Unnamed: 0' column, then apply the
# scaler that was already fitted on the training features.
real_X_test = scaler.transform(
    pd.read_csv("x_test.csv").drop(columns=['Unnamed: 0'])
)

# Create XGBRegressor

In [23]:
# XGB_REG = xgb.XGBRegressor()
# XGB_REG

In [24]:
# XGB_model = XGB_REG.fit(X_train, y_train)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

In [25]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

### XGBOOST Hyperparameter tuning

From online research (see the references below), we identified the following hyperparameters to tune in order to improve the predictions of the XGBRegressor model:
1. n_estimators (Number of trees. Generally, higher value results in better prediction at the expense of computational load)
2. learning_rate (Model learning speed, lower value makes the model more conservative)
3. max_depth (Maximum number of levels in tree, higher values could cause the model to learn the specifics of the dataset)
4. colsample_bytree (Subsample ratio of columns during constructing of each tree.) 
5. subsample (Percentage of data points used for each tree building)

We then use GridSearchCV to identify the best value for each hyperparameter.

References:
Aman Gupta,'XGBoost Hyperparameters — Explained',Apr 28, 2021. Available: https://amangupta16.medium.com/xgboost-hyperparameters-explained-bb6ce580501d

PRASHANT BANERJEE , A Guide on XGBoost hyperparameters tuning . Available:
https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning

### 1st Optimization Run

In [26]:
# xgb_param_grid = { 
#     'n_estimators': [100,200,300],
#     'max_depth' : [6,50,80,100],
#     'learning_rate' :[0.01,0.1,1],
#     'colsample_bytree':[0.5,1],
#     'subsample': [0.5,1]
    
    
# }

# xgb_tuned = xgb.XGBRegressor()
# CV_xgb_REG = GridSearchCV(estimator=xgb_tuned, param_grid=xgb_param_grid, scoring ='neg_root_mean_squared_error')
# CV_xgb_REG.fit(X_train, y_train)
# CV_xgb_REG.best_params_

#Output:
# {'colsample_bytree': 1,
#  'learning_rate': 0.1,
#  'max_depth': 6,
#  'n_estimators': 300,
#  'subsample': 0.5}

### 2nd Optimization Run

In [27]:
# xgb_param_grid = { 
#     'max_depth' : [6,12,18,24],
#     'learning_rate' :[0.05,0.1,0.2,0.3],
#     'subsample': [0.25,0.5,0.75],
#     'colsample_bytree':[0.75,1]
# }

# xgb_tuned = xgb.XGBRegressor()
# CV_xgb_REG = GridSearchCV(estimator=xgb_tuned, param_grid=xgb_param_grid, scoring ='neg_root_mean_squared_error')
# CV_xgb_REG.fit(X_train, y_train)
# CV_xgb_REG.best_params_

#Output:
# {'colsample_bytree': 1,
#  'learning_rate': 0.05,
#  'max_depth': 18,
#  'subsample': 0.5}
Configuration carried forward from this run (with n_estimators = 500): `n_estimators=500, subsample=0.5, max_depth=18, learning_rate=0.05, colsample_bytree=1`

### 3rd Optimization Run (Worse Kaggle Results than 2nd Optimization Run)

In [None]:
#Tune Further but got worse results
# xgb_param_grid = { 
#     'max_depth' : [16,18,22],
#     'learning_rate' :[0.01,0.02,0.05,0.1],
#     'subsample': [0.4,0.5,0.6],
#     'colsample_bytree':[0.9,1]
# }

# xgb_tuned = xgb.XGBRegressor()
# CV_xgb_REG = GridSearchCV(estimator=xgb_tuned, param_grid=xgb_param_grid, scoring ='neg_root_mean_squared_error')
# CV_xgb_REG.fit(X_train, y_train)
# CV_xgb_REG.best_params_

#Output:
# {'colsample_bytree': 0.9,
#  'learning_rate': 0.05,
#  'max_depth': 16,
#  'subsample': 0.6}

## Using 2nd Optimization Run Results

#### n_estimator =100

In [28]:
# XGB_model = xgb.XGBRegressor(n_estimators=100,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9938037663958779

In [29]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06447693132608343

#### n_estimator =200

In [31]:
# XGB_model = xgb.XGBRegressor(n_estimators=200,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9995539824582018

In [32]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06298032373966458

#### n_estimator =300

In [34]:
# XGB_model = xgb.XGBRegressor(n_estimators=300,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999552315632131

In [35]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06271371604497766

#### n_estimator =400

In [37]:
# XGB_model = xgb.XGBRegressor(n_estimators=400,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999899588199448

In [38]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06264671722423437

#### n_estimator =500

In [40]:
# XGB_model = xgb.XGBRegressor(n_estimators=500,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999924606907223

In [41]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06264351871349641

#### n_estimator =600

In [43]:
# XGB_model = xgb.XGBRegressor(n_estimators=600,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999930600734026

In [44]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06263977749084422

n_estimator = 500 is chosen because n_estimator = 600 yields only a negligible improvement in test RMSE (0.06263977749084422 versus 0.06264351871349641) at the cost of greatly increased model fitting/predicting time.

## Using 3rd Optimization Run Results

n_estimator =100

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=100,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9963762435678043

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.064695597455068

n_estimator =200

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=200,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9998258953598808

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06343080669921651

n_estimator =300

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=300,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999848584075754

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06325678921736058

n_estimator =400

In [5]:
# XGB_model = xgb.XGBRegressor(n_estimators=400,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999912089285614

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06324358385906466

n_estimator =500

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=500,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999921281769216

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06323821545142379

n_estimator =600

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=600,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999924786560821

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE
#Output: 0.0632375430390593

n_estimator =700

In [None]:
# XGB_model = xgb.XGBRegressor(n_estimators=700,subsample = 0.6, max_depth = 16, learning_rate = 0.05, colsample_bytree = 0.9)
# XGB_model.fit(X_train, y_train)
# xgb_r_sq = XGB_model.score(X_train, y_train)
# print(xgb_r_sq)

#Output: 0.9999927786912266

In [None]:
# #Using model to obtain predictions for the Train Test split
# y_pred_TEST_XGB_model = XGB_model.predict(X_test)

# #Testing RMSE error for the Train Test split
# RMSE = mean_squared_error(y_test, y_pred_TEST_XGB_model, squared=False)
# RMSE

#Output: 0.06323892537258512

n_estimator = 600 is chosen to represent 3rd Optimization Run as n_estimator = 700 has worse RMSE (0.06323892537258512) for the test dataset compared to that of the n_estimator = 600 model (0.0632375430390593).

## Evaluation

While the 3rd Optimization Run model achieves a slightly higher r_sq score on the training data set (0.9999924786560821) than the 2nd Optimization Run model (0.9999924606907223), it gives a worse RMSE on the test data set (0.0632375430390593 versus 0.06264351871349641). This suggests that the 3rd Optimization Run model has overfitted the training data. Hence, the 2nd Optimization Run model is chosen as the final model for the Kaggle Competition.

In [48]:
# xgb_param_grid_v2 = { 
#     'gamma': [0,2,5,7,10]
# }

# xgb_tuned = xgb.XGBRegressor(n_estimators=500,subsample = 0.5, max_depth = 18, learning_rate = 0.05, colsample_bytree = 1)
# CV_xgb_REG_v2 = GridSearchCV(estimator=xgb_tuned, param_grid=xgb_param_grid_v2, scoring ='neg_root_mean_squared_error')
# CV_xgb_REG_v2.fit(X_test, y_test)
# CV_xgb_REG_v2.best_params_

#Output: {'gamma': 0}

GridSearchCV shows that increasing gamma results in worse scores on the test dataset (the best value found is gamma = 0). This suggests that our model is already well fitted, and that reducing its variance any further would only increase its bias and reduce its predictive quality.

# Using Final Model to make prediction for Kaggle Submission

In [53]:
# Final configuration: hyperparameters from the 2nd optimization run with 500 trees.
final_xgb_params = dict(
    n_estimators=500,
    subsample=0.5,
    max_depth=18,
    learning_rate=0.05,
    colsample_bytree=1,
)

# Fit on the scaled training features, then predict for the scaled Kaggle test features.
XGB_model = xgb.XGBRegressor(**final_xgb_params)
XGB_model.fit(X_train, y.to_numpy())
true_y_pred_TEST_XGB_tuned = XGB_model.predict(real_X_test)
print(true_y_pred_TEST_XGB_tuned)

[0.42577797 0.44045326 0.4640063  ... 0.58051133 0.59470075 0.6397243 ]


### Creating our submission

In [56]:
# Wrap the predictions in a single-column frame; its index later becomes the "id" column.
submission = pd.DataFrame({'Completion_rate': true_y_pred_TEST_XGB_tuned})
submission

Unnamed: 0,Completion_rate
0,0.425778
1,0.440453
2,0.464006
3,0.425185
4,0.472050
...,...
1160,0.481713
1161,0.506208
1162,0.580511
1163,0.594701


In [57]:
# Write the predictions to disk, exporting the row index under the header "id".
submission.to_csv(
    'submission.csv',
    index_label="id",
    index=True,
)

### To submit your submission

On the panel on your right, select the drop down "Submit to competition", and submit this notebook. Ensure that your submission is named ```submission.csv```!