In [None]:
# Core data-analysis stack used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Passing None as the limit asks pandas not to truncate the columns of wide frames.
pd.set_option('display.max_columns', None)

### CSCI 447 Fall 2023 Kaggle Competition

In [None]:
# Read the competition's training and test CSVs from the local data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/kaggle_train.csv', 'data/kaggle_test.csv')
)
# example = pd.read_csv('data/kaggle_example_submission.csv')

In [None]:
# Display the freshly loaded training frame as this cell's output.
train

In [None]:
# Correlation of every column with the target, sorted from most positive to most negative.
corr_values_sorted = train.corr()['target'].sort_values(ascending=False)

# Horizontal bar chart; the y-axis is flipped so the first (largest) entry sits on top.
fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(corr_values_sorted.index, corr_values_sorted)
ax.invert_yaxis()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Parameters of the reference Gaussian: sample mean and standard deviation of the target.
mu, sigma = train['target'].mean(), train['target'].std()

# Evaluate the normal density on a 100-point grid covering three standard deviations each side.
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
y = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x - mu)**2 / (2*sigma**2))

fig, ax = plt.subplots()
# density=True normalises the histogram so it shares a scale with the pdf curve.
ax.hist(train['target'], bins=30, density=True, alpha=0.7, color='blue', label='Target')
ax.plot(x, y, color='red', label='Normal Distribution')

# Labels, title and legend.
ax.set_xlabel('Target')
ax.set_ylabel('Density')
ax.set_title('Target Histogram with Normal Distribution')
ax.legend()

plt.show()

In [None]:
# One 100-bin histogram per column at positions 1..20 of the training frame, on a 5x4 grid.
fig, axes = plt.subplots(5, 4, figsize=(10, 10))
for ax, col_idx in zip(axes.flat, range(1, 21)):
    ax.hist(train.iloc[:, col_idx], bins=100)
    ax.set_title(train.columns[col_idx])
fig.tight_layout()
plt.show()

In [None]:
# Drop the rows whose target falls outside the -30 to 30 range, then report how many remain.
target_in_range = train['target'].between(-30, 30)
train = train.loc[target_in_range]
len(train)

---

In [None]:
# # Remove features that correlate weakly with target (run this to do that)
# train_correlated = train.copy()
# train_correlated = train_correlated.drop(['f14', 'f13','f11'], axis=1)

# train = train_correlated

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# # get column names to re add after scaling
# train_cols = train.columns
# test_cols = test.columns

# # get target col to re add after scaling
# train_target = train['target']

# # get id col to re add after scaling
# train_id = train['id']
# test_id = test['id']

In [None]:
# Snapshots of the raw target and test ids, kept for later cells.
targets = train.target
test_ids = test.id

# Standardize ONLY the feature columns to mean 0 / std 1. The statistics come
# from the training frame and are reused for the test frame so both share one scale.
# 'id' and 'target' are deliberately excluded: scaling the whole frame would
# rescale the test ids needed for the submission and, because `test` has no
# 'target' column, subtracting a Series that contains a 'target' entry would
# make pandas' column alignment add an all-NaN 'target' column to `test`.
feature_cols = [col for col in train.columns if col not in ('id', 'target')]
train_mean = train[feature_cols].mean()
train_std = train[feature_cols].std()

# Work on copies so the column assignment never writes into a filtered view
# of an earlier frame (avoids SettingWithCopyWarning).
train = train.copy()
test = test.copy()
train[feature_cols] = (train[feature_cols] - train_mean) / train_std
test[feature_cols] = (test[feature_cols] - train_mean) / train_std

train.describe()

In [None]:
# # add column names back
# train = pd.DataFrame(train, columns=train_cols)
# test = pd.DataFrame(test, columns=test_cols)
# # add target col back
# train['target'] = train_target
# # add id col back
# train['id'] = train_id
# test['id'] = test_id
# train

#### XGBoost:

In [None]:
# Separate the target from the predictors ('id' is not a predictor), then hold out
# 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
y = train['target']
X = train.drop(columns=['target', 'id'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline: XGBoost regressor with library-default hyperparameters (seed fixed),
# scored by RMSE on the hold-out rows.
xgb = XGBRegressor(random_state=42).fit(X_train, y_train)

y_pred = xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('RMSE:', rmse)

In [None]:
# # grid search on xgb
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# params = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [1, 2, 3, 4, 5],
#     'min_child_weight': [1, 2, 3],
#     'gamma': [0, 0.1, 0.2],
#     'subsample': [0.5, 0.75, 1],
#     'colsample_bytree': [0.5, 0.75, 1],
#     'reg_alpha': [0, 0.1, 0.2],
#     'reg_lambda': [0, 0.1, 0.2]
# }
# grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='neg_mean_squared_error', cv=5, verbose=1)
# # grid = RandomizedSearchCV(estimator=xgb, param_distributions=params, scoring='neg_mean_squared_error', cv=5, verbose=1)
# grid.fit(X_train, y_train)
# print(grid.best_params_)
# print(grid.best_score_)
# best_xgb = grid.best_estimator_
# y_pred = best_xgb.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# print('MSE:', mse)

In [None]:
# Best hyperparameter configuration found so far (presumably from the grid-search
# experiments in this notebook), refit on the training split and scored by hold-out RMSE.
#
# `feature_names` is intentionally NOT passed: it is not an XGBRegressor
# constructor parameter, so it would be forwarded to the booster as an unknown
# parameter. The column names are taken from the DataFrame given to fit(), which
# also keeps the estimator valid when it is later refit on a different column set.
xgb_best_so_far = XGBRegressor(
    n_estimators=500,
    learning_rate=0.2,
    max_depth=1,
    min_child_weight=1,
    gamma=0,
    reg_lambda=0,
    reg_alpha=0,
    subsample=0.75,
    colsample_bytree=0.5,
    random_state=42,
)
xgb_best_so_far.fit(X_train, y_train)
y_pred = xgb_best_so_far.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('RMSE:', np.sqrt(mse))

#### FEATURE ENGINEERING:

In [None]:
# Visualise the feature-importance scores of the tuned XGBoost model.
from xgboost import plot_importance

importance_ax = plot_importance(xgb_best_so_far)
plt.show()

In [None]:
# xgb with feature selection
from sklearn.feature_selection import SelectFromModel

# Column labels of the full training matrix.
feature_names = X_train.columns

# Use the already-fitted tuned model (prefit=True) to pick features by importance threshold,
# then apply the same column mask to both splits.
selection = SelectFromModel(xgb_best_so_far, threshold=0.0001, prefit=True)
select_X_train, select_X_test = (
    selection.transform(split) for split in (X_train, X_test)
)

# Train a fresh booster on the selected columns only.
selection_params = dict(
    n_estimators=500,
    learning_rate=0.2,
    max_depth=1,
    min_child_weight=1,
    gamma=0,
    reg_lambda=0,
    reg_alpha=0,
    subsample=0.75,
    colsample_bytree=0.5,
    random_state=42,
)
selection_model = XGBRegressor(**selection_params)
selection_model.fit(select_X_train, y_train)

# Hold-out RMSE of the reduced model.
y_pred = selection_model.predict(select_X_test)
mse = mean_squared_error(y_test, y_pred)
print('RMSE:', np.sqrt(mse))

In [None]:
# Restrict the data to four hand-picked features plus the target.
train_reduced = train[['f4','f5','f1','f3','target']]

y = train_reduced['target']
X = train_reduced.drop(columns=['target'])

# add some polynomial features
# X['f1f3'] = X['f1'] * X['f3']

X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Refit the tuned model on the reduced feature set (this replaces its earlier fit)
# and report hold-out error as both RMSE and MSE.
xgb_best_so_far.fit(X_train_reduced, y_train_reduced)
y_pred = xgb_best_so_far.predict(X_test_reduced)
mse = mean_squared_error(y_test_reduced, y_pred)
print('RMSE:', np.sqrt(mse))
print('MSE:', mse)

In [None]:
# Scatter the hold-out predictions against the true target values.
fig, ax = plt.subplots()
ax.scatter(y_test_reduced, y_pred)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
# Predict the competition test set with the model fitted on the reduced feature
# set and assemble the submission frame with exactly two columns: id, target.
test_reduced = test[['f4','f5','f1','f3']]

# The ids come from the `test_ids` snapshot (taken from the raw test file before
# any scaling) rather than from the `test` frame, so the submission carries the
# raw ids regardless of what preprocessing was applied to `test`. Building the
# frame directly, instead of concatenating onto `test`, also guarantees a single
# 'target' column holding the model output.
predictions = pd.DataFrame({
    'id': test_ids.to_numpy(),
    'target': xgb_best_so_far.predict(test_reduced),
})
predictions

In [None]:
# Write the submission frame to disk, omitting the DataFrame index.
submission_path = 'data/kaggle_submission_bova_6.csv'
predictions.to_csv(submission_path, index=False)

---

## Neural Network

In [9]:
# MLP regressor tuned with an exhaustive grid search over all features
# (no feature selection is applied in this cell).
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

X = train.drop(['target', 'id'], axis=1)
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# grid search on mlp
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd', so its three values are redundant for 'lbfgs' and 'adam'.
# The grid is left unchanged here so the candidate count stays at 324.
params = {
    'hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}
# n_jobs=-1 spreads the independent fits (324 candidates x 5 folds) over all
# available cores. Every fit is still seeded with random_state=42, so the
# selected parameters and scores are the same as in a serial run.
grid = GridSearchCV(
    estimator=MLPRegressor(random_state=42, max_iter=10000),
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

# Score the refitted best estimator on the hold-out rows.
best_mlp = grid.best_estimator_
y_pred = best_mlp.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('RMSE:', np.sqrt(mse))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


