In [2]:
##----LOAD DATA ---------------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the train/test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Define and scale features
X = train[['f1','f2','f3','f4','f5']].copy()

# The scaler is fitted on the training features only; the statistics it
# learns here are reused for the test set below.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Define training target (kept as a one-column DataFrame)
y = train[['target']].copy()

# Define test features
X_test = test[['f1','f2','f3','f4','f5']].copy()
# Apply the training-set scaling to the test features. Refitting the scaler
# here would standardise the test set with its own mean/std, so the models
# would be scored on inputs scaled differently from what they were trained on.
Xs_test = scaler.transform(X_test)

In [3]:
##---- FIT A GLM USING 3 FEATURES AS 3RD ORDER POLYNOMIALS ----------------------------
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Cubic polynomial expansion followed by an ordinary least-squares fit.
degree = 3
glm1 = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# This model only uses columns 2, 3 and 4 of the scaled training matrix.
glm1_columns = [2, 3, 4]
X_glm1 = Xs[:, glm1_columns]

# Train on the full training set.
glm1.fit(X_glm1, y)

# 5-fold cross-validation; the scorer reports MSE with its sign flipped.
cv_scores = cross_val_score(
    glm1,
    X_glm1,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Undo the sign flip and take the square root to get one RMSE per fold.
cv_rmse_scores = (-cv_scores) ** 0.5
mean_rmse = cv_rmse_scores.mean()

# Report per-fold and average error.
print(f'Cross-Validation RMSE Scores: {cv_rmse_scores}')
print(f'Mean RMSE: {mean_rmse}')

Cross-Validation RMSE Scores: [1.5931141  2.30062041 3.09987972 1.46457834 2.78350167]
Mean RMSE: 2.2483388468015915


In [5]:
##---- GENERATE PREDICTIONS AND EXPORT -------------------------------------------------------
# Score the test set on the same three scaled columns glm1 was fitted on.
glm1_test_columns = [2, 3, 4]
y_pred_glm1 = glm1.predict(Xs_test[:, glm1_test_columns])

# Pair each test id with its predicted target.
glm1_preds = test[['id']].assign(target=y_pred_glm1)

# Write the predictions to disk without the DataFrame index.
glm1_preds.to_csv('glm1_preds.csv', index=False)

In [10]:
##---- USE GRID SEARCH TO OPTIMIZE GLM ------------------------------------
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Ridge regression on polynomial features; degree and alpha are tuned below.
glm_model = make_pipeline(PolynomialFeatures(), Ridge())

# Candidate values for the polynomial degree and the ridge penalty.
param_grid = dict(
    polynomialfeatures__degree=[2, 3, 4],
    ridge__alpha=[0.1, 1, 10],  # Adjust the range as needed
)

# Exhaustive 5-fold search scored by negated MSE, using all available cores.
grid_search = GridSearchCV(
    estimator=glm_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(Xs, y)

# Show the winning combination.
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters: {'polynomialfeatures__degree': 3, 'ridge__alpha': 1}


In [13]:
##---- CREATE SECOND GLM WITH ABOVE HYPERPARAMETERS --------------------------

# Ridge pipeline with fixed settings: cubic polynomial features, alpha of 1.
degree, alpha = 3, 1

glm2 = make_pipeline(PolynomialFeatures(degree=degree), Ridge(alpha=alpha))

# Train on every scaled training feature.
glm2.fit(Xs, y)

# 5-fold cross-validation; the scorer reports MSE with its sign flipped.
cv_scores = cross_val_score(
    estimator=glm2,
    X=Xs,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Undo the sign flip and take the square root to get one RMSE per fold.
cv_rmse_scores = (-cv_scores) ** 0.5
mean_rmse = cv_rmse_scores.mean()

# Report per-fold and average error.
print(f'Cross-Validation RMSE Scores: {cv_rmse_scores}')
print(f'Mean RMSE: {mean_rmse}')

Cross-Validation RMSE Scores: [1.0619385  1.29727749 1.14691452 1.02052444 1.18961838]
Mean RMSE: 1.1432546646089168


In [14]:
##---- GENERATE PREDICTIONS AND EXPORT -------------------------------------------------------
# Score the scaled test features with the fitted glm2 pipeline.
y_pred_glm2 = glm2.predict(Xs_test)

# Pair each test id with its predicted target.
glm2_preds = test[['id']].assign(target=y_pred_glm2)

# Write the predictions to disk without the DataFrame index.
glm2_preds.to_csv('glm2_preds.csv', index=False)

In [33]:
# Ridge pipeline with cubic polynomial features and a manually chosen alpha of 2.
degree, alpha = 3, 2

poly_step = PolynomialFeatures(degree=degree)
ridge_step = Ridge(alpha=alpha)
glm3 = make_pipeline(poly_step, ridge_step)

# Train on every scaled training feature.
glm3.fit(Xs, y)

# 5-fold cross-validation; the scorer reports MSE with its sign flipped.
cv_scores = cross_val_score(
    glm3, Xs, y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Undo the sign flip and take the square root to get one RMSE per fold.
cv_rmse_scores = (-cv_scores) ** 0.5

# Report per-fold and average error.
for line in (
    f'Cross-Validation RMSE Scores: {cv_rmse_scores}',
    f'Mean RMSE: {cv_rmse_scores.mean()}',
):
    print(line)

Cross-Validation RMSE Scores: [1.06528062 1.28682087 1.15159552 1.01500964 1.19315366]
Mean RMSE: 1.1423720621055555


In [34]:
# Score the scaled test features with the fitted glm3 pipeline.
y_pred_glm3 = glm3.predict(Xs_test)

# Pair each test id with its predicted target.
glm3_preds = test[['id']].assign(target=y_pred_glm3)

# Write the predictions to disk without the DataFrame index.
glm3_preds.to_csv('glm3_preds.csv', index=False)

In [35]:
##---- SUPPORT VECTOR REGRESSOR --------------------------------------------------------
from sklearn.svm import SVR

# Create a pipeline with PolynomialFeatures and a linear-kernel SVR.
# These starting values are overridden by the grid search below.
degree = 3
C = 1  # Regularization parameter
epsilon = 0.1  # Epsilon in the SVR model

svm_model = make_pipeline(
    PolynomialFeatures(degree=degree),
    SVR(kernel='linear', C=C, epsilon=epsilon)
)

# Define the parameter grid for hyperparameter tuning
param_grid_svr = {
    'polynomialfeatures__degree': [2, 3, 4],
    'svr__C': [0.1, 1, 10],  # Regularization parameter
    'svr__epsilon': [0.01, 0.1, 0.2]  # Epsilon in the SVR model
}

# `y` is a one-column DataFrame. Passing it to SVR as-is makes scikit-learn
# reshape it on every fit and emit a `column_or_1d` warning each time, so
# flatten it once here and use the 1-D target throughout this cell.
y_1d = y.to_numpy().ravel()

# Create the GridSearchCV object (5-fold, scored by negated MSE, all cores)
grid_search_svr = GridSearchCV(estimator=svm_model, param_grid=param_grid_svr, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_svr.fit(Xs, y_1d)

# Print the best hyperparameters
print("Best Hyperparameters for SVR:", grid_search_svr.best_params_)

# Use the best model from the grid search (already refitted on all of Xs)
best_svr_model = grid_search_svr.best_estimator_

# Perform cross-validation with SVR.
# NOTE: this re-scores the configuration on the same data that was used to
# select it, so the estimate is optimistic.
cv_scores_svr = cross_val_score(best_svr_model, Xs, y_1d, cv=5, scoring='neg_mean_squared_error')

# Convert the scores to positive since cross_val_score returns neg_mean_squared_error
cv_rmse_scores_svr = (-cv_scores_svr)**0.5

# Print the cross-validation results for SVR
print(f'\nSVR Cross-Validation RMSE Scores: {cv_rmse_scores_svr}')
print(f'Mean RMSE for SVR: {cv_rmse_scores_svr.mean()}')


Fitting 5 folds for each of 27 candidates, totalling 135 fits


  y = column_or_1d(y, warn=True)


Best Hyperparameters for SVR: {'polynomialfeatures__degree': 3, 'svr__C': 1, 'svr__epsilon': 0.01}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



SVR Cross-Validation RMSE Scores: [1.06724701 1.38555173 1.17207283 0.96561582 1.20573637]
Mean RMSE for SVR: 1.1592447487855062


  y = column_or_1d(y, warn=True)


In [37]:
# Alias the tuned SVR pipeline selected by the grid search.
svr1 = best_svr_model

# Predict on the scaled test features
y_pred_svr1 = svr1.predict(Xs_test)

# Make a Dataframe
svr1_preds = test[['id']].copy()
# Use the SVR predictions computed above. This line previously assigned
# y_pred_glm3, so the exported "SVR" file actually held the glm3 predictions.
svr1_preds['target'] = y_pred_svr1

# Export
svr1_preds.to_csv('svr1_preds.csv', index=False)

In [None]:
###############################################################################################
##---- ATTEMPT TO IMPROVE VARIABLE SELECTION --------------------------------------------------
###############################################################################################

In [37]:
##----LOAD DATA ---------------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Re-read the train/test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Define and scale features (revised selection: f2 replaced by f7)
X = train[['f1','f3','f4','f5','f7']].copy()

# The scaler is fitted on the training features only; the statistics it
# learns here are reused for the test set below.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Define training target (kept as a one-column DataFrame)
y = train[['target']].copy()

# Define test features
X_test = test[['f1','f3','f4','f5','f7']].copy()
# Apply the training-set scaling to the test features. Refitting the scaler
# here would standardise the test set with its own mean/std, so the models
# would be scored on inputs scaled differently from what they were trained on.
Xs_test = scaler.transform(X_test)

In [5]:
##---- Look at feature importance with random forest --------------------
from sklearn.ensemble import RandomForestRegressor as rfr

# Define and scale subset
# NOTE(review): the saved output of this cell lists f5, which is not in this
# column list, so the output appears stale relative to the code; re-run the
# cell to refresh it.
X1 = train[['f1','f2','f6','f7','f11','f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20']].copy()

# Use a dedicated scaler for this exploratory subset so the notebook-level
# `scaler` (fitted on the modelling features) is not overwritten.
rf_scaler = StandardScaler()
X1s = rf_scaler.fit_transform(X1)

# Initialize (fixed seed so the importances are reproducible)
rf = rfr(n_estimators=200,
          random_state=42)

# Fit. `y` is a one-column DataFrame; flatten it to 1-D so scikit-learn does
# not have to reshape it and emit a conversion warning.
rf.fit(X1s, y.to_numpy().ravel())

# Importance of each column, in the same order as the columns of X1
feature_importances = rf.feature_importances_

# Store features in a variable 'feature_names' and print importance
feature_names = X1.columns

for feature_name, importance in zip(feature_names, feature_importances):
    print(f'{feature_name}: {importance}')


  rf.fit(X1s, y)


f1: 0.04172323009911918
f2: 0.033333829738277254
f5: 0.5398523939710622
f6: 0.03922467963110824
f7: 0.024560297667447644
f11: 0.036256485103906506
f12: 0.04178144936887593
f13: 0.02661686919470644
f14: 0.028466535116066876
f15: 0.03345437618423797
f16: 0.03865936985643073
f17: 0.029500123816292956
f18: 0.021437261585121214
f19: 0.022972256861351032
f20: 0.04216084180599592


In [24]:
##---- USE GRID SEARCH TO OPTIMIZE GLM ------------------------------------
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Polynomial expansion feeding a ridge regressor; both are tuned below.
glm_model = make_pipeline(PolynomialFeatures(), Ridge())

# Search space: polynomial degree and ridge penalty strength.
degree_candidates = [2, 3, 4]
alpha_candidates = [0.1, 1, 10]  # Adjust the range as needed
param_grid = {
    'polynomialfeatures__degree': degree_candidates,
    'ridge__alpha': alpha_candidates,
}

# 5-fold exhaustive search scored by negated MSE, using all available cores.
search_options = dict(
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search = GridSearchCV(estimator=glm_model, param_grid=param_grid, **search_options)

# Run the search on the scaled training data.
grid_search.fit(Xs, y)

# Show the winning combination.
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters: {'polynomialfeatures__degree': 3, 'ridge__alpha': 1}


In [38]:
from sklearn.model_selection import cross_val_score

# Ridge pipeline with fixed settings: cubic polynomial features, alpha of 1.
degree = 3
alpha = 1

glm4_steps = (PolynomialFeatures(degree=degree), Ridge(alpha=alpha))
glm4 = make_pipeline(*glm4_steps)

# Train on every scaled training feature.
glm4.fit(Xs, y)

# 5-fold cross-validation; the scorer reports MSE with its sign flipped.
cv_scores = cross_val_score(
    glm4,
    Xs,
    y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Undo the sign flip and take the square root to get one RMSE per fold.
cv_rmse_scores = (-cv_scores) ** 0.5
mean_rmse = cv_rmse_scores.mean()

# Report per-fold and average error.
print(f'Cross-Validation RMSE Scores: {cv_rmse_scores}')
print(f'Mean RMSE: {mean_rmse}')

Cross-Validation RMSE Scores: [1.05952392 1.29289187 1.11381574 1.02088853 1.17030878]
Mean RMSE: 1.1314857671597869


In [39]:
# Score the scaled test features with the fitted glm4 pipeline.
y_pred_glm4 = glm4.predict(Xs_test)

# Pair each test id with its predicted target.
glm4_preds = test[['id']].assign(target=y_pred_glm4)

# Write the predictions to disk without the DataFrame index.
glm4_preds.to_csv('glm4_preds.csv', index=False)

In [None]:
##---- VISUALIZING PREDICTORS --------------------------------------------

# NOTE(review): this cell uses `plt` and `sns`, but no matplotlib/seaborn
# import is visible in this notebook section -- confirm they are imported in
# an earlier cell before running.

# Variables 'f1' to 'f5'
variables = ['f1', 'f2', 'f3', 'f4', 'f5']

# Response variable
target_variable = 'target'

# Set up a single row of panels, one per predictor. The column count follows
# `variables`, so editing the list keeps the grid in sync. squeeze=False makes
# `axes` a 2D array even when there is only one panel.
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(variables),
    figsize=(4 * len(variables), 5),
    squeeze=False,
)

# Flatten the (1, n) array of subplots for easier indexing
axes = axes.flatten()

# Create scatter plots for each variable against 'target' in the grid
for ax, variable in zip(axes, variables):
    sns.scatterplot(x=train[variable], y=train[target_variable], ax=ax)
    ax.set_title(f'Scatter Plot for {variable} vs. {target_variable}')
    ax.set_xlabel(variable)
    ax.set_ylabel(target_variable)

# Adjust layout
plt.tight_layout()
plt.show()