# Import modules

In [None]:
import os
from os.path import join
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import cross_val_score

# Read Data

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/AI\ Hack/data/

Mounted at /content/drive/
/content/drive/MyDrive/AI Hack/data


In [None]:
home = os.getcwd()
out = join(home, "output")
data = pd.read_csv(join(out, "WeatherData.csv"))
data

Unnamed: 0.1,Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,RainToday,RainTomorrow,WindSpeed,...,Temp,PersonID,sex,age,height,weight,BMI,freqOfExercise,CI,Target
0,0,17.9,35.2,0.0,12.0,12.3,48.0,0,0,6.0,...,26.6,1,0,40,175.80,87.75,28.39,6,2.307144,-1.805114
1,1,17.9,35.2,0.0,12.0,12.3,48.0,0,0,6.0,...,26.6,2,0,74,170.76,84.59,29.01,2,3.343987,-1.806799
2,2,17.9,35.2,0.0,12.0,12.3,48.0,0,0,6.0,...,26.6,3,0,24,176.49,81.89,26.29,4,2.459838,-1.802157
3,3,17.9,35.2,0.0,12.0,12.3,48.0,0,0,6.0,...,26.6,4,0,39,175.88,88.58,28.64,0,4.243744,-1.807501
4,4,17.9,35.2,0.0,12.0,12.3,48.0,0,0,6.0,...,26.6,5,0,61,173.37,89.91,29.91,1,4.302597,-1.810043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95155,47575,22.9,33.4,17.8,7.0,6.2,93.0,1,1,15.0,...,30.3,6,1,37,162.47,74.20,28.11,5,2.384639,-0.963873
95156,47576,22.9,33.4,17.8,7.0,6.2,93.0,1,1,15.0,...,30.3,7,1,30,162.40,70.60,26.77,2,3.139030,-0.964535
95157,47577,22.9,33.4,17.8,7.0,6.2,93.0,1,1,15.0,...,30.3,8,1,34,162.24,69.47,26.39,1,2.461945,-0.962891
95158,47578,22.9,33.4,17.8,7.0,6.2,93.0,1,1,15.0,...,30.3,9,1,22,163.19,66.04,24.80,0,4.014209,-0.963780


# Data Processing

In [9]:
#separate the response variable and predict variable
X = data.drop("Target", axis=1)
y = data["Target"]

#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regression Models

# Random Forest

In [None]:
#n_estimators is the number of trees, random_state is for random seed
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

#train random forest regression model
rfr.fit(X_train, y_train)

#predict the result
rfr_pred = rfr.predict(X_test)

#MSE
rfr_mse = mean_squared_error(y_test, rfr_pred)
print(f'Mean Squared Error of Random Forest Regression: {rfr_mse}')

Mean Squared Error of Random Forest Regression: 1.1163131925032222e-05


# Logistic Regression

In [None]:
pipe1 = Pipeline([('scaler', StandardScaler()), ('regressor', LogisticRegression())])

param_grid = {
    'scaler': [StandardScaler(), Normalizer(), MinMaxScaler()],
    'regressor__Cs': range(),
    'regressor__penalty': ['l1', 'l2', 'elasticnet']
}



In [None]:
lr = LogisticRegression()

#train logistic regression model
lr.fit(X_train, y_train)

#predict the result
lr_pred = lr.predict(X_test)

#MSE
lr_mse = mean_squared_error(y_test, lr_pred)
print(f'Mean Squared Error of Logistic Regression: {lr_mse}')

# Support Vector Regression

In [None]:
#kernel: linear/poly/rbf/sigmoid/precomputed
#C: regularization parameter (larger C: more penalization, overfitting; smaller C: smoother, underfitting)
svr = SVR(kernel='linear', C=1.0)

#train support vector regression model
svr.fit(X_train, y_train)

#predict the result
svr_pred = svr.predict(X_test)

#MSE
svr_mse = mean_squared_error(y_test, svr_pred)
print(f'Mean Squared Error of Support Vector Regression: {svr_mse}')

# XGBoost

In [None]:
#n_estimators is the number of boosting stages to perform
#max_depth is the max depth of the individual regression estimators
XGBr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

#XGBoost regression model
XGBr.fit(X_train, y_train)

#predict the result
XGBr_pred = XGBr.predict(X_test)

#MSE
XGBr_mse = mean_squared_error(y_test, XGBr_pred)
print(f'Mean Squared Error of XGBoost Regression: {XGBr_mse}')

# MLPRegressor

In [12]:
from sklearn.neural_network import MLPRegressor

In [None]:
pipeMLP = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', MLPRegressor())])

params = {
    'scaler': [StandardScaler(), Normalizer(), MinMaxScaler()],
    'regressor__hidden_layer_sizes': range(1, 10),
    'regressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'regressor__learning_rate': ['constant', 'invscaling', 'adaptive']
}

grid_search = GridSearchCV(pipeMLP, params, cv = 10, scoring='neg_mean_squared_error', n_jobs = -1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

#evaluate the best model
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-c2a91b6ac588>", line 13, in <cell line: 13>
    grid_search.fit(X_train, y_train)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 874, in fit
    self._run_search(evaluate_candidates)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 1388, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 821, in evaluate_candidates
    out = parallel(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1952, in __call__
    retur

In [None]:
import pickle

with open(join(out,'MLP.model'), 'wb') as file:
    pickle.dump(best_model, file)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-29478a3d24cc>", line 4, in <cell line: 3>
    pickle.dump(best_model, file)
NameError: name 'best_model' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/python3.10/dist-packages/IPy

# Multioutput regression

In [None]:
#https://scikit-learn.org/stable/modules/multiclass.html#multioutput-regression

In [None]:
#example from sklearn document
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
X, y = make_regression(n_samples=10, n_targets=3, random_state=1)
MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X, y).predict(X)

# Average

In [None]:
#ensemble predictions using simple averaging
ensemble_predictions = np.mean([rfr_pred, lr_pred, svr_pred], axis=0)

#evaluate the ensemble model
ensemble_mse = mean_squared_error(y_test, ensemble_predictions)
print(f'Ensemble Mean Squared Error: {ensemble_mse}')

# Pipeline

In [None]:
#define pipeline with preprocessing and regression model steps
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('regressor', LogisticRegression())  # Initial regression model
])

#parameter range
param_grid = {
    # LogisticRegression(),
    'regressor': [RandomForestRegressor(), GradientBoostingRegressor()],
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [3, 5, 7]
}

#find the best model and parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

#evaluate the best model
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

#perform cross-validation to get more robust performance estimates
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='mean_squared_error', n_jobs=-1)
print(f'Cross-validated Mean Squared Error: {np.mean(-cv_scores)}')

###visulization: parameter tunning or prediction errors?
from yellowbrick.model_selection import CVScores
from yellowbrick.regressor import PredictionError

#result visulization
cv_visualizer = CVScores(grid_search, scoring='mean_squared_error')
cv_visualizer.fit(X, y)
cv_visualizer.show()

#visulization of prediction errors
model_visualizer = PredictionError(grid_search)
model_visualizer.fit(X_train, y_train)
model_visualizer.score(X_test, y_test)
model_visualizer.show()

KeyboardInterrupt: ignored