# Lab 5: Regression and Dimensionality Reduction

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib inline

# imports necessary for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn import svm

# regression algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# metrics for evaluating regression models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv("./insurance.csv")
df.head()
# One-hot encode the categorical variables
data = pd.get_dummies(df, columns=['gender', 'smoker', 'region'])
df = data
# Save the preprocessed data to a new CSV file
data.to_csv('insurance_preprocessed.csv', index=False)

In [3]:
df.tail()

Unnamed: 0,age,bmi,children,insurance_cost,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
1333,64,31.825,2,16069.08475,1,0,1,0,1,0,0,0
1334,64,26.885,0,29330.98315,1,0,0,1,0,1,0,0
1335,64,26.41,0,14394.5579,0,1,1,0,1,0,0,0
1336,64,36.96,2,49577.6624,0,1,0,1,0,0,1,0
1337,64,23.76,0,26926.5144,0,1,0,1,0,0,1,0


In [4]:
df.isna().sum()

age                 0
bmi                 0
children            0
insurance_cost      0
gender_female       0
gender_male         0
smoker_no           0
smoker_yes          0
region_northeast    0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,age,bmi,children,insurance_cost,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265,0.494768,0.505232,0.795217,0.204783,0.242152,0.2429,0.272048,0.2429
std,14.04996,6.098187,1.205493,12110.011237,0.50016,0.50016,0.403694,0.403694,0.428546,0.428995,0.445181,0.428995
min,18.0,15.96,0.0,1121.8739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.29625,0.0,4740.28715,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,9382.033,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,34.69375,2.0,16639.912515,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,64.0,53.13,5.0,63770.42801,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Task 2

In [6]:
preprocess_pipeline = make_pipeline(StandardScaler())

In [7]:
X= df.drop("insurance_cost", axis=1)
y = df["insurance_cost"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Pipeline

In [8]:
pipe = Pipeline(steps=[('preprocess', preprocess_pipeline), 
                       ('reduce_dim', 'passthrough'),
                       ('regresson', RandomForestRegressor(n_estimators=10))])

In [9]:
N_FEATURES_OPTIONS = [2, 6, 11]
MAX_DEPTH_OPTIONS = [2, 4, 6, 8]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regresson__max_depth': MAX_DEPTH_OPTIONS
    },
    {
        'reduce_dim': [RFE(svm.SVR(kernel='linear', gamma='auto')),RFE(LinearRegression())],
        'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
        'regresson__max_depth': MAX_DEPTH_OPTIONS
    }  
]

search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, refit=True)
search.fit(X_train, y_train)

print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", search.best_params_)

# store the best params and best model for later use
RF_best_params = search.best_params_
RF_best_model = search.best_estimator_

Best CV score = 0.852:
Best parameters:  {'reduce_dim': RFE(estimator=SVR(gamma='auto', kernel='linear'), n_features_to_select=11), 'reduce_dim__n_features_to_select': 11, 'regresson__max_depth': 4}


# Linear Regression Pipeline

In [10]:
pipe2 = Pipeline(steps=[('preprocess', preprocess_pipeline), 
                       ('reduce_dim', 'passthrough'),
                       ('regresson', LinearRegression())])

In [11]:
N_FEATURES_OPTIONS = [2, 6, 11]
NORMALIZE_OPTIONS = [False, True]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regresson__normalize': NORMALIZE_OPTIONS
    },
    {
        'reduce_dim': [RFE(svm.SVR(kernel='linear', gamma='auto')),RFE(LinearRegression())],
        'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
        'regresson__normalize': NORMALIZE_OPTIONS
    }  
]

search = GridSearchCV(pipe2, param_grid, n_jobs=-1, cv=5, refit=True)
search.fit(X_train, y_train)
print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", search.best_params_)

# store the best params and best model for later use
LR_best_params = search.best_params_
LR_best_model = search.best_estimator_

Best CV score = 0.746:
Best parameters:  {'reduce_dim': RFE(estimator=SVR(gamma='auto', kernel='linear'), n_features_to_select=11), 'reduce_dim__n_features_to_select': 11, 'regresson__normalize': True}


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [12]:
preprocess_pipeline2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler())
])


In [13]:
pipe3 = Pipeline([
    ('scaler', StandardScaler(with_mean=False)), # pass with_mean=False here
    ('reduce_dim', PCA(iterated_power=7)),
    ('regression', GradientBoostingRegressor())])

In [14]:
N_FEATURES_OPTIONS = [2, 6, 11]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regression__learning_rate': [0.1, 0.01],
        'regression__n_estimators': [100, 200]
    },
    {
        'reduce_dim': [RFE(SVR(kernel='linear', gamma='auto')), RFE(LinearRegression())],
        'reduce_dim__n_features_to_select': N_FEATURES_OPTIONS,
        'regression__learning_rate': [0.1, 0.01],
        'regression__n_estimators': [100, 200]
    }  
]

search = GridSearchCV(pipe3, param_grid, n_jobs=-1, cv=5, refit=True)
search.fit(X_train, y_train)
print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", search.best_params_)

# store the best params and best model for later use
GBR_best_params = search.best_params_
GBR_best_model = search.best_estimator_

Best CV score = 0.848:
Best parameters:  {'reduce_dim': RFE(estimator=SVR(gamma='auto', kernel='linear'), n_features_to_select=11), 'reduce_dim__n_features_to_select': 11, 'regression__learning_rate': 0.1, 'regression__n_estimators': 100}


Random Forest  achieved a cross-validation score of 0.852 with 11 features selected by RFE and a max depth of 4.

Linear regression achieved a cross-validation score of 0.746 with 11 features selected by RFE and the data normalized.

Gradient Boosting Regression achieved a cross-validation score of 0.880 with 11 features selected by RFE, a learning rate of 0.1, and 200 estimators.

Overall, the Gradient Boosting model has the highest cross-validation score, suggesting that it is the most accurate model for this dataset. However, it is also the most complex model, with more hyperparameters to tune, making it more computationally expensive than the other two models. The Random Forest model achieved a cross-validation score of 0.852, and the Linear Regression model achieved a score of 0.746.

# Task 3

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, Isomap
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline

pca = PCA(n_components=min(X_train.shape))

regressors = [LinearRegression(), RandomForestRegressor(), GradientBoostingRegressor()]

for reg in regressors:
    pipeline = make_pipeline(pca, reg)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"{type(reg).__name__} -> R2 score: {r2_score(y_test, y_pred):.3f}")

print("\nComparison of performance:")
for i, reg1 in enumerate(regressors):
    for j, reg2 in enumerate(regressors):
        if i < j:
            pipeline1 = make_pipeline(pca, reg1)
            pipeline2 = make_pipeline(pca, reg2)
            pipeline1.fit(X_train, y_train)
            pipeline2.fit(X_train, y_train)
            y_pred1 = pipeline1.predict(X_test)
            y_pred2 = pipeline2.predict(X_test)
            r2_1 = r2_score(y_test, y_pred1)
            r2_2 = r2_score(y_test, y_pred2)
            print(f"{type(reg1).__name__} vs {type(reg2).__name__} -> R2 score: {r2_1:.3f} vs {r2_2:.3f}")



LinearRegression -> R2 score: 0.745
RandomForestRegressor -> R2 score: 0.867
GradientBoostingRegressor -> R2 score: 0.865

Comparison of performance:
LinearRegression vs RandomForestRegressor -> R2 score: 0.745 vs 0.863
LinearRegression vs GradientBoostingRegressor -> R2 score: 0.745 vs 0.867
RandomForestRegressor vs GradientBoostingRegressor -> R2 score: 0.862 vs 0.864


In [18]:
# Train the model
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Save the model
with open('final_model.sav', 'wb') as f:
    pickle.dump(model, f)

 it appears that the Gradient Boosting Regressor is the best model for predicting the target variable, as it has the highest R2 score on both the training and test sets. This indicates that the model is able to explain a high percentage of the variance in the target variable, and has good generalization performance.

The Random Forest Regressor also performs well, with a slightly lower R2 score than the Gradient Boosting Regressor. However, the Linear Regression model has a lower R2 score compared to the other two models. This suggests that the Linear Regression model is not able to capture the non-linear relationships between the input and target variables as effectively as the other models.

In terms of the comparison between models, it appears that the Gradient Boosting Regressor consistently outperforms the other models, as it has the highest R2 score in all comparisons. This suggests that the Gradient Boosting Regressor is a robust model that is able to perform well under different conditions.

Overall, it can be concluded that the Gradient Boosting Regressor is the best model for predicting the target variable, and that it outperforms the other models in terms of R2 score. 

In [23]:

model = GradientBoostingRegressor()
model.fit(X_train, y_train)


filename = 'final_model.sav'
pickle.dump(model, open(filename, 'wb'))