## Training multiple ML models (Regression)

In [41]:
# import all the libraries
from myimports import *
from app import df
print('imported')

imported


In [42]:
# Pipeline
from sklearn.pipeline import Pipeline
# Mutual information
from sklearn.feature_selection import mutual_info_regression
# Train, validtion, test split
from sklearn.model_selection import train_test_split
# Loss calculation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
# Polynomial features
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# Models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import xgboost as xgb

In [43]:
# Data
y = df['Conc']
X = df.drop('Conc', axis=1)

In [44]:
X.dtypes

Date        object
#Image      object
Path        object
Peaks_B      int64
Peaks_G      int64
Peaks_R      int64
Peaks_H      int64
Peaks_S      int64
Peaks_V      int64
Peaks_L      int64
Peaks_a      int64
Peaks_b      int64
pkPxls_B     int64
pkPxls_G     int64
pkPxls_R     int64
pkPxls_H     int64
pkPxls_S     int64
pkPxls_L     int64
pkPxls_V     int64
pkPxls_a     int64
pkPxls_b     int64
dtype: object

In [45]:
# label encoding for categorials
for colname in X.select_dtypes('object'):
    X[colname], _ = X[colname].factorize()

# discrete values are of type int
discrete_features = X.dtypes == int

In [46]:
y.dtypes # it is float

dtype('O')

In [47]:
# Finding mutual information
def make_mi_scores(X, y, discrete_feeatures):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name='MI scores', index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)
print(mi_scores)

pkPxls_G    1.185401
Path        1.138760
Peaks_B     0.928093
pkPxls_b    0.903410
pkPxls_V    0.879997
pkPxls_S    0.820708
pkPxls_R    0.808008
pkPxls_L    0.807249
pkPxls_B    0.718399
Peaks_a     0.678103
Peaks_G     0.677614
Peaks_L     0.676159
Peaks_S     0.628196
Peaks_R     0.595199
pkPxls_H    0.585760
Peaks_b     0.509953
Peaks_H     0.497665
pkPxls_a    0.458927
Peaks_V     0.451464
Date        0.041355
#Image      0.000000
Name: MI scores, dtype: float64


In [20]:
# splitting the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [21]:
# Define the maximum degree of polynomial features you want to explore
max_degree = 5

In [22]:
# models
models = {
    'Polynomial Regression': LinearRegression(),
    'Polynomial SVM': SVR(kernel='poly'),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', max_depth = 3)
}

In [23]:
for degree in range(1, max_degree+1):
    poly = PolynomialFeatures(degree=degree, include_bias = False)
    X_train_poly = poly.fit(X_train)
    X_val_poly = poly.transform(X_val)
    X_test_poly = poly.transform(X_test)
    X_val_poly = poly.transform(X_val)
    X_test_poly = poly.transform(X_test)

    # Loop through models
    for model_name, model in models.items():
        # train the model
        model.fit(X_train_poly, y_train)
        # make predictions on the validation set
        y_val_pred = model.predict(X_val_poly)
        # calculate evaluation metrics
        mse = MSE(y_val, y_val_pred)
        r2 = r2_score(y_val, y_val_pred)
        print(f'Degree:{degree}, Model: {model_name}')
        print(f'MSE: {mse}, R-squared: {r2}')
        print('-'*15)


ValueError: could not convert string to float: '2.jpg'