# Second Exercise
***
## Importing libraries and loading the diabetes dataset

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd 
import numpy as np

In [36]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the scikit-learn diabetes dataset; the next cell reads its
# .data, .feature_names and .target attributes.
diabetes = load_diabetes()

In [4]:
# Wrap the feature matrix in a DataFrame with named columns and keep the
# response vector in its own variable.
df = pd.DataFrame(data=diabetes["data"], columns=diabetes["feature_names"])
target = diabetes["target"]

In [5]:
# Inspect column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


## Apply scaling to all the values
(even if they already seem to be scaled)

In [55]:
# Features and response used by the models below; copy so `df` stays untouched.
X, y = df.copy(), target

# Standardise every feature column.
# NOTE(review): the scaler (and the PCA below) are fitted on the full dataset
# before any cross-validation, so held-out rows influence the transform.
# Consider wrapping scaler + model in a Pipeline — confirm whether this matters here.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# A float n_components asks PCA to keep enough components to explain that
# fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

In [56]:
# Second dimension = number of principal components PCA kept.
X_pca.shape

(442, 8)

In [57]:
# Shape of the scaled feature matrix, for comparison with the PCA output.
X_scaled.shape

(442, 10)

## Now create a function that uses cross-validation to find the best train/test split for a model

In [7]:
def find_best_R_Model(model, X, y):
    """Refit ``model`` on the cross-validation fold where it scored highest.

    The model is scored with 5-fold ``cross_val_score`` using the estimator's
    default scorer (R^2 for scikit-learn regressors, despite the word
    "accuracy" in the printed message). The best-scoring fold is then rebuilt
    with an unshuffled ``KFold`` and the model is fitted on that fold's
    training rows and evaluated on its held-out rows.

    Note: the rebuilt split only matches the scored split when
    ``cross_val_score`` itself uses an unshuffled ``KFold``, which is the case
    for regressors; for classifiers it would use a stratified split instead.

    Parameters
    ----------
    model : estimator
        scikit-learn style estimator. It is fitted by this function.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Both the scoring and the final refit use this
        argument (previously the refit silently used the global ``X_scaled``).
    y : array-like of shape (n_samples,)
        Target values aligned with the rows of ``X``.

    Returns
    -------
    ML : estimator
        The object returned by ``model.fit`` on the best fold's training rows.
    df_res : pandas.DataFrame
        Columns ``y_pred`` and ``y_test`` for the best fold's held-out rows.
    """
    n_folds = 5

    # The folds are selected by integer position, so work on plain arrays:
    # DataFrames / Series would interpret the indices as labels.
    X = np.asarray(X)
    y = np.asarray(y)

    vals = cross_val_score(model, X, y, cv=n_folds)

    # Index of the best-scoring fold (first one wins on ties).
    p = int(np.argmax(vals))
    best_ac = vals[p]

    # Rebuild the same unshuffled folds and pick the winning one.
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_index, test_index = list(kf.split(X))[p]
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    ML = model.fit(X_train, y_train)
    y_pred = ML.predict(X_test)
    df_res = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test})
    print(f'The best model has an accuracy of {best_ac}')
    return ML, df_res

## Now manually try the function with different models until we find the best one

In [58]:
# Sweep the Ridge regularisation strength alpha over 1, 51, 101, ..., 951.
# find_best_R_Model prints the best fold score first; the alpha it belongs to
# is printed on the line after it.
for n in range(1, 1000, 50):
    find_best_R_Model(Ridge(alpha=n), X_scaled, y)
    print(f'alpha ={n}')

The best model has an accuracy of 0.5485373610813955
alpha =1
The best model has an accuracy of 0.5357060678852317
alpha =51
The best model has an accuracy of 0.5245858518291606
alpha =101
The best model has an accuracy of 0.5120733850136825
alpha =151
The best model has an accuracy of 0.4991864694815885
alpha =201
The best model has an accuracy of 0.48643852956039835
alpha =251
The best model has an accuracy of 0.47407467153756644
alpha =301
The best model has an accuracy of 0.4622033050841695
alpha =351
The best model has an accuracy of 0.45086249360170905
alpha =401
The best model has an accuracy of 0.4400540399120674
alpha =451
The best model has an accuracy of 0.42976143020446966
alpha =501
The best model has an accuracy of 0.4199594519260217
alpha =551
The best model has an accuracy of 0.41061939273299375
alpha =601
The best model has an accuracy of 0.4017241366957719
alpha =651
The best model has an accuracy of 0.3933479863928647
alpha =701
The best model has an accuracy of 0.38

In [59]:
# Lasso with a fixed alpha of 0.1 on the scaled features.
find_best_R_Model(Lasso(alpha=0.1), X_scaled, y)

The best model has an accuracy of 0.5475554255992466


(Lasso(alpha=0.1),
         y_pred  y_test
 0   193.214191   272.0
 1   132.546883    60.0
 2    92.014934    54.0
 3   198.369825   221.0
 4    57.022838    90.0
 ..         ...     ...
 83  189.961941   178.0
 84  103.340345   104.0
 85  122.859336   132.0
 86  209.093838   220.0
 87   51.950161    57.0
 
 [88 rows x 2 columns])

In [60]:
# Plain least-squares baseline on the scaled features.
find_best_R_Model(LinearRegression(), X_scaled, y)

The best model has an accuracy of 0.5502483366517519


(LinearRegression(),
         y_pred  y_test
 0   191.426638   272.0
 1   133.050924    60.0
 2    91.974891    54.0
 3   198.853295   221.0
 4    54.991185    90.0
 ..         ...     ...
 83  190.315527   178.0
 84  102.829586   104.0
 85  121.909659   132.0
 86  209.997683   220.0
 87   53.805908    57.0
 
 [88 rows x 2 columns])

In [62]:
# Hyper-parameter grid searched for the SVR.
# NOTE: `gamma` is only used by the 'rbf' kernel here; for kernel='linear'
# the different gamma values repeat the same fit.
params_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
          'gamma': [0.0001, 0.001, 0.01, 0.1],
          'kernel':['linear','rbf'] }

svr = SVR()
grid_search = GridSearchCV(svr, params_grid, cv=5, n_jobs=-1)

grid_search.fit(X_scaled, y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Mejores parámetros:", best_params)
print("Mejor modelo:", best_model)
print("Mejor puntuación:", best_score)

# Build a fresh SVR from the parameters the search actually selected instead
# of hard-coding them, so this call stays correct if the grid or data change.
find_best_R_Model(SVR(**best_params), X_scaled, y)

Mejores parámetros: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Mejor modelo: SVR(C=100, gamma=0.01)
Mejor puntuación: 0.492289724463051
The best model has an accuracy of 0.549791630890502


(SVR(C=100, gamma=0.01),
         y_pred  y_test
 0   201.587839   272.0
 1   112.636412    60.0
 2    80.480533    54.0
 3   177.510130   221.0
 4    70.461327    90.0
 ..         ...     ...
 83  188.080820   178.0
 84   98.633282   104.0
 85  113.751034   132.0
 86  196.851921   220.0
 87   70.863577    57.0
 
 [88 rows x 2 columns])

In [64]:
# Fix the forest's random seed so the scores and predictions are reproducible
# across kernel restarts (an unseeded forest gives different results each run).
find_best_R_Model(RandomForestRegressor(random_state=42), X_scaled, y)

The best model has an accuracy of 0.5011297461174233


(RandomForestRegressor(),
     y_pred  y_test
 0    80.84   111.0
 1    82.35    98.0
 2   147.13   164.0
 3   163.58    48.0
 4    73.44    96.0
 ..     ...     ...
 84  105.45   101.0
 85  177.77   151.0
 86  121.82   127.0
 87  173.15   237.0
 88  210.66   225.0
 
 [89 rows x 2 columns])