# About the Diabete dataset

This a REGRESSION problem.
Ten numeric predictive variables: age, sex, body mass index, average blood pressure, and six blood serum measurements. They were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure (integer between 25 and 346) of disease progression one year after baseline.

## The goal is to predict as well as possible the future disease progression one year after (target value) as a function of the 10 predictive variables.

Note: Each of the 10 feature variables have been mean centered and scaled by the standard deviation times n_samples (i.e. the sum of squares of each column totals 1).


## Loading and pre-processing Diabete dataset



In [1]:
#Packages
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets


# Load the diabetes dataset
diabetes = datasets.load_diabetes()

# Print dataset description
print(diabetes.DESCR)

# Input vectors
diabetes_X = diabetes.data

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]


Diabetes dataset

Notes
-----

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

Data Set Characteristics:

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attributes:
    :Age:
    :Sex:
    :Body mass index:
    :Average blood pressure:
    :S1:
    :S2:
    :S3:
    :S4:
    :S5:
    :S6:

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani

## Baseline method: linear regression

In [63]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# The coefficients
print('Coefficients: \n', regr.coef_)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The mean squared error on test set
print("Mean squared error (on test set): %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score (max_value=1 for perfect prediction): %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))


Coefficients: 
 [ 3.03499549e-01 -2.37639315e+02  5.10530605e+02  3.27736980e+02
 -8.14131709e+02  4.92814588e+02  1.02848452e+02  1.84606489e+02
  7.43519617e+02  7.60951722e+01]
Mean squared error (on test set): 2004.57
Variance score (max_value=1 for perfect prediction): 0.59


## Support Vector Regression (SVR) 

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.6)

parameters = {'kernel':('linear', 'rbf','poly','sigmoid'), 
              'C':[10, 100, 200,1000],
             'gamma':[0.1, 1, 3, 5, 10],
             'degree':[3,4,6,8,6],
             'shrinking':[True, False] }

svr=GridSearchCV(SVR(), parameters, cv=5, verbose=0)

svr.fit(diabetes_X_train,diabetes_y_train)
best_params=svr.best_params_
print('Best parameters')
print()
print(svr.best_params_)

svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
svr_sigmoid=SVR(kernel='sigmoid',gamma=0.1,tol=1e-5)

#Choix du regresseur SVR
svr_num=0
#J'ai pas trouvé de switch case :(

if svr_num == 1:  
    svr = svr_rbf
elif svr_num == 2:
     svr = svr_lin
elif svr_num == 2:
    svr = svr_poly 
elif svr_num == 2:
    svr = svr_sigmoid;
else:
    svr = svr_rbf
    
svr=SVR(kernel=best_params['kernel'],C=best_params['C'],gamma=best_params['gamma'],
        degree=best_params['degree'],shrinking=best_params['shrinking'])
       
y_pred = svr.fit(diabetes_X_train, diabetes_y_train).predict(diabetes_X_train)
score_svr=svr.score(diabetes_X_test,diabetes_y_test)
score_reg=regr.score(diabetes_X_test,diabetes_y_test)

print("SVR R^2= ",score_svr)
print("Baseline method R^2= ",score_reg)

In [122]:
plt.figure()
plt.imshow(svr.cv_results_['mean_test_score'].reshape(len(parameters['kernel']), 
                                                      len(parameters['C'])), interpolation='none')
plt.xlabel('C', fontsize=14)
plt.ylabel('noyau', fontsize=14)
plt.title("Score", fontsize=14)
plt.xticks(np.arange(len(parameters['C'])), parameters['C'], rotation=45)
plt.yticks(np.arange(len(parameters['kernel'])), parameters['kernel'], rotation=45)
plt.colorbar()

plt.figure()
plt.imshow(svr.cv_results_['mean_test_score'].reshape(len(parameters['kernel']), 
                                                      len(parameters['gamma'])), interpolation='none')
plt.xlabel('gamma', fontsize=14)
plt.ylabel('noyau', fontsize=14)
plt.title("Score", fontsize=14)
plt.xticks(np.arange(len(parameters['gamma'])), parameters['gamma'], rotation=45)
plt.yticks(np.arange(len(parameters['kernel'])), parameters['kernel'], rotation=45)
plt.colorbar()

AttributeError: 'SVR' object has no attribute 'cv_results_'

<Figure size 432x288 with 0 Axes>

In [None]:
from sklearn.model_selection import learning_curve

plt.figure()
for i in range(np.size(diabetes_X_train,1)):
    plt.subplot(2,5,i+1)
    plt.scatter(diabetes_X_train[:,i],diabetes_y_train,label='Training data')
    plt.scatter(diabetes_X_train[:,i],y_pred,label='Model')
    plt.legend()
plt.show()

regr = linear_model.LinearRegression()
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, diabetes.data, diabetes.target, train_sizes=np.linspace(0.1, 1, 50),
                   scoring="neg_mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(regr, diabetes.data, diabetes.target, train_sizes=np.linspace(0.1, 1, 50),
                   scoring="neg_mean_squared_error", cv=10)

plt.figure()
plt.plot(train_sizes, -test_scores_svr.mean(1),'--', color="r",
             label="SVR")
plt.plot(train_sizes, -test_scores_kr.mean(1), '--', color="g",
             label="REGR")
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")

plt.show()

print("SVR R^2= ",score_svr)
print("Baseline method R^2= ",score_reg)



{'mean_fit_time': array([0.00702755, 0.00882999, 0.00066431, 0.00547306, 0.00402554,
        0.00627796, 0.005229  , 0.00272799, 0.01321133, 0.00687083,
        0.00558249, 0.00338777, 0.0123992 , 0.        , 0.        ,
        0.01319671, 0.00505757, 0.0068229 , 0.01010521, 0.01306931,
        0.01159398, 0.01180832, 0.00737079, 0.01528478, 0.02498881,
        0.01609031, 0.00598184, 0.00372736, 0.00962512, 0.02766991,
        0.00553656, 0.02211817]),
 'std_fit_time': array([0.00337985, 0.00228776, 0.00093948, 0.0038784 , 0.00328677,
        0.00374999, 0.00603935, 0.00176841, 0.00951226, 0.00652433,
        0.00789484, 0.00410496, 0.00203034, 0.        , 0.        ,
        0.0098726 , 0.00138544, 0.00708081, 0.00715766, 0.00539561,
        0.0059994 , 0.00539107, 0.0021871 , 0.00151316, 0.01253466,
        0.0021327 , 0.00427923, 0.00268905, 0.00544379, 0.00710565,
        0.00782987, 0.00379909]),
 'mean_score_time': array([0.00202147, 0.00360211, 0.005392  , 0.00591874, 0.002960

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [133]:
f=open("text.txt",'w')
for i in range(10):
    data=diabetes.data[:,i]
    print(data)
    f.write(str(data)+'\n')
f.close

[ 0.03807591 -0.00188202  0.08529891 -0.08906294  0.00538306 -0.09269548
 -0.04547248  0.06350368  0.04170844 -0.07090025 -0.09632802  0.02717829
  0.01628068  0.00538306  0.04534098 -0.05273755 -0.00551455  0.07076875
 -0.0382074  -0.02730979 -0.04910502 -0.0854304  -0.0854304   0.04534098
 -0.06363517 -0.06726771 -0.10722563 -0.02367725  0.05260606  0.06713621
 -0.06000263 -0.02367725  0.03444337  0.03081083  0.01628068  0.04897352
  0.01264814 -0.00914709 -0.00188202 -0.00188202  0.00538306 -0.09996055
 -0.06000263  0.01991321  0.04534098  0.02717829 -0.05637009 -0.07816532
  0.06713621 -0.04183994  0.03444337  0.05987114 -0.05273755 -0.00914709
 -0.04910502 -0.04183994 -0.04183994 -0.02730979  0.04170844  0.06350368
 -0.07090025 -0.04183994 -0.02730979 -0.03457486  0.06713621 -0.04547248
 -0.00914709  0.04170844  0.03807591  0.01628068 -0.00188202 -0.00188202
  0.06350368  0.01264814  0.01264814 -0.00914709 -0.03094232 -0.09632802
  0.00538306 -0.10359309  0.07076875  0.01264814 -0

<function TextIOWrapper.close()>