# Predicting Diabetes

### Paulo C. Rios Jr.  | Oct 23, 2017

In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# sklearn.model_selection.KFold is its replacement. The legacy import is retained only
# for any cell that still references it.
from sklearn import cross_validation
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split



In [3]:
# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

In [4]:
# Inspect which fields the loaded dataset object exposes
diabetes.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names'])

In [5]:
# Use ALL features as predictors (the full data matrix, not a single column),
# with the dataset's target as the response variable
diabetes_X = diabetes.data
diabetes_Y = diabetes.target

In [6]:
# Split the data into training/testing sets (old fixed split: last 20 rows as test; superseded by train_test_split below)
#diabetes_X_train = diabetes_X[:-20]
#diabetes_X_test = diabetes_X[-20:]

In [7]:
# Split the targets into training/testing sets (old fixed split: last 20 rows as test; superseded by train_test_split below)
#diabetes_y_train = diabetes.target[:-20]
#diabetes_y_test = diabetes.target[-20:]

In [8]:
# Randomly hold out 20% of the rows as the test set; the fixed random_state
# makes the split reproducible across runs.
(diabetes_X_train,
 diabetes_X_test,
 diabetes_y_train,
 diabetes_y_test) = train_test_split(
    diabetes_X,
    diabetes_Y,
    test_size=0.2,
    random_state=1,
)

In [9]:
# Create the linear regression estimator (default settings; not fitted yet)
regr = linear_model.LinearRegression()

In [10]:
# Simple K-Fold cross validation over the training rows. 5 folds, unshuffled.
# Built with sklearn.model_selection.KFold (the legacy cross_validation.KFold was
# removed from scikit-learn) and materialised as a list of
# (train_indices, test_indices) pairs so it can be iterated directly and re-used.
cv = list(KFold(n_splits=5).split(diabetes_X_train))

In [11]:
# Per-fold validation error: mean squared error on the held-out part of each fold.
results = []
for traincv, testcv in cv:
    # Fit on the fold's training rows, then score on the rows that were held out.
    regr.fit(diabetes_X_train[traincv], diabetes_y_train[traincv])
    fold_pred = regr.predict(diabetes_X_train[testcv])
    results.append(mean_squared_error(diabetes_y_train[testcv], fold_pred))
# NOTE: regr is re-fitted in place on every iteration, so after the loop it holds
# the model trained on the last fold's training rows only.

In [12]:
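# Train the model using the full training set.
# NOTE: this fit is currently disabled, so the cells below use `regr` as left by the
# last cross-validation fold (fitted on only part of the training rows).
# TODO(review): re-enable this fit and re-run the cells below to refresh their outputs.
#regr.fit(diabetes_X_train, diabetes_y_train)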

In [13]:
# Make predictions on the training set with the current state of regr
# (only referenced by the commented-out training plot)
diabetes_y_pred_train = regr.predict(diabetes_X_train)

In [15]:
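# Plot outputs of train group (disabled: this x-vs-y plot was written for a single feature, but diabetes_X_train now holds all features)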
#plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
#plt.plot(diabetes_X_train, diabetes_y_pred_train, color='blue', linewidth=3)
#plt.show()

In [16]:
# Make predictions on the held-out test set; these feed the error metrics printed later
diabetes_y_pred = regr.predict(diabetes_X_test)

In [17]:
# The fitted coefficients of the linear model (one per input feature)
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [  10.93906701 -258.5357875   519.26054214  383.44996007 -608.40127912
  388.39622252 -132.25378375   31.46111047  577.42866905   67.04398482]


In [18]:
# Mean squared error of the predictions on the test set (lower is better)
print(
    "Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred)
)

Mean squared error: 3047.17


In [19]:
# R^2 (coefficient of determination) on the test set: 1 is perfect prediction.
# Note: r2_score is not the explained-variance score, despite the printed label.
print('Variance score: %.2f' % r2_score(diabetes_y_test, 
                                        diabetes_y_pred))

Variance score: 0.43


In [None]:
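# Plot outputs of test group (disabled: this x-vs-y plot was written for a single feature, but diabetes_X_test now holds all features)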
#plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
#plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
#plt.show()

### Exercício - Melhore a modelagem

1. Divida o conjunto em train e test sets aleatoriamente usando 20% para teste e veja como ficou agora a modelagem

2. Visualize a modelagem no conjunto de treinamento

3. Use validação cruzada e veja como fica agora a modelagem

4. Use todas as variáveis, repita os passos acima e veja como ficou agora a modelagem
