[https://habr.com/ru/company/mlclass/blog/247751/]
#### Введение в машинное обучение с помощью Python и Scikit-Learn

##### pima-indians-diabetes
6     Pregnancies
148   Glucose
72    BloodPressure
35    SkinThickness
0     Insulin
33.6  BMI
0.627 DiabetesPedigreeFunction
50    Age
1     Class
Этот набор данных описывает медицинские записи индейцев пима и показывает, будет ли у каждого пациента возникать диабет в течение пяти лет.
Описание полей следующее:
preg = количество беременных
Plas = концентрация глюкозы в плазме через 2 часа при оральном тесте на толерантность к глюкозе
прес = диастолическое артериальное давление (мм рт.ст.)
Кожа = Толщина кожной складки трицепса (мм)
тест = 2-часовой сывороточный инсулин (мю Ед / мл)
масса = индекс массы тела (вес в кг / (рост в м) ^ 2)
pedi = диабет родословной
возраст = возраст (лет)
класс = переменная класса (1: положительный тест на диабет, 0: отрицательный тест на диабет)

В первую очередь данные необходимо загрузить в оперативную память, чтобы мы имели возможность работать с ними. Сама библиотека Scikit-Learn использует в своей реализации NumPy массивы, поэтому будем загружать *.csv файлы средствами NumPy. Загрузим один из датасетов из репозитория UCI Machine Learning Repository:

In [1]:
import numpy as np
import pandas as pd
from urllib.request import urlopen
# url with dataset
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
raw_data = pd.read_csv('data/pima-indians-diabetes.csv')
# download the file
# raw_data = urlopen(url)
# load the CSV file as a numpy matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import pandas_profiling
# pandas_profiling.ProfileReport(raw_data)
raw_data.describe()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.842243,120.859192,69.101695,20.517601,79.90352,31.990482,0.471674,33.219035,0.34811
std,3.370877,31.978468,19.368155,15.954059,115.283105,7.889091,0.331497,11.752296,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.371,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [3]:
raw_data.head
dataset = raw_data
# dataset.head()
# dataset.describe()

In [13]:
# dataset = np.loadtxt(raw_data, delimiter=",")
dataset = pd.DataFrame(raw_data)
# separate the data from the target attributes
X = dataset.iloc[:,:7]
# X = dataset[:,:7]
y = dataset.iloc[:,8]
# y = dataset[:,:8]

Unnamed: 0,6,148,72,35,0,33.6,0.627
0,1,85,66,29,0,26.6,0.351
1,8,183,64,0,0,23.3,0.672
2,1,89,66,23,94,28.1,0.167
3,0,137,40,35,168,43.1,2.288
4,5,116,74,0,0,25.6,0.201
...,...,...,...,...,...,...,...
762,10,101,76,48,180,32.9,0.171
763,2,122,70,27,0,36.8,0.340
764,5,121,72,23,112,26.2,0.245
765,1,126,60,0,0,30.1,0.349


Далее во всех примерах будем работать с этим набором данных, а именно с матрицей обьект-признак X и значениями целевой переменной y.
#### Нормализация данных
Всем хорошо знакомо, что большинство градиентных методов (на которых по-сути и основаны почти все алгоритмы машинного обучения) сильно чуствительны к шкалированию данных. Поэтому перед запуском алгоритмов чаще всего делается либо нормализация, либо так называемая стандартизация. Нормализация предполагает замену номинальных признаков так, чтобы каждый из них лежал в диапазоне от 0 до 1. Стандартизация же подразумевает такую предобработку данных, после которой каждый признак имеет среднее 0 и дисперсию 1. В Scikit-Learn уже есть готовые для этого функции:

In [14]:
from sklearn import preprocessing
# normalize the data attributes
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)

In [15]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

[0.12604006 0.27987607 0.11985899 0.08670605 0.08579532 0.16101595
 0.14070756]


In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


In [17]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       500
           1       0.74      0.55      0.63       267

   micro avg       0.78      0.78      0.78       767
   macro avg       0.77      0.72      0.74       767
weighted avg       0.77      0.78      0.77       767

[[449  51]
 [120 147]]


In [18]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

GaussianNB(priors=None, var_smoothing=1e-09)
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       500
           1       0.69      0.60      0.64       267

   micro avg       0.77      0.77      0.77       767
   macro avg       0.75      0.73      0.73       767
weighted avg       0.76      0.77      0.76       767

[[429  71]
 [108 159]]


In [19]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       500
           1       0.77      0.63      0.69       267

   micro avg       0.81      0.81      0.81       767
   macro avg       0.79      0.77      0.78       767
weighted avg       0.80      0.81      0.80       767

[[449  51]
 [ 98 169]]


In [20]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       267

   micro avg       1.00      1.00      1.00       767
   macro avg       1.00      1.00      1.00       767
weighted avg       1.00      1.00      1.00       767

[[500   0]
 [  0 267]]


In [21]:
from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       267

   micro avg       1.00      1.00      1.00       767
   macro avg       1.00      1.00      1.00       767
weighted avg       1.00      1.00      1.00       767

[[500   0]
 [  0 267]]


In [23]:
import numpy as np
from sklearn.linear_model import Ridge
# from sklearn.grid_search import GridSearchCV  # OLD
from sklearn.model_selection import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
0.28158141247308044
1.0


In [25]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
# from sklearn.grid_search import GridSearchCV  # OLD
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B8262B0B00>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.28158140483146743
0.9996839908911314
