In [295]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.spatial
from collections import Counter
from sklearn.model_selection import learning_curve 
from sklearn.metrics import make_scorer
from sklearn.linear_model import \
    Lasso, Ridge, LassoCV,LinearRegression
from sklearn.preprocessing import \
    StandardScaler, PolynomialFeatures
from sklearn.model_selection import \
    KFold, RepeatedKFold, GridSearchCV, \
    cross_validate, train_test_split
%matplotlib inline

In [296]:
df = pd.read_csv(r"C:\Users\nikit\Documents\machine-learning\1-lab\winequalityN_preprocessed.csv", sep=",")
df

Unnamed: 0,Тип вина,Фиксированная кислотность,Летучая кислотность,Лимонная кислота,Остаточный сахар,Хлориды,Свободный диоксид серы,Общий диоксид серы,Плотность вина,pH,Сульфаты,Содеражние алкоголя,Качество вина
0,0,8.1,0.240,0.26,11.0,0.043,41.0,211.0,0.99676,3.11,0.49,10.0,6
1,0,8.6,0.200,0.42,1.5,0.041,35.0,125.0,0.99250,3.11,0.49,11.4,7
2,1,10.7,0.670,0.22,2.7,0.107,17.0,34.0,1.00040,3.28,0.98,9.9,6
3,1,7.9,0.340,0.36,1.9,0.065,5.0,10.0,0.99419,3.27,0.54,11.2,7
4,1,10.3,0.410,0.42,2.4,0.213,6.0,14.0,0.99940,3.19,0.62,9.5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6458,0,7.2,0.210,1.00,1.1,0.154,46.0,114.0,0.99310,2.95,0.43,9.2,6
6459,0,6.9,0.190,0.35,1.7,0.036,33.0,101.0,0.99315,3.21,0.54,10.8,7
6460,1,9.0,0.450,0.49,2.6,0.084,21.0,75.0,0.99870,3.35,0.57,9.7,5
6461,0,9.4,0.280,0.30,1.6,0.045,36.0,139.0,0.99534,3.11,0.49,9.3,5


## Нормализация и подготовка данных

In [297]:
df_edited = df.drop(['Содеражние алкоголя'], axis=1)
X = df_edited.values
y = df.iloc[:, -2].values
y

array([10. , 11.4,  9.9, ...,  9.7,  9.3, 10.3])

In [298]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [299]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Подбор гиперпараметров для линейной регрессии

In [300]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {"eta0": [ .001, .003, .01, .03, .1, .3, 1, 3]} 
clf = SGDRegressor(tol=.0001) 
n_iter_search = 8 
random_search = RandomizedSearchCV(clf, 
                                   param_distributions=param_dist, 
                                   n_iter=n_iter_search, 
                                   cv=3, 
                                   scoring='neg_mean_squared_error') 
random_search.fit(X_train, y_train) 

print("Лучшие параметры: {}".format(random_search.best_params_)) 
print("Лучшая оценка MSE: {}".format(random_search.best_score_))

Лучшие параметры: {'eta0': 0.003}
Лучшая оценка MSE: -0.2652941872616004


## Линейная регрессия

###  [Источник](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html)

In [301]:
clf = SGDRegressor(tol=.001, eta0=0.01) 
clf.fit(X_train, y_train)

SGDRegressor()

In [302]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

MSE:  0.22721691379172404
RMSE: 0.476672753355721
MAE:  0.35244543837297776


In [337]:
from sklearn.metrics import r2_score
print("R2: ", r2_score(y_test, y_pred))

R2:  0.8391266836497246


In [304]:
y_pred

array([11.75527296, 11.91923631, 10.512787  , ..., 10.26010139,
        9.3987487 , 12.67568494])

## Регрессия дерева решений

In [305]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [306]:
y_pred = regressor.predict(X_test)

In [307]:
df=pd.DataFrame({'Содержание алкоголя':y_test, 'Предсказание':y_pred})
df

Unnamed: 0,Содержание алкоголя,Предсказание
0,11.4,11.8
1,12.0,12.0
2,10.8,11.7
3,8.7,9.0
4,12.8,13.0
...,...,...
1288,12.5,11.0
1289,11.2,11.2
1290,9.9,9.9
1291,9.2,9.3


In [308]:
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

MSE:  0.32614741770940103
RMSE: 0.5710931777822259
MAE:  0.33793503479040987


In [336]:
print("R2: ", r2_score(y_test, y_pred))

R2:  0.8391266836497246


## Lasso

In [310]:
clf = linear_model.LassoCV()
clf.fit(X_train, y_train)

LassoCV()

In [311]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

MSE:  0.22340981744044067
RMSE: 0.4726624772926667
MAE:  0.3469065378577377


In [335]:
print("R2: ", r2_score(y_test, y_pred))

R2:  0.8391266836497246


In [313]:
df=pd.DataFrame({'Содержание алкоголя':y_test, 'Предсказание':y_pred})
df

Unnamed: 0,Содержание алкоголя,Предсказание
0,11.4,11.745868
1,12.0,11.872220
2,10.8,10.459933
3,8.7,9.412428
4,12.8,12.193295
...,...,...
1288,12.5,10.528099
1289,11.2,11.245661
1290,9.9,10.258426
1291,9.2,9.410978


## Гребневая регрессия

In [314]:
from sklearn.linear_model import RidgeCV

clf = linear_model.RidgeCV()
clf.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [315]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

MSE:  0.22442249611281304
RMSE: 0.4737325153636945
MAE:  0.3485937136319756


In [334]:
print("R2: ", r2_score(y_test, y_pred))

R2:  0.8391266836497246


In [317]:
df=pd.DataFrame({'Содержание алкоголя':y_test, 'Предсказание':y_pred})
df

Unnamed: 0,Содержание алкоголя,Предсказание
0,11.4,11.744449
1,12.0,11.865922
2,10.8,10.470379
3,8.7,9.423969
4,12.8,12.182821
...,...,...
1288,12.5,10.528480
1289,11.2,11.248846
1290,9.9,10.268056
1291,9.2,9.412825


## Elastic Net регрессия

In [318]:
from sklearn.linear_model import ElasticNetCV
clf = linear_model.ElasticNetCV()
clf.fit(X_train, y_train)

ElasticNetCV()

In [319]:
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

MSE:  0.22378216859996183
RMSE: 0.47305620025527817
MAE:  0.34751133764567266


In [333]:
print("R2: ", r2_score(y_test, y_pred))

R2:  0.8391266836497246


In [321]:
df=pd.DataFrame({'Содержание алкоголя':y_test, 'Предсказание':y_pred})
df

Unnamed: 0,Содержание алкоголя,Предсказание
0,11.4,11.746795
1,12.0,11.871460
2,10.8,10.464831
3,8.7,9.416065
4,12.8,12.189735
...,...,...
1288,12.5,10.528938
1289,11.2,11.246448
1290,9.9,10.261248
1291,9.2,9.409978


## Реализация алгоритма линейной регрессии

In [326]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [340]:
X

array([[ 0.   ,  8.1  ,  0.24 , ...,  3.11 ,  0.49 ,  6.   ],
       [ 0.   ,  8.6  ,  0.2  , ...,  3.11 ,  0.49 ,  7.   ],
       [ 1.   , 10.7  ,  0.67 , ...,  3.28 ,  0.98 ,  6.   ],
       ...,
       [ 1.   ,  9.   ,  0.45 , ...,  3.35 ,  0.57 ,  5.   ],
       [ 0.   ,  9.4  ,  0.28 , ...,  3.11 ,  0.49 ,  5.   ],
       [ 0.   ,  4.8  ,  0.225, ...,  3.31 ,  0.4  ,  6.   ]])

In [344]:
class LinearRegression() :
      
    def __init__(self, learning_rate, iterations) :       
        self.learning_rate = learning_rate         
        self.iterations = iterations
          
    # Обучение модели              
    def fit(self, X, y) :                   
        self.m, self.n = X.shape
          
        # Инициализация весов          
        self.W = np.zeros(self.n)         
        self.b = 0          
        self.X = X          
        self.y = y
                     
        for i in range(self.iterations) :          
            self.update_weights()             
        return self
      
    def update_weights(self) :             
        y_pred = self.predict(self.X)         
        # подсчет градиентов        
        dW = - ( 2 * ( self.X.T ).dot( self.y - y_pred )  ) / self.m       
        db = - 2 * np.sum( self.y - y_pred ) / self.m    
        
        # пересчет весов      
        self.W = self.W - self.learning_rate * dW      
        self.b = self.b - self.learning_rate * db
          
        return self
      
    # Функция предсказания   
    def predict( self, X ) :
      
        return X.dot( self.W ) + self.b

In [339]:
model = LinearRegression(iterations = 1000, learning_rate = 0.01)
model.fit(X_train, y_train)
      
# Предсказания
  
y_pred = model.predict(X_test)
      
mse = mean_squared_error(y_test, y_pred) 
print("MSE: ",mse)
print("RMSE: {}".format(np.sqrt(mse)))
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ",mae)

print("R2: ", r2_score(y_test, y_pred))

MSE:  0.22986985577291963
RMSE: 0.47944744839546244
MAE:  0.3770361587579432
R2:  0.8391266836497246
