### Машинное обучение

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler

import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('default')

In [3]:
df = pd.read_csv('table_res.csv')

print(df.head())

   Unnamed: 0        Дата  ФУНТ СТЕРЛИНГОВ  ДИРХАМ ОАЭ  ДОЛЛАР США    ЕВРО  \
0           0  2015-09-02           359.98       63.97      234.94  264.85   
1           1  2015-09-03           368.86       65.66      241.15  272.07   
2           2  2015-09-04           365.68       65.28      239.76  269.37   
3           3  2015-09-05           365.08       65.24      239.60  266.91   
4           4  2015-09-06           365.08       65.24      239.60  266.91   

   РОССИЙСКИЙ РУБЛЬ  ТУРЕЦКАЯ ЛИРА  Размер ставки, % Коридор базовой ставки, %  
0              3.60          80.40              12.0                7,0 - 17,0  
1              3.61          81.85              12.0                7,0 - 17,0  
2              3.58          81.07              12.0                7,0 - 17,0  
3              3.54          80.44              12.0                7,0 - 17,0  
4              3.54          80.44              12.0                7,0 - 17,0  


In [4]:
X = df[['ДОЛЛАР США', 'Размер ставки, %']] 
y = df['РОССИЙСКИЙ РУБЛЬ']

# Разделение данных на обучающую и тестовую выборку (80%/20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
scaler = RobustScaler()

X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

### Модель линейной регрессии

In [6]:
model = LinearRegression()

gs = GridSearchCV(model, param_grid={'fit_intercept': [True, False], 'positive': [True, False],},
                  refit='MAE', cv=5, n_jobs=-1)

gs.fit(X_train_norm, y_train)

best_fit_intercept = gs.best_params_['fit_intercept']
best_positive = gs.best_params_['positive']

print(best_fit_intercept)
print(best_positive)

True
True


In [7]:
model = LinearRegression(fit_intercept=best_fit_intercept,
                               positive=best_positive)

model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

# Вычисление метрик
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.3251043838408847
Train MAE: 0.41414895584015027
Train R^2: 0.4027117406750991
Test MSE: 0.30767088947732907
Test MAE: 0.39955126962581494
Test R^2: 0.42019837682236594


### Модель случайного леса

GridSearch не удалось подобрать параметры. Ждал 15 минут:
ValueError: n_estimators must be greater than zero, got 0.

In [33]:
# Обучение модели случайного леса с дефолтными параметрами
model = RandomForestRegressor()
model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

# Вычисление метрик
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.0038569778580550174
Train MAE: 0.02788775867795047
Train R^2: 0.9929138833384054
Test MSE: 0.017733181617827415
Test MAE: 0.06958030320262991
Test R^2: 0.9665820594740481


### Модель KNN

In [8]:
model = KNeighborsRegressor()

gs = GridSearchCV(model, param_grid={'n_neighbors': np.arange(5, 50, 5), 'p': np.arange(1, 3), 'metric': ['minkowski', 'euclidean', 'cosine', 'cityblock']},
                  refit='MAE', cv=5)

gs.fit(X_train_norm, y_train)

best_metric_knn = gs.best_params_['metric']
best_n = gs.best_params_['n_neighbors']
best_p = gs.best_params_['p']

best_knn = KNeighborsRegressor(metric=best_metric_knn, n_neighbors=best_n, p=best_p)

best_knn.fit(X_train, y_train)

print(best_metric_knn)
print(best_n)
print(best_p)

minkowski
5
1


In [9]:
# Обучение модели KNN
model = KNeighborsRegressor(n_neighbors=best_n, p=best_p, metric = best_metric_knn)
model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

# Вычисление метрик
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.015162888691006233
Train MAE: 0.06488156723063224
Train R^2: 0.9721424384205753
Test MSE: 0.026140384341637013
Test MAE: 0.09093238434163702
Test R^2: 0.9507387998340833


### Модель Lasso

In [29]:
model = Lasso(random_state=42)

gs = GridSearchCV(model,
                  param_grid={'alpha': np.concatenate((0.1**np.arange(1, 6), 10**np.arange(6))), 'tol': 0.1**np.arange(10), 'positive': [True, False], 'fit_intercept': [True, False], 'selection': ['cyclic', 'random']},
                  refit='MAE',
                  cv=5)

gs.fit(X_train_norm, y_train)

best_alpha_lasso = gs.best_params_['alpha']
best_fit_intercept_lasso = gs.best_params_['fit_intercept']
best_tol_lasso = gs.best_params_['tol']
best_positive_lasso = gs.best_params_['positive']
best_selection_lasso = gs.best_params_['selection']

In [30]:
model = Lasso(alpha=best_alpha_lasso,
                   fit_intercept=best_fit_intercept_lasso,
                   tol=best_tol_lasso,
                   positive=best_positive_lasso,
                   selection=best_selection_lasso, random_state=42)

model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

# Вычисление метрик
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.3251043843373472
Train MAE: 0.4141438157228486
Train R^2: 0.40271173976298835
Test MSE: 0.3076718473809393
Test MAE: 0.39954665429466785
Test R^2: 0.42019657166598934


### Модель Ridge

GridSearch не удалось подобрать параметры

In [25]:
# Обучение модели Ridge регрессии
model = Ridge(alpha = 2.0)
model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

# Вычисление метрик
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.32510591318655724
Train MAE: 0.4139065595843219
Train R^2: 0.40270893093072146
Test MSE: 0.30773089210690685
Test MAE: 0.3993532878598221
Test R^2: 0.42008530267978716


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


*после обновления ошибка не исчезла

### Модель ElasticNet

GridSearch не удалось подобрать параметры

In [43]:
# Обучение модели ElasticNet регрессии
model = ElasticNet()
model.fit(X_train, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.32539921164218316
Train MAE: 0.4109122851760837
Train R^2: 0.40217007715719233
Test MSE: 0.30853470624334767
Test MAE: 0.3964939527586589
Test R^2: 0.41857052582899945


### Модель Решающее дерево

GridSearch не удалось подобрать параметры

In [26]:
# Обучение модели решающего дерева
model = DecisionTreeRegressor(criterion='squared_error')
model.fit(X_train_norm, y_train)

# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.0006372311622779122
Train MAE: 0.003988614680066186
Train R^2: 0.9988292661968814
Test MSE: 0.022269820717432894
Test MAE: 0.06475533807829184
Test R^2: 0.9580328245490579


### Модель CatBoostRegressor

In [28]:
# Обучение модели CatBoostRegressor
model = CatBoostRegressor()
model.fit(X_train_norm, y_train)
       
# Прогнозирование на обучающей выборке
y_train_pred = model.predict(X_train_norm)

# Прогнозирование на тестовой выборке
y_test_pred = model.predict(X_test_norm)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Вычисление метрик на тестовой выборке
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train MAE:", train_mae)
print("Train R^2:", train_r2)

print("Test MSE:", test_mse)
print("Test MAE:", test_mae)
print("Test R^2:", test_r2)

Train MSE: 0.01537685768021132
Train MAE: 0.0772385690250984
Train R^2: 0.9717493303252556
Test MSE: 0.02532162922114596
Test MAE: 0.09244486993921751
Test R^2: 0.9522817327669069


#### Выводы

Наиулучшие результаты у моделей: Решающее дерево, случайный лес, KNN и CatBoost.

По их качеству лучший - случайный лес: он имеет наименьшие MSE, MAE и наибольший R^2.
Однако он занимает очень много времени. Подбор гиперпараметров для случайного леса занял более 15 минут и результат - ошибка.

Поэтому лучшие модели для нашей задачи это: Решающее дерево, KNN и CatBoost.