In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.set_printoptions(precision=3, suppress=True)

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.8.0


#### Cоздание моделей для прогнозирования топливной экономичности автомобилей конца 1970-х и начала 1980-х годов.

In [3]:
# Auto MPG data file on the UCI archive; it is downloaded each time this cell runs.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# Column names are supplied explicitly via names= below.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

# Parsing rules: fields are space-separated (repeated spaces are skipped),
# '?' is read as NaN, and everything after a tab on a line is treated as a
# comment and discarded.
raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)

In [4]:
raw_dataset.shape

(398, 8)

In [5]:
raw_dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [None]:
# Work on a copy so raw_dataset keeps the data exactly as loaded.
df = raw_dataset.copy()
df.head()

In [None]:
# Element-wise mask: True where a value is missing.
df.isna()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [None]:
df.shape

In [None]:
# Replace the numeric Origin codes with readable labels.
# Not idempotent: re-running this cell maps the labels again and yields NaN.
df['Origin'] = df['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})

In [None]:
df

In [None]:
# One-hot encode Origin: one indicator column per label; the empty prefix
# makes the new column names equal to the label values themselves.
# dtype=float keeps the indicators numeric. pandas >= 2.0 would otherwise emit
# bool columns, and np.array() over a mixed bool/float frame produces an
# object array that Normalization.adapt() and model.fit() cannot consume.
df = pd.get_dummies(df, columns=['Origin'], prefix='', prefix_sep='', dtype=float)
df.tail()

In [None]:
# Separate the regression target (MPG) from the feature columns.
y = df['MPG'].copy()
X = df.drop(columns=['MPG'])

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed:
#   1) hold out 20% of all rows as the test set;
#   2) hold out 20% of the remainder as the validation set.
# Resulting proportions: 64% train / 16% validation / 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=1)

In [None]:
df[['MPG', 'Cylinders', 'Displacement', 'Weight']]

In [None]:
# Pairwise scatter plots (KDE on the diagonal) for the target and three features.
pairplot_columns = ['MPG', 'Cylinders', 'Displacement', 'Weight']
sns.pairplot(df[pairplot_columns], diag_kind='kde')

In [None]:
X.describe().round(2).T

Нормировка

In [None]:
X.describe().transpose()[['mean', 'std']]

In [None]:
normalizer = tf.keras.layers.Normalization(axis=-1)

In [None]:
normalizer.adapt(np.array(X))

In [None]:
print(normalizer.mean.numpy())

In [None]:
# Show the first feature row before and after normalization.
first = X[:1].to_numpy()
first_normalized = normalizer(first).numpy()

with np.printoptions(precision=2):
    print('Первый пример:', first)
    print()
    print('Нормализованный:', first_normalized)

#### Построение линейной модели от лошадиных сил

In [None]:
# Single-feature input: horsepower values of the training split.
horsepower = np.array(X_train['Horsepower'])

# axis=None: one scalar mean/variance for the whole (1-feature) input.
horsepower_normalizer = layers.Normalization(input_shape=[1,], axis=None)
# Adapt on the training values only; adapting on X['Horsepower'] (all rows)
# would leak validation/test statistics into the model.
horsepower_normalizer.adapt(horsepower)

In [None]:
X_train['Horsepower']

In [None]:
horsepower

In [None]:
# Linear regression on one feature: normalize, then a single Dense unit.
horsepower_model = tf.keras.Sequential()
horsepower_model.add(horsepower_normalizer)
horsepower_model.add(layers.Dense(units=1))

horsepower_model.summary()

In [None]:
horsepower[:10]

In [None]:
horsepower_model.predict(horsepower[:10])

In [None]:
y_train.values[:10]

In [None]:
# Train with Adam on mean absolute error.
horsepower_optimizer = tf.optimizers.Adam(learning_rate=0.1)
horsepower_model.compile(loss='mean_absolute_error', optimizer=horsepower_optimizer)

In [None]:
%%time
history = horsepower_model.fit(
    X_train['Horsepower'],
    y_train,
    epochs=100,
    verbose=1,
    validation_split = 0.2)

In [None]:
history.history

In [None]:
# Per-epoch metrics as a table, with the epoch index as an extra column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_loss(history):
    """Draw the training and validation loss curves of one training run.

    Args:
        history: object with a ``history`` dict holding 'loss' and 'val_loss'
            sequences (the value returned by ``model.fit`` in this notebook).

    The y axis is fixed to the range [0, 10].
    """
    for curve in ('loss', 'val_loss'):
        plt.plot(history.history[curve], label=curve)
    plt.ylim([0, 10])
    plt.xlabel('Эпоха')
    plt.ylabel('MAE [MPG]')
    plt.legend()
    plt.grid()

In [None]:
plot_loss(history)

In [None]:
horsepower_model.evaluate(X_test['Horsepower'], y_test, verbose=0)

In [None]:
# Registry of test-set losses, one entry per model, for the final comparison.
horsepower_test_loss = horsepower_model.evaluate(X_test['Horsepower'], y_test, verbose=0)
test_results = {'horsepower_model': horsepower_test_loss}

In [None]:
test_results

In [None]:
tf.linspace(0.0, 250, 251)

In [None]:
# 251 evenly spaced values from 0 to 250, used as the horsepower grid on which
# the fitted line is evaluated.
x = tf.linspace(0.0, 250, 251)
prediction = horsepower_model.predict(x)

In [None]:
def plot_horsepower(x_train, y_train, x, prediction):
    """Scatter observed points and overlay the model's fitted line.

    Args:
        x_train: horsepower values of the observed points (scatter x axis).
        y_train: target values of the observed points (scatter y axis).
        x: horsepower grid on which the model was evaluated.
        prediction: model outputs for ``x``, drawn as a black line.
    """
    plt.scatter(x_train, y_train, label='Data')
    plt.plot(x, prediction, color='k', label='Predictions')
    # The x axis carries horsepower; the former label 'T' did not describe it.
    plt.xlabel('Horsepower')
    plt.ylabel('MPG')
    plt.legend()

In [None]:
plot_horsepower(X['Horsepower'], y, x, prediction)

#### Построение линейной модели

In [None]:
# Linear regression on all features: shared normalizer, then one Dense unit.
linear_model = tf.keras.Sequential()
linear_model.add(normalizer)
linear_model.add(layers.Dense(units=1))

In [None]:
linear_model.predict(X_train[:10])

In [None]:
y_train[:10]

In [None]:
# Same training setup as the single-feature model: Adam on mean absolute error.
linear_optimizer = tf.optimizers.Adam(learning_rate=0.1)
linear_model.compile(loss='mean_absolute_error', optimizer=linear_optimizer)

In [None]:
%%time
history = linear_model.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=1,
    validation_split = 0.2)

In [None]:
plot_loss(history)

In [None]:
test_results['linear_model'] = linear_model.evaluate(X_test, y_test, verbose=0)

In [None]:
test_results

#### Построение многослойного персептрона

In [None]:
def build_and_compile_model(norm, hidden_units=(64, 64), learning_rate=0.001):
    """Build and compile a fully connected regression network.

    Architecture: ``norm`` -> one ReLU Dense layer per entry of
    ``hidden_units`` -> a single linear output unit. The model is compiled
    with mean absolute error loss and the Adam optimizer.

    Args:
        norm: layer placed first in the model (the notebook passes an adapted
            Normalization layer).
        hidden_units: widths of the hidden layers, in order. The default
            reproduces the original two 64-unit layers; pass e.g.
            ``(64, 64, 64)`` or ``(128, 64)`` to try other shapes without
            redefining this function.
        learning_rate: Adam learning rate.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    hidden_layers = [layers.Dense(units, activation='relu') for units in hidden_units]
    model = keras.Sequential([norm, *hidden_layers, layers.Dense(1)])

    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate))
    return model

In [None]:
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
history = dnn_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    verbose=0, epochs=300)

In [None]:
plot_loss(history)

In [None]:
test_results['dnn_model'] = dnn_model.evaluate(X_test, y_test, verbose=0)

In [None]:
test_results

In [None]:
pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T

In [None]:
# Predicted vs. true MPG on the test set; points on the diagonal are perfect.
test_predictions = dnn_model.predict(X_test).flatten()

lims = [0, 50]
a = plt.axes(aspect='equal')
a.scatter(y_test, test_predictions)
a.set(xlabel='True Values [MPG]', ylabel='Predictions [MPG]',
      xlim=lims, ylim=lims)
_ = a.plot(lims, lims)

In [None]:
# Distribution of the signed prediction errors on the test set.
error = y_test - test_predictions
plt.hist(error, bins=25)
_ = plt.gca().set(xlabel='Prediction Error [MPG]', ylabel='Count')

### Эксперименты с моделями

In [None]:
def build_and_compile_model(norm):
    """Build and compile an MLP regressor with three 64-unit ReLU hidden layers.

    This definition shadows any earlier ``build_and_compile_model`` in the
    notebook. ``norm`` is placed as the first layer; the model is compiled
    with mean absolute error loss and Adam(0.001) and then returned.
    """
    hidden = [layers.Dense(64, activation='relu') for _ in range(3)]
    model = keras.Sequential([norm, *hidden, layers.Dense(1)])

    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='mean_absolute_error')
    return model

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
history = dnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    verbose=0, epochs=300)
plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

### Можно сделать больше эпох

In [None]:
import datetime

# A timestamped directory gives every run its own TensorBoard log folder.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
log_dir

In [None]:
# Fresh model, trained with TensorBoard logging enabled.
dnn_model = build_and_compile_model(normalizer)
history = dnn_model.fit(X_train, y_train,
                        epochs=300, verbose=1,
                        validation_data=(X_val, y_val),
                        callbacks=[tensorboard_callback])

plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

In [None]:
def build_and_compile_model(norm):
    """Build and compile an MLP regressor with 128- and 64-unit ReLU hidden layers.

    This definition shadows any earlier ``build_and_compile_model`` in the
    notebook. ``norm`` is placed as the first layer; the model is compiled
    with mean absolute error loss and Adam(0.001) and then returned.
    """
    stack = (
        norm,
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    )
    model = keras.Sequential(list(stack))

    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='mean_absolute_error')
    return model

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
# Name the TensorBoard run after the architecture actually trained here
# (128 and 64 hidden units); the former name said "43".
log_dir = "logs/fit/128-64-relu"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

dnn_model = build_and_compile_model(normalizer)
history = dnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    verbose=0, epochs=300, 
    callbacks=[tensorboard_callback])

plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

### Добавление dropout

In [None]:
# Dropout experiment: 128 -> Dropout(0.6) -> 64 -> Dropout(0.6) -> 1.
# The shared normalizer comes first so this network sees the same scaled
# inputs as every other model in the notebook.
dnn_model = keras.Sequential([
    normalizer,
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.6),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.6),
    layers.Dense(1)
])

# MAE (not MSE) so the curve matches plot_loss's 'MAE [MPG]' axis and the
# result is comparable with the other experiments.
dnn_model.compile(loss='mean_absolute_error',
            optimizer=tf.keras.optimizers.Adam(0.001))

log_dir = "logs/fit/128-do-64do"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the dropout model defined above. Rebuilding dnn_model with
# build_and_compile_model(normalizer) at this point would discard it and
# train the plain network instead.
history = dnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    verbose=0, epochs=300, 
    callbacks=[tensorboard_callback])

plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Small dropout experiment: 24 -> Dropout(0.8) -> 1, optimized with Adam.
# The shared normalizer comes first so this network sees the same scaled
# inputs as every other model in the notebook.
dnn_model = keras.Sequential([
    normalizer,
    layers.Dense(24, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(1)
])

dnn_model.compile(loss='mean_absolute_error',
            optimizer=tf.keras.optimizers.Adam(0.001))

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the dropout model defined above. Rebuilding dnn_model with
# build_and_compile_model(normalizer) at this point would discard it and
# train the plain network instead.
history = dnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    verbose=0, epochs=300, 
    callbacks=[tensorboard_callback])

plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Same small dropout network as before, optimized with RMSprop(0.01).
# The shared normalizer comes first so this network sees the same scaled
# inputs as every other model in the notebook.
dnn_model = keras.Sequential([
    normalizer,
    layers.Dense(24, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(1)
])

dnn_model.compile(loss='mean_absolute_error',
            optimizer=tf.keras.optimizers.RMSprop(0.01) )

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the dropout model defined above. Rebuilding dnn_model with
# build_and_compile_model(normalizer) at this point would discard it (and the
# RMSprop setting) and train the plain Adam network instead.
history = dnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    verbose=0, epochs=300, 
    callbacks=[tensorboard_callback])

plot_loss(history)
dnn_model.evaluate(X_test, y_test, verbose=0)

## Случайный лес

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Plain 80/20 train/test split with the same test_size and seed as the first
# split in this notebook; no separate validation set is carved out here.
# NOTE: this rebinds X_train / y_train, which the Keras cells above also use,
# so re-running those cells after this one trains them on a different set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Grid search over forest size, feature sub-sampling and tree depth (5-fold CV).
random_forest_tuning = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 500],
    # 1.0 means "consider all features", which is what 'auto' meant for
    # regressors. 'auto' was deprecated in scikit-learn 1.1 and removed in
    # 1.3, where it makes every fit of that grid point fail.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [4, 5, 6],
    'criterion': ['squared_error'],
}
GSCV = GridSearchCV(estimator=random_forest_tuning, param_grid=param_grid, cv=5, verbose=2)
GSCV.fit(X_train, y_train)
GSCV.best_params_ 

In [None]:
rf = GSCV.best_estimator_
rf

In [None]:
# Fresh, unfitted forest configured with the best hyper-parameters found by
# the grid search. The dict must be unpacked with **: passed positionally it
# would be taken as a single argument (n_estimators) instead of keyword
# settings. The seed matches the estimator that was tuned.
rf = RandomForestRegressor(**GSCV.best_params_, random_state=42)
rf

In [None]:
# Forest with hand-picked hyper-parameters.
# max_features=1.0 (all features) replaces 'auto', which meant the same for
# regressors but was removed in scikit-learn 1.3. random_state makes the
# fitted forest, and the scores below, reproducible across runs.
rf = RandomForestRegressor(criterion='squared_error', max_depth=6, 
                           max_features=1.0, n_estimators=500,
                           random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
# Predicted vs. true MPG for the random forest; the diagonal marks a perfect fit.
test_predictions = rf.predict(X_test)

lims = [0, 50]
a = plt.axes(aspect='equal')
a.scatter(y_test, test_predictions)
a.set(xlabel='True Values [MPG]', ylabel='Predictions [MPG]',
      xlim=lims, ylim=lims)
_ = a.plot(lims, lims)

In [None]:
rf.score(X_test, y_test)

In [None]:
rf.predict(X_test)

In [None]:
np.mean((y_test - np.mean(y_test))*(y_test - np.mean(y_test)))

In [None]:
prediction=rf.predict(X_test)

In [None]:
np.mean((y_test - prediction)*(y_test - prediction))

In [None]:
pd.DataFrame(GSCV.cv_results_)

In [None]:
# Histogram of the mean cross-validation score over all grid points.
cv_results = pd.DataFrame(GSCV.cv_results_)
plt.hist(cv_results['mean_test_score'])

### Лассо

In [None]:
# Grid search over the L1 penalty strength (10-fold CV).
lasso = Lasso(random_state=42)
param_grid = {
    # Start at 0.01 rather than 0: alpha=0 is plain least squares, which
    # scikit-learn advises against fitting with Lasso for numerical reasons
    # (it triggers convergence warnings). The grid is 0.01, 0.02, ..., 1.00.
    'alpha': np.linspace(0.01, 1, 100)
}
GSCV = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=10, verbose=2)
GSCV.fit(X_train, y_train)
GSCV.best_params_ 

In [None]:
# Best Lasso configuration found by the grid search.
model=GSCV.best_estimator_

In [None]:
# NOTE(review): GridSearchCV presumably already refit best_estimator_ on
# X_train (refit is left at its default), so this fit looks redundant — confirm.
model.fit(X_train, y_train)
# score() of the fitted model on the held-out test set.
model.score(X_test, y_test)

In [None]:
# Test-set mean squared error of the selected Lasso model.
prediction = model.predict(X_test)
np.mean(np.square(y_test - prediction))

### k ближайших соседей

In [None]:
# Grid search over the neighbourhood size (10-fold CV).
knn = KNeighborsRegressor()
param_grid = dict(n_neighbors=[1, 2, 5, 10, 20])
GSCV = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    verbose=2,
)
GSCV.fit(X_train, y_train)
GSCV.best_params_ 

In [None]:
# Evaluate the tuned model. The bare `knn` object still carries the default
# n_neighbors, so fitting it directly would ignore the grid-search result.
knn = KNeighborsRegressor(**GSCV.best_params_)
knn.fit(X_train, y_train)
prediction=knn.predict(X_test)
# Test-set mean squared error.
np.mean((y_test - prediction)*(y_test - prediction))