# Predprocesiranje podataka

In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [19]:
# Read the dataset from a CSV file; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='auto-mpg.csv')
# Preview the first five rows as a quick check of what was loaded.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [20]:
# This cell treats the literal string '?' as the missing-value marker.
# A column containing such a marker holds strings rather than numbers, so
# after removing the incomplete rows the values are converted back to numeric
# dtypes explicitly instead of leaving text for the estimator to coerce.
#   1. replace every '?' with NaN,
#   2. drop each row that has at least one missing value,
#   3. parse every column as numeric (raises if a column holds non-numeric text).
# The chain is idempotent: re-running the cell on already-clean data is a no-op.
df = (
    df.replace('?', np.nan)
    .dropna()
    .apply(pd.to_numeric)
)

In [21]:
# Target: the 'mpg' column, kept as a one-column DataFrame (2-D), which is why
# later cells index the fitted intercept/coefficients with an extra [0].
y = df[['mpg']]
# Features: every remaining column of the cleaned frame.
X = df.drop(columns='mpg')

In [22]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model-year
0,8,307.0,130.0,3504,12.0,70
1,8,350.0,165.0,3693,11.5,70
2,8,318.0,150.0,3436,11.0,70
3,8,304.0,150.0,3433,12.0,70
4,8,302.0,140.0,3449,10.5,70


In [23]:
# Preview the first five rows of the target frame.
y.head(n=5)

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [25]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Treniranje modela

In [27]:
# Ordinary least-squares linear model with default settings.
regression = LinearRegression()
# Fit on the training split only; fit() returns the estimator, so the cell
# displays its repr.
regression.fit(X=X_train, y=y_train)

LinearRegression()

In [28]:
regression.intercept_[0] # intercept (constant term); [0] because the target was passed as a one-column frame

-13.824169113397183

In [30]:
# Pair each feature name with its fitted coefficient. coef_ is indexed with
# [0] because the model was fitted on a single-column 2-D target.
for col_name, coef_value in zip(X_train.columns, regression.coef_[0]):
    print(f'Coef za {col_name} = {coef_value}')

Coef za cylinders = -0.2381301557148637
Coef za displacement = 0.001683903382560267
Coef za horsepower = -0.0002501905818831098
Coef za weight = -0.00662156098602014
Coef za acceleration = 0.010273669827509827
Coef za model-year = 0.7625231149758142


# Evaluacija modela

In [31]:
# score() returns the coefficient of determination (R^2) for a regressor.
# Computing it on both splits lets the reader compare fit vs. generalisation.
r2_train = regression.score(X_train, y_train)
r2_test = regression.score(X_test, y_test)

print(f'Test: {r2_test}', f'Train: {r2_train}', sep='\n')

Test: 0.7757849559316724
Train: 0.8134917530842111


In [37]:
# Mean squared error of the model's predictions on each split.
mse_test = mean_squared_error(y_true=y_test, y_pred=regression.predict(X_test))
mse_train = mean_squared_error(y_true=y_train, y_pred=regression.predict(X_train))

# Report MSE first, then its square root (RMSE), which is in the target's units.
print(f'MSE test: {mse_test}', f'MSE train: {mse_train}', sep='\n')
print(f'RMSE test: {np.sqrt(mse_test)}', f'RMSE train: {np.sqrt(mse_train)}', sep='\n')

MSE test: 11.054880997716916
MSE train: 12.04121219273428
RMSE test: 3.32488811807509
RMSE train: 3.4700449842522616


# Postprocesiranje modela

In [36]:
# Predict mpg for a single hand-written observation.
# The model was fitted on a DataFrame, so the query is passed as a one-row
# DataFrame labelled with the training columns. This makes the value-to-feature
# mapping explicit (values are listed in X_train's column order) and avoids
# scikit-learn's "X does not have valid feature names" warning that a bare
# nested list triggers on recent versions.
regression.predict(
    pd.DataFrame(
        [[8, 318.0, 150.0, 3436, 11.0, 70]],
        columns=X_train.columns,
    )
)

array([[15.5066872]])