In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn import metrics

In [73]:
# Load the raw table. The file has no header row, so pandas labels the columns
# with integers, and every "?" cell is read as a missing value (NaN).
# NOTE(review): the path is an absolute Colab-style location; adjust it when
# running elsewhere.
data = pd.read_csv("/content/imports-85.data", header=None, na_values="?")
# Hold out 10% of the rows. The split is seeded (same seed as the X/y split
# further down) so that a fresh "Restart & Run All" reproduces the partition.
train, test = train_test_split(data, test_size=0.1, random_state=3)

In [74]:
# Preview the first rows of the loaded frame (rich display as the cell output).
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [75]:
# Fill gaps column by column: float64 columns receive their own column mean,
# object (text) columns receive an empty string.
# NOTE(review): the means are taken over every row, i.e. before the train/test
# split performed in a later cell, and the float64 selection would also cover
# column 25 (used later as the target) if it has gaps - confirm this is intended.
data.loc[:, data.dtypes == "float64"] = (
    data.loc[:, data.dtypes == "float64"]
    .pipe(lambda floats: floats.fillna(floats.mean(axis=0)))
)
data.loc[:, data.dtypes == "object"] = (
    data.loc[:, data.dtypes == "object"].fillna(value="")
)
# Show the first rows again so the imputed cells are visible.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [76]:
# One-hot encode the frame with pandas' default settings; the encoded columns
# show up later under names of the form "<column>_<value>" (e.g. "2_bmw").
data_encoded = pd.get_dummies(data)

In [77]:
# Column labels are a mix of integers and strings after encoding; turn them
# all into strings so every column can be addressed by a string name.
data_encoded.columns = data_encoded.columns.map(str)
# Column "25" is the regression target; every other column is a feature.
y = data_encoded["25"]
X = data_encoded.drop("25", axis="columns")

In [79]:
# Remember the feature labels in column order; scaling below turns X into a
# plain array, which no longer carries them.
feature_names = list(X.columns)

In [80]:
# Seeded 90/10 split of features and target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=3,
)

In [81]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transform to both partitions.
scaler = RobustScaler().fit(X_train)

X_train, X_test = (scaler.transform(part) for part in (X_train, X_test))

In [82]:
class MyLinearRegression:
    """Linear regression trained by full-batch gradient descent on the MSE.

    The model is ``y_hat = X @ weights + b``. Every iteration uses all
    training rows to compute the gradient of the mean squared error and takes
    one step of size ``lr`` against it.

    Parameters
    ----------
    lr : float, default=0.001
        Gradient-descent step size.
    num_iter : int, default=10000
        Number of gradient steps performed by ``fit``.
    random_state : int or None, default=None
        Seed for the initial weights. ``None`` keeps the historical behaviour
        of drawing from NumPy's global random state (so ``np.random.seed``
        still controls it); an integer makes ``fit`` reproducible on its own.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,)
        Learned coefficients, available after ``fit``.
    b : float
        Learned intercept, available after ``fit``.
    """

    def __init__(self, lr=0.001, num_iter=10000, random_state=None) -> None:
        self.lr = lr
        self.num_iter = num_iter
        self.random_state = random_state

    def calculate_loss(self, y, y_pred):
        """Return the mean squared error between ``y`` and ``y_pred``."""
        return ((y - y_pred) ** 2).mean()

    def calculate_dldw(self, y, y_pred, x):
        """Return the MSE gradient with respect to the weights.

        ``x`` has shape (n_samples, n_features); ``y`` and ``y_pred`` hold one
        value per sample. The result has shape (n_features,).
        """
        x = np.asarray(x, dtype=float)
        # Flatten both operands so a column-vector target cannot broadcast the
        # residual into an (n, n) matrix.
        residual = (
            np.asarray(y, dtype=float).ravel()
            - np.asarray(y_pred, dtype=float).ravel()
        )
        # Mean over samples of -2 * x_i * residual_i, written as one matrix
        # product to avoid materialising an (n_samples, n_features) temporary.
        return -2.0 * (x.T @ residual) / x.shape[0]

    def calculate_dldb(self, y, y_pred):
        """Return the MSE gradient with respect to the intercept."""
        residual = (
            np.asarray(y, dtype=float).ravel()
            - np.asarray(y_pred, dtype=float).ravel()
        )
        return (-2.0 * residual).mean()

    def fit(self, X_train, y_train):
        """Fit ``weights`` and ``b`` by gradient descent.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        MyLinearRegression
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` disagree on the number of samples.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float).ravel()
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                f"X_train has {X_train.shape[0]} rows but y_train has "
                f"{y_train.shape[0]} values"
            )

        n_features = X_train.shape[1]
        if self.random_state is None:
            # Legacy path: uniform [0, 1) draw from the global NumPy state.
            self.weights = np.random.rand(n_features)
        else:
            self.weights = np.random.default_rng(self.random_state).random(n_features)
        self.b = 0.0

        for _ in range(self.num_iter):
            pred = X_train @ self.weights + self.b
            # Both gradients are evaluated at the same `pred`, i.e. before
            # either parameter is updated in this step.
            self.weights -= self.lr * self.calculate_dldw(y_train, pred, X_train)
            self.b -= self.lr * self.calculate_dldb(y_train, pred)
        return self

    def predict(self, X_test):
        """Return ``X_test @ weights + b``; requires a prior call to ``fit``."""
        return X_test @ self.weights + self.b

In [83]:
# Train the hand-written model on the scaled training split and score it on
# the held-out split.
my_model = MyLinearRegression()
my_model.fit(X_train, y_train)
preds = my_model.predict(X_test)
# All three metrics are called as (y_true, y_pred). MSE and MAE are symmetric
# in their arguments, so the printed numbers are unaffected; r2_score is not
# symmetric, so the same order is used everywhere.
print(
    f"""
    MSE_test = {metrics.mean_squared_error(y_test, preds)},
    MAE_test = {metrics.mean_absolute_error(y_test, preds)}
    r2_test = {metrics.r2_score(y_test, preds)}
"""
)


    MSE_test = 3083536.7535960926,
    MAE_test = 1449.9247957071414
    r2_test = 0.9611384617813625



In [84]:
# Keep a reference to the fitted weight vector (one entry per feature column).
coefficients = my_model.weights

In [85]:
# One row per feature: the learned coefficient and its magnitude.
# The fitted estimator in this notebook is `my_model`; no `model` is defined
# anywhere, so referencing it fails on a fresh kernel.
importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': my_model.weights,
    'abs_effect': np.abs(my_model.weights)
})

In [86]:
# Order the features from largest to smallest coefficient magnitude.
importance = importance.sort_values(by='abs_effect', ascending=False)

In [87]:
# Print the heading followed by the ten highest-ranked rows, one per line.
print("Самые важные признаки:", importance.head(10), sep="\n")

Самые важные признаки:
            feature  coefficient   abs_effect
7                16  4398.306617  4398.306617
17            2_bmw  2846.452241  2846.452241
24  2_mercedes-benz  2412.468712  2412.468712
73          17_mpfi  2340.446294  2340.446294
63          15_four -2264.623001  2264.623001
44    6_convertible  2136.030253  2136.030253
47          6_sedan  2101.193077  2101.193077
53           8_rear  2098.380203  2098.380203
52          8_front -2097.453348  2097.453348
69          17_2bbl  2038.594319  2038.594319
