In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import linear_model  # linear models
from sklearn.metrics import mean_pinball_loss
from sklearn.model_selection import train_test_split

## diamonds (Prices of over 50,000 round cut diamonds)

*number of observations* : 53940

- `price`: price in US dollars (\$326–\$18,823)
- `carat`: weight of the diamond (0.2–5.01)
- `cut`: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- `color`: diamond colour, from D (best) to J (worst)
- `clarity`: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- `x`: length in mm (0–10.74)
- `y`: width in mm (0–58.9)
- `z`: depth in mm (0–31.8)
- `depth`: total depth percentage = `z / mean(x, y)` = `2 * z / (x + y)` (43–79)
- `table`: width of top of diamond relative to widest point (43–95)

In [None]:
# Read the diamonds table from a CSV file (path is relative to the working
# directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='diamonds.csv')
df.head(n=5)

In [None]:
# One-hot encode the categorical columns into 0/1 integer indicator columns.
# NOTE: this rebinds `df`, so running the cell a second time without reloading
# the data fails -- the original categorical columns no longer exist.
categorical_columns = ['color', 'cut', 'clarity']
df = pd.get_dummies(data=df, columns=categorical_columns, dtype=int)
df.head(n=5)

In [None]:
# Price against carat, both on their original scales.
sns.scatterplot(x='carat', y='price', data=df)

In [None]:
# Same relationship on a log-log scale. The log-transformed Series keep their
# original names ('carat', 'price'), which seaborn would use as axis labels,
# so the labels are set explicitly to say what is actually plotted.
ax = sns.scatterplot(x=np.log(df['carat']), y=np.log(df['price']))
ax.set(xlabel='log(carat)', ylabel='log(price)');

In [None]:
# Heatmap of the pairwise correlations between all columns of df,
# with the color scale pinned to the full [-1, 1] range.
sns.heatmap(data=df.corr(), vmax=1, vmin=-1)

In [None]:
# Data for prediction: the target is log(price); the features are every other
# column, with raw carat replaced by log(carat) placed as the first column.
log_carat = np.log(df['carat'])

X = df.drop(['price', 'carat'], axis='columns')
X.insert(0, 'log(carat)', log_carat)

y = np.log(df['price'])

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

## Линейная регрессия (с регуляризацией)

In [None]:
# Linear regression: ordinary least squares with an intercept, no penalty.
# fit() returns the estimator itself, so construction and fitting are chained.
lin_reg = linear_model.LinearRegression(fit_intercept=True).fit(X_train, y_train)

# R^2 on the held-out split (the cell's displayed value).
lin_reg.score(X=X_test, y=y_test)

In [None]:
# Ridge regression: least squares with an L2 penalty of strength alpha.
ridge_reg = linear_model.Ridge(
    alpha=0.5,
    fit_intercept=True,
)
ridge_reg.fit(X=X_train, y=y_train)

# R^2 on the held-out split (the cell's displayed value).
ridge_reg.score(X=X_test, y=y_test)

In [None]:
# Lasso regression: least squares with an L1 penalty of strength alpha.
lasso_params = {'alpha': 0.2, 'fit_intercept': True}
lasso_reg = linear_model.Lasso(**lasso_params)
lasso_reg.fit(X_train, y_train)

# R^2 on the held-out split (the cell's displayed value).
lasso_reg.score(X_test, y_test)

In [None]:
# Quantile regression: fits the conditional 0.2-quantile of the target with an
# L1 penalty (alpha=0.1) on the coefficients.
quantile_reg = linear_model.QuantileRegressor(quantile=0.2, alpha=0.1, fit_intercept=True, solver='highs-ipm')

quantile_reg.fit(X_train, y_train)

# .score() is R^2, which rates the predictions as estimates of the mean; a
# 0.2-quantile model deliberately predicts below the mean, so R^2 alone
# understates how well it does its job. Report the pinball loss at the fitted
# quantile (lower is better) next to R^2, which is kept for comparison with
# the other models.
pd.Series({
    'R^2': quantile_reg.score(X_test, y_test),
    'pinball loss': mean_pinball_loss(
        y_test,
        quantile_reg.predict(X_test),
        alpha=quantile_reg.quantile,  # evaluate at the same quantile that was fitted
    ),
})