In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time


#from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import fetch_california_housing


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder, MaxAbsScaler
from sklearn.pipeline import Pipeline

#classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [67]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = fetch_california_housing()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

In [3]:
RANDOM_STATE = 42

In [4]:
# dataset = load_diabetes()
# X = pd.DataFrame(dataset.data)
# X.columns = dataset.feature_names
# y = dataset.target

In [5]:
X.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [6]:
X.drop(columns = ['Latitude', 'Longitude'], inplace = True)

In [7]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = RANDOM_STATE)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 6), (4128, 6), (16512,), (4128,))

2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [10]:
models  = [LinearRegression(), Ridge(), Lasso()]

scores = {}
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    scores[str(model)] = r2
    print(f"Model: {model}\nMSE: {mse:.2f}\nR2: {r2:.2f}")

Model: LinearRegression()
MSE: 0.64
R2: 0.51
Model: Ridge()
MSE: 0.64
R2: 0.51
Model: Lasso()
MSE: 0.94
R2: 0.28


In [11]:
scores

{'LinearRegression()': 0.5099337366296416,
 'Ridge()': 0.510019673206934,
 'Lasso()': 0.2841671821008396}

3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

##### Ridge GridSearchCV

In [12]:
params = {'alpha': [10 ** level for level in range(-5, 6)]}
params

{'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}

In [13]:
ridge = Ridge()
ridge_gcv = GridSearchCV(ridge, params)
ridge_gcv.fit(X_train, y_train)
ridge_gcv.best_params_

{'alpha': 10}

In [14]:
ridge_gcv = ridge_gcv.best_estimator_
ridge_gcv.fit(X_train, y_train)
r2_ridge_gcv = r2_score(y_test, ridge_gcv.predict(X_test))
print(f'R2: {r2_ridge_gcv}')

R2: 0.5107565001975987


##### Ridge CV

In [15]:
ridge_cv = RidgeCV(alphas= params['alpha'])
ridge_cv.fit(X_train, y_train)
ridge_cv.alpha_

10.0

In [16]:
ridge_cv = Ridge(alpha = ridge_cv.alpha_)
ridge_cv.fit(X_train, y_train)
r2_ridge_cv = r2_score(y_test, ridge_cv.predict(X_test))
print(f'R2: {r2_ridge_cv}')

R2: 0.5107565001975987


##### Lasso GridSearchCV

In [17]:
lasso = Lasso()
lasso_gcv = GridSearchCV(lasso, params)
lasso_gcv.fit(X_train, y_train)
lasso_gcv.best_params_

{'alpha': 0.001}

In [18]:
lasso_gcv = lasso_gcv.best_estimator_
lasso_gcv.fit(X_train, y_train)
r2_lasso_gcv = r2_score(y_test, lasso_gcv.predict(X_test))
print(f'R2: {r2_lasso_gcv}')


R2: 0.511292888365978


##### Lasso CV

In [19]:
lasso_cv = LassoCV(alphas = params['alpha'])
lasso_cv.fit(X_train, y_train)
lasso_cv.alpha_

0.001

In [20]:
lasso_cv = Lasso(alpha= lasso_cv.alpha_)
lasso_cv.fit(X_train, y_train)
r2_lasso_cv = r2_score(y_test, lasso_cv.predict(X_test))
print(f'R2: {r2_lasso_cv}')

R2: 0.511292888365978


In [21]:
np.array([[scores['Ridge()'], r2_ridge_gcv, r2_ridge_cv],
                                       [scores['Lasso()'], r2_lasso_gcv, r2_lasso_cv]])

array([[0.51001967, 0.5107565 , 0.5107565 ],
       [0.28416718, 0.51129289, 0.51129289]])

In [22]:
results = pd.DataFrame(data = np.array([[scores['Ridge()'], r2_ridge_gcv, r2_ridge_cv],
                                       [scores['Lasso()'], r2_lasso_gcv, r2_lasso_cv]]),
                        index= ['Ridge', 'Lasso'],
                        columns= ['Original', 'GridSearchCV', 'CV'])

In [23]:
results

Unnamed: 0,Original,GridSearchCV,CV
Ridge,0.51002,0.510757,0.510757
Lasso,0.284167,0.511293,0.511293


4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

##### Lasso

In [24]:
lasso_pipe = Pipeline(steps = [
    ('scaling', MinMaxScaler()),
    ('model', Lasso())
])

lasso_pipe.fit(X_train, y_train)
r2_lasso_pipe_mms = r2_score(y_test, lasso_pipe.predict(X_test))
print(f"R2: {r2_lasso_pipe_mms}")

R2: -0.00021908714592466794


In [25]:
lasso_pipe_ss = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('model', Lasso())]
)
lasso_pipe_ss.fit(X_train, y_train)
r2_lasso_pipe_ss = r2_score(y_test, lasso_pipe_ss.predict(X_test))
print(f'R2: {r2_lasso_pipe_ss}')


R2: -0.00021908714592466794


##### Ridge

In [26]:
ridge_pipe = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('model', Ridge())
])
ridge_pipe.fit(X_train, y_train)
r2_ridge_pipe = r2_score(y_test, ridge_pipe.predict(X_test))
print(f'R2: {r2_ridge_pipe}')

R2: 0.5087674245872261


In [27]:
ridge_pipe_ss = Pipeline(steps = [
    ('scale', StandardScaler()),
    ('model', Ridge())
])

ridge_pipe_ss.fit(X_train, y_train)
r2_ridge_pipe_ss = r2_score(y_test, ridge_pipe_ss.predict(X_test))
print(f'R2: {r2_ridge_pipe_ss}')

R2: 0.5099698613970367


Для обеих моделей результат улучшился по сравнению с обучением без скейлинга параметров

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

##### Lasso

In [28]:
lassocv_pipe = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('model', LassoCV(alphas = params['alpha']))
])
lassocv_pipe.fit(X_train, y_train)
r2_lassocv_pipe = r2_score(y_test, lassocv_pipe.predict(X_test))
print(f'R2: {r2_lassocv_pipe}')

R2: 0.5104079606412832


In [29]:
lassocv_pipe_ss = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('model', LassoCV(alphas=params['alpha']))
])
lassocv_pipe_ss.fit(X_train, y_train)
r2_lassocv_pipe_ss = r2_score(y_test, lassocv_pipe_ss.predict(X_test))
print(f'R2: {r2_lassocv_pipe_ss}')


R2: 0.5109187671982021


##### Ridge

In [30]:
ridgecv_pipe = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('model', RidgeCV(alphas = params['alpha']))
])

ridgecv_pipe.fit(X_train, y_train)
r2_ridgecv_pipe = r2_score(y_test, ridgecv_pipe.predict(X_test))
print(f'R2: {r2_ridgecv_pipe}')

R2: 0.5107423426750731


In [31]:
ridgecv_pipe_ss = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('model', RidgeCV(alphas = params['alpha']))
])
ridgecv_pipe_ss.fit(X_train, y_train)
r2_ridgecv_pipe_ss = r2_score(y_test, ridgecv_pipe_ss.predict(X_test))
print(f'R2: {r2_ridgecv_pipe_ss}')

R2: 0.5102872116247683


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

##### Lasso

In [32]:
lasso_poly = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('preprocessor', PolynomialFeatures()),
    ('model', Lasso())   
])
lasso_poly.fit(X_train, y_train)
r2_lasso_poly = r2_score(y_test, lasso_poly.predict(X_test))
print(f'R2: {r2_lasso_poly}')

R2: -0.00021908714592466794


In [33]:
lasso_poly_ss = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('preprocessor', PolynomialFeatures()),
    ('model', Lasso())
])
lasso_poly_ss.fit(X_train, y_train)
r2_lasso_poly_ss = r2_score(y_test, lasso_poly_ss.predict(X_test))
print(f'R2: {r2_lasso_poly_ss}')

R2: 0.03213255153849037


##### Ridge

In [34]:
ridge_poly = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('preprocessor', PolynomialFeatures()),
    ('model', Ridge())
])
ridge_poly.fit(X_train, y_train)
r2_ridge_poly = r2_score(y_test, ridge_poly.predict(X_test))
print(f'R2: {r2_ridge_poly}')

R2: 0.5146641340135979


In [35]:
ridge_poly_ss = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('preprocessor', PolynomialFeatures()),
    ('model', Ridge())
])
ridge_poly_ss.fit(X_train, y_train)
r2_ridge_poly_ss = r2_score(y_test, ridge_poly_ss.predict(X_test))
print(f'R2: {r2_ridge_poly_ss}')

R2: 0.5736702844678224


Полином улучшает модель особенно заметно для ridge

7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [68]:
final_pipe = Pipeline(steps = [
    ('scaler', MinMaxScaler()),
    ('preprocessor', PolynomialFeatures()),
    ('model', Lasso())
])

param_grid = {
    'scaler': [MinMaxScaler(), StandardScaler()],
    'preprocessor__degree': [2, 3, 4],
    'model': [Lasso(), Ridge()],
    'model__alpha': [10 ** level for level in range(-5, 6)]
}

grid_search = GridSearchCV(final_pipe, param_grid, cv = 5)
grid_search.fit(X_train, y_train)


ValueError: 
All the 660 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 862, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_data.py", line 427, in fit
    return self.partial_fit(X, y)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_data.py", line 466, in partial_fit
    X = self._validate_data(
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 546, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 2064, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Private'

--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 862, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_data.py", line 824, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_data.py", line 861, in partial_fit
    X = self._validate_data(
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 546, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/yurititov/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 2064, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Private'


In [37]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'model': Lasso(alpha=0.01), 'model__alpha': 0.01, 'preprocessor__degree': 3, 'scaler': StandardScaler()}
0.5765927067516305


In [38]:
pipe_tuned = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('preprocessor', PolynomialFeatures(degree=3)),
    ('model', Lasso(alpha = 0.01))
])
pipe_tuned.fit(X_train, y_train)
r2_pipe_tuned = r2_score(y_test, pipe_tuned.predict(X_test))
print(f'R2: {r2_pipe_tuned}')

R2: 0.16751805961152688


  model = cd_fast.enet_coordinate_descent(


http://archive.ics.uci.edu/ml/datasets/Adult

In [39]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)
#data = pd.read_csv(link)

In [40]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [41]:
X = data.drop(columns=14)
y = data[14]

In [42]:
y = y.map({'<=50K': 0, '>50K': 1})

9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [43]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       48842 non-null  int64 
 1   1       48842 non-null  object
 2   2       48842 non-null  int64 
 3   3       48842 non-null  object
 4   4       48842 non-null  int64 
 5   5       48842 non-null  object
 6   6       48842 non-null  object
 7   7       48842 non-null  object
 8   8       48842 non-null  object
 9   9       48842 non-null  object
 10  10      48842 non-null  int64 
 11  11      48842 non-null  int64 
 12  12      48842 non-null  int64 
 13  13      48842 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [44]:
X.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

Пропущенных значений в численных признаках нет. Посмотрим какие значения у нас в категориальных признаках

10. Выберите колонки с числовыми и категориальными переменными.

In [45]:
num_cols = X.select_dtypes(include='int64').columns.to_list()
cat_cols = X.select_dtypes(include = 'object').columns.to_list()

11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [46]:
num_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('scaler', MinMaxScaler())
    ])

cat_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
    ])

preprocessor = ColumnTransformer(
    transformers= [
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [47]:
y.value_counts()

0    37155
1    11687
Name: 14, dtype: int64

The most frequent class is '0'

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= RANDOM_STATE)

In [49]:
classifiers = [
    LogisticRegression(max_iter = 1000),
    SVC(),
    LinearSVC()
]

for classifier in classifiers:
    pipe = Pipeline(steps= [
        ('preprocessor', preprocessor),
        ('model', classifier)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    y_pred_mf = []

    for i, p in enumerate(y_pred):
        if y_test.iloc[i] == 0:
            y_pred_mf.append(p)

    y_test_mf = [0] * len(y_pred_mf)
    print()

    accuracy = accuracy_score(y_test, y_pred)
    mf_accuracy = accuracy_score(y_test_mf, y_pred_mf)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mf_precision = precision_score(y_test_mf, y_pred_mf)
    recall = recall_score(y_test, y_pred)
    mf_recall = recall_score(y_test_mf, y_pred_mf)


    mf_f1 = f1_score(y_test_mf, y_pred_mf)
    print(f'Model: {classifier}')
    print(f'F1 score: {f1 = :.3f}')
    print(f'Accuracy: {accuracy = :.3f}')
    print(f'Precision: {precision = :.3f}')
    print(f'Recall: {recall = :.3f}')
    
    print(f'F1 score for most frequent class: {mf_f1 = :.3f}')
    print(f'Accuracy for most frequent class: {mf_accuracy = :.3f}')
    print(f'Precision for most frequent class: {mf_precision = :.3f}')
    print(f'Recall for most frequent class: {mf_recall = :.3f}')
    print()


Model: LogisticRegression(max_iter=1000)
F1 score: f1 = 0.648
Accuracy: accuracy = 0.849
Precision: precision = 0.737
Recall: recall = 0.578
F1 score for most frequent class: mf_f1 = 0.000
Accuracy for most frequent class: mf_accuracy = 0.935
Precision for most frequent class: mf_precision = 0.000
Recall for most frequent class: mf_recall = 0.000



  _warn_prf(average, modifier, msg_start, len(result))



Model: SVC()
F1 score: f1 = 0.614
Accuracy: accuracy = 0.839
Precision: precision = 0.729
Recall: recall = 0.531
F1 score for most frequent class: mf_f1 = 0.000
Accuracy for most frequent class: mf_accuracy = 0.937
Precision for most frequent class: mf_precision = 0.000
Recall for most frequent class: mf_recall = 0.000



  _warn_prf(average, modifier, msg_start, len(result))



Model: LinearSVC()
F1 score: f1 = 0.651
Accuracy: accuracy = 0.851
Precision: precision = 0.746
Recall: recall = 0.577
F1 score for most frequent class: mf_f1 = 0.000
Accuracy for most frequent class: mf_accuracy = 0.938
Precision for most frequent class: mf_precision = 0.000
Recall for most frequent class: mf_recall = 0.000



  _warn_prf(average, modifier, msg_start, len(result))


13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [50]:
def crossval(model, X, y, cv = 5):
    accuracy = cross_val_score(model, X, y, scoring= 'accuracy', cv = cv).mean()
    f1 = cross_val_score(model, X, y, scoring= 'f1', cv = cv).mean()
    print(f'F1 score: {f1 = :.3f}')
    print(f'Accuracy: {accuracy = :.3f}')

In [51]:
for classifier in classifiers:
    print(f'Model: {classifier}')
    pipe = Pipeline(steps= [
            ('preprocessor', preprocessor),
            ('model', classifier)
        ])
    crossval(pipe, X, y)

Model: LogisticRegression(max_iter=1000)
F1 score: f1 = 0.656
Accuracy: accuracy = 0.851
Model: SVC()
F1 score: f1 = 0.620
Accuracy: accuracy = 0.840
Model: LinearSVC()
F1 score: f1 = 0.658
Accuracy: accuracy = 0.853


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [52]:
X.replace('?', np.NaN, inplace=True)
X.isna().sum()

0        0
1     2799
2        0
3        0
4        0
5        0
6     2809
7        0
8        0
9        0
10       0
11       0
12       0
13     857
dtype: int64

15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [53]:
for classifier in classifiers:
    print(f'Model: {classifier}')
    pipe = Pipeline(steps= [
            ('preprocessor', preprocessor),
            ('model', classifier)
        ])
    crossval(pipe, X, y)

Model: LogisticRegression(max_iter=1000)
F1 score: f1 = 0.654
Accuracy: accuracy = 0.851
Model: SVC()
F1 score: f1 = 0.617
Accuracy: accuracy = 0.840
Model: LinearSVC()
F1 score: f1 = 0.652
Accuracy: accuracy = 0.851


Результат стал немного хуже

16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [54]:
new_data = data.replace("?", np.NaN)

In [55]:
new_data.dropna(inplace= True)

In [56]:
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [57]:
X_new = new_data.drop(columns=[14])
y_new = new_data[14]

In [58]:
y_new = y_new.map({'<=50K': 0, '>50K': 1})

In [59]:
for classifier in classifiers:
    print(f'Model: {classifier}')
    pipe = Pipeline(steps= [
            ('preprocessor', preprocessor),
            ('model', classifier)
        ])
    crossval(pipe, X_new, y_new)

Model: LogisticRegression(max_iter=1000)
F1 score: f1 = 0.660
Accuracy: accuracy = 0.847
Model: SVC()
F1 score: f1 = 0.627
Accuracy: accuracy = 0.836
Model: LinearSVC()
F1 score: f1 = 0.662
Accuracy: accuracy = 0.849


Точность модели улучшилась, но при этом f1_score уменьшился

 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [60]:
new_classifiers = [RandomForestClassifier(), GradientBoostingClassifier()]

In [61]:
for classifier in new_classifiers:
    print(f'Model: {classifier}')
    pipe = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', classifier)
    ])
    crossval(pipe, X_new, y_new)

Model: RandomForestClassifier()
F1 score: f1 = 0.671
Accuracy: accuracy = 0.849
Model: GradientBoostingClassifier()
F1 score: f1 = 0.687
Accuracy: accuracy = 0.863


18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [62]:
num_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers= [
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)
final_pipe = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

In [63]:
scores = {'accuracy': 'accuracy', 'F1 score': 'f1'}
param_grid = {
    'preprocessor__num__imputer__strategy': ['most_frequent', 'mean', 'median'],
    'preprocessor__num__scaler': [MinMaxScaler(), StandardScaler(with_mean=False), MaxAbsScaler()],
    'preprocessor__cat__encoder': [OneHotEncoder(handle_unknown="ignore")],#, LabelEncoder()],
    'model': [LogisticRegression(max_iter = 1000), SVC(), LinearSVC(), RandomForestClassifier(), GradientBoostingClassifier()]
}

In [64]:
grid_search = GridSearchCV(final_pipe, param_grid, scoring=scores, refit= 'F1 score', cv=5)
#grid_search = GridSearchCV(final_pipe, param_grid, scoring='accuracy', refit= False, cv=5)
grid_search.fit(X, y)
print("Best params: ", grid_search.best_params_)
print("Best scores: ", grid_search.best_score_)



Best params:  {'model': GradientBoostingClassifier(), 'preprocessor__cat__encoder': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__num__imputer__strategy': 'most_frequent', 'preprocessor__num__scaler': MinMaxScaler()}
Best scores:  0.683188275892817


In [65]:
num_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers= [
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)
final_pipe = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier())
])

In [66]:
final_pipe.fit(X_train, y_train)
r2 = r2_score(y_test, final_pipe.predict(X_test))
accuracy = accuracy_score(y_test, final_pipe.predict(X_test))
f1 = f1_score(y_test, final_pipe.predict(X_test))

print(f'R2: {r2 = :.3f}')
print(f'Accuracy: {accuracy = :.3f}')
print(f'F1 score: {f1= :.3f}')

R2: r2 = 0.272
Accuracy: accuracy = 0.867
F1 score: f1= 0.683
