In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import sklearn as sk
import numpy as np

In [None]:
# Read the IMDb table from the mounted Google Drive folder (Colab path).
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/imdb.csv")
# df = df[:100]  # left disabled: would keep only the first 100 rows

df = (
    df
    # The text markers 'No Votes' / 'No Rate' are turned into NaN.
    .assign(
        Votes=lambda frame: frame['Votes'].replace('No Votes', np.nan),
        Rate=lambda frame: frame['Rate'].replace('No Rate', np.nan),
    )
    # Rows without a rating are discarded.
    .dropna(subset=['Rate'])
    # Votes is written like "1,000": strip the commas, then cast to float (1000.0).
    .assign(
        Votes=lambda frame: frame['Votes'].str.replace(',', '', regex=False).astype(float),
        Rate=lambda frame: frame['Rate'].astype(float),
    )
)

print(df.columns)
print(df.Rate)


Index(['Name', 'Date', 'Rate', 'Votes', 'Genre', 'Duration', 'Type',
       'Certificate', 'Episodes', 'Nudity', 'Violence', 'Profanity', 'Alcohol',
       'Frightening'],
      dtype='object')
0       7.6
1       6.3
2       6.4
3       6.4
4       8.3
       ... 
6173    3.8
6174    8.3
6175    7.3
6176    6.0
6177    6.3
Name: Rate, Length: 5993, dtype: float64


In [None]:
# Show the columns that have fewer than 8 distinct values, with their counts.
print(df.nunique().loc[lambda counts: counts < 8])

Type           2
Nudity         4
Violence       4
Profanity      4
Alcohol        4
Frightening    4
dtype: int64


In [None]:
# Show the columns that contain at least one missing value, with their counts.
print(df.isnull().sum().loc[lambda counts: counts > 0])

Duration        147
Certificate     300
Nudity         1459
Violence        674
Profanity       658
Alcohol         771
Frightening     858
dtype: int64


In [None]:
# Numeric predictors are listed by hand.
features_num = ['Duration', 'Votes']

# Categorical predictors: object-dtype columns with low cardinality
# (fewer than 8 distinct values).
features_cat = [
    cname
    for cname, col_dtype in df.dtypes.items()
    if col_dtype == "object" and df[cname].nunique() < 8
]
print(features_cat)
print(features_num)

features = features_cat + features_num

# Target column plus three views of the predictors:
# numeric only, categorical only, and both together.
y = df.Rate
X_num = df[features_num]
X_cat = df[features_cat]
X = df[features]

# Filled in by the cells below: label of an approach -> its MAE.
results = {}

['Type', 'Nudity', 'Violence', 'Profanity', 'Alcohol', 'Frightening']
['Duration', 'Votes']


In [None]:
# Helper for measuring how well a model does on a given train/test split
def score_dataset(X_train, X_test, y_train, y_test, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its MAE on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature tables used for fitting and for prediction, respectively.
    y_train, y_test
        Target values matching ``X_train`` and ``X_test``.
    n_estimators : int, default 10
        Passed to ``RandomForestRegressor``; the default is the value that
        used to be hard-coded here.
    random_state : int, default 0
        Passed to ``RandomForestRegressor``; the default is the value that
        used to be hard-coded here.

    Returns
    -------
    The value of ``mean_absolute_error(y_test, predictions)``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return mean_absolute_error(y_test, model.predict(X_test))

In [None]:
# a) Drop the numeric columns that contain missing values
X_num = df[features_num]

# 80/20 shuffled split with a fixed random_state.
X_num_train, X_num_test, y_num_train, y_num_test = train_test_split(
    X_num, y, train_size=0.8, test_size=0.2, random_state=0, shuffle=True
)

# Incomplete columns are detected on the training part only.
num_cols_with_missing = X_num_train.columns[X_num_train.isnull().any()].tolist()

reduced_X_train = X_num_train.drop(columns=num_cols_with_missing)
reduced_X_test = X_num_test.drop(columns=num_cols_with_missing)

MAE_num_delete = score_dataset(reduced_X_train, reduced_X_test, y_num_train, y_num_test)
results['(num) Удаление столбцов'] = MAE_num_delete
print(f"MAE (Удаление столбцов):   {MAE_num_delete:.4f}")

MAE (Удаление столбцов):   0.8276


In [None]:
# b) Replace missing values with the most frequent value of each column

# Fit the imputer on the training split only. Fitting on the whole X_num would
# let the held-out test rows influence the fill values, i.e. leak test data
# into preprocessing and bias the reported MAE.
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(X_num_train)

# transform() is wrapped in a DataFrame here; put the original column names
# and row labels back so the frames stay aligned with y_num_train / y_num_test.
imputed_X_train = pd.DataFrame(
    imputer.transform(X_num_train),
    columns=X_num_train.columns,
    index=X_num_train.index,
)
imputed_X_test = pd.DataFrame(
    imputer.transform(X_num_test),
    columns=X_num_test.columns,
    index=X_num_test.index,
)

MAE_num_insert = score_dataset(imputed_X_train, imputed_X_test, y_num_train, y_num_test)
print(f"MAE (Вставка):   {MAE_num_insert:.4f}")
results['(num) Вставка'] = MAE_num_insert

MAE (Вставка):   0.5967


In [None]:
# c) Impute the gaps and additionally record where they were

# For every incomplete column add a "<column>_was_missing" flag that is True
# where the original value was absent.
X_train_extend = X_num_train.assign(
    **{col + '_was_missing': X_num_train[col].isnull() for col in num_cols_with_missing}
)
X_test_extend = X_num_test.assign(
    **{col + '_was_missing': X_num_test[col].isnull() for col in num_cols_with_missing}
)

# The imputer is fitted on the training part and then applied to the test part.
imputer = SimpleImputer(strategy="most_frequent")
imputed_X_train_extend = pd.DataFrame(
    imputer.fit_transform(X_train_extend),
    columns=X_train_extend.columns,  # otherwise the columns would be named 0, 1, 2...
)
imputed_X_test_extend = pd.DataFrame(
    imputer.transform(X_test_extend),
    columns=X_test_extend.columns,
)

MAE_num_insert_2 = score_dataset(imputed_X_train_extend, imputed_X_test_extend, y_num_train, y_num_test)
results['(num) Расширенная вставка'] = MAE_num_insert_2
print(f"MAE (Расширенная вставка):   {MAE_num_insert_2:.4f}")

MAE (Расширенная вставка):   0.5950


In [None]:

# a) Drop the columns that contain missing values

# Split into a training and a test set
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Simplest approach: remove every column that has a gap in the training part
cols_with_missing = X_train_full.columns[X_train_full.isnull().any()]

X_train_full = X_train_full.drop(columns=cols_with_missing)
X_test_full = X_test_full.drop(columns=cols_with_missing)

# Of the surviving columns keep the low-cardinality text ones...
cardinality_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[cname].nunique() < 8
]

# ...and the int64 / float64 ones.
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

my_cols = cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Baseline for categorical handling: leave the object columns out entirely.
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_test = X_test.select_dtypes(exclude=['object'])

MAE_cat_delete = score_dataset(drop_X_train, drop_X_test, y_train, y_test)
results['(cat) Удаление данных'] = MAE_cat_delete
print(f"MAE (Удаление категориальных данных):   {MAE_cat_delete:.4f}")

MAE (Удаление категориальных данных):   0.8276


In [None]:
# Boolean mask over the columns: True for object-dtype (text) columns
s = (X_train.dtypes == 'object')
object_cols = s.index[s].tolist()

# Work on copies so the source frames stay untouched
label_X_train = X_train.copy()
label_X_test = X_test.copy()

# Snapshot of 'Type' before encoding, for the side-by-side printout below
type1 = label_X_test['Type'].copy()

# Ordinal-encode the categorical columns: fit on train, apply to both parts
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[object_cols])
label_X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

# Same column after encoding
type2 = label_X_test['Type'].copy()
comparison_df = pd.DataFrame({'Type1': type1, 'Type2': type2})
print(comparison_df.head())

MAE_coding = score_dataset(label_X_train, label_X_test, y_train, y_test)
results['(cat) Упорядоченное кодирование'] = MAE_coding
print(f"MAE (Упорядоченное кодирование):   {MAE_coding:.4f}")

       Type1  Type2
5257    Film    0.0
560     Film    0.0
5653  Series    1.0
1858    Film    0.0
2775    Film    0.0
MAE (Упорядоченное кодирование):   0.6672


In [None]:
# One-hot encode the categorical features with OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_encoder.fit(X_train[object_cols])

# Encoded blocks keep the row labels of their source frames so they can be
# concatenated with the remaining columns.
OH_cols_train = pd.DataFrame(
    OH_encoder.transform(X_train[object_cols]), index=X_train.index
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(X_test[object_cols]), index=X_test.index
)

# Everything except the categorical columns
num_X_train = X_train.drop(columns=object_cols)
num_X_test = X_test.drop(columns=object_cols)

# Remaining columns first, encoded columns after them
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The encoded columns are labelled 0, 1, ...; make every column label a string
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

MAE_coding_OH = score_dataset(OH_X_train, OH_X_test, y_train, y_test)
results['(cat) Прямое кодирование'] = MAE_coding_OH
print(f"MAE (Прямое кодирование):   {MAE_coding_OH:.4f}")

MAE (Прямое кодирование):   0.6672


In [None]:
# Print every collected MAE as an aligned "label : value" table.
print("Результаты:")
# default=0 keeps this cell from raising ValueError when `results` is still
# empty (e.g. it is run before any of the scoring cells).
max_length = max((len(key) for key in results), default=0)
for key, value in results.items():
    # Pad each label to the longest one so the values line up in a column.
    print(f"{key.ljust(max_length)} : {value:.4f}")

Результаты:
(num) Удаление столбцов         : 0.8276
(num) Вставка                   : 0.5967
(num) Расширенная вставка       : 0.5950
(cat) Удаление данных           : 0.8276
(cat) Упорядоченное кодирование : 0.6672
(cat) Прямое кодирование        : 0.6672


In [None]:
# Numeric columns: fill gaps with the most frequent value.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Categorical columns: fill gaps the same way, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, features_num),
    ('cat', categorical_transformer, features_cat),
])


In [None]:
# 80% of the rows for training, the rest for validation, with a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=0)


def evaluate_model(pipeline):
    """Fit `pipeline` on the training split and return its validation score.

    Uses the notebook-level X_train, y_train, X_valid and y_valid. The returned
    number is whatever `pipeline.score` reports for the validation split.
    """
    return pipeline.fit(X_train, y_train).score(X_valid, y_valid)


# NOTE(review): all three pipelines below receive the same `preprocessor`
# object rather than separate copies.

# Random forest
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=0)),
])
rf_score = evaluate_model(pipeline_rf)
print(f"Случайный лес:   \t{rf_score:.4f}")


# Random forest with 5-fold cross-validation over the full X / y
pipeline_rf_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=0)),
])
rf_cv_score = cross_val_score(pipeline_rf_cv, X, y, cv=5).mean()
print(f"Случайный лес cross-val: {rf_cv_score:.4f}")


# Gradient boosting
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=0)),
])
xgb_score = evaluate_model(pipeline_xgb)
print(f"Градиентный бустинг:   \t{xgb_score:.4f}")


Случайный лес:   	0.5705
Случайный лес cross-val: 0.5152
Градиентный бустинг:   	0.4921
