In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

class VacationModel:
    """Toy pipeline that synthesizes vacation data and fits a random forest.

    The class bundles every step of the demo: synthetic data generation,
    one-hot preprocessing, train/test splitting, training, evaluation,
    cross-validation, hyper-parameter search and a sample prediction.

    Note that ``generate_data`` draws the target independently of every
    feature, so accuracy close to ``1 / len(self.target_cities)`` is the
    expected outcome rather than a sign of a broken model.
    """

    def __init__(self):
        # Value pools used to synthesize the categorical features.
        self.cities = ['Bishkek', 'Almaty', 'Moscow', 'London', 'New York']
        self.vacation_types = ['Shopping', 'Beach holiday', 'Cultural', 'Adventure']
        self.transport_types = ['auto', 'plane', 'train', 'ship']
        # Class labels the classifier is trained to predict.
        self.target_cities = ['London', 'Moscow', 'New York', 'Paris']
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)

    def generate_data(self, num_samples: int = 1000, seed=None) -> pd.DataFrame:
        """Build an artificial dataset with one row per simulated person.

        Args:
            num_samples: Number of rows to generate.
            seed: Optional seed. When given, a private ``random.Random(seed)``
                generator is used, so the result is reproducible and the
                global ``random`` state is left untouched. When ``None``
                (default), the global ``random`` module is used, exactly as
                before this parameter existed.

        Returns:
            DataFrame with columns ``salary``, ``city``, ``age``,
            ``vacation_prefer``, ``transport_prefer`` and ``target``.
        """
        # The ``random`` module and ``random.Random`` instances expose the
        # same ``randint``/``choice`` API, so either can serve as the source.
        rng = random if seed is None else random.Random(seed)

        # List displays evaluate left to right, so the draw order per row is
        # salary, city, age, vacation, transport, target.
        rows = [
            [
                rng.randint(30000, 120000),
                rng.choice(self.cities),
                rng.randint(30, 65),
                rng.choice(self.vacation_types),
                rng.choice(self.transport_types),
                rng.choice(self.target_cities),
            ]
            for _ in range(num_samples)
        ]
        return pd.DataFrame(
            rows,
            columns=['salary', 'city', 'age', 'vacation_prefer', 'transport_prefer', 'target'],
        )

    def preprocess_data(self, df: pd.DataFrame):
        """Split *df* into a one-hot encoded feature matrix and the target.

        Args:
            df: DataFrame containing a ``target`` column plus feature columns.

        Returns:
            Tuple ``(X, y)``: ``X`` is every non-target column passed through
            ``pd.get_dummies(..., drop_first=True)``; ``y`` is ``df['target']``.
        """
        # drop_first=True omits one dummy column per categorical feature; the
        # omitted category is represented by all remaining dummies being 0.
        X = pd.get_dummies(df.drop('target', axis=1), drop_first=True)
        y = df['target']
        return X, y

    def train_test_split(self, X, y):
        """Split the data into training (70%) and test (30%) subsets.

        Returns:
            ``X_train, X_test, y_train, y_test`` as produced by scikit-learn.
        """
        # Inside a method body the bare name resolves to the module-level
        # scikit-learn function, not to this method (class scope is not an
        # enclosing scope for functions), so this call is not recursive.
        return train_test_split(X, y, test_size=0.3, random_state=42)

    def train_model(self, X_train, y_train):
        """Fit ``self.model`` on the training data."""
        self.model.fit(X_train, y_train)

    def evaluate_model(self, X_test, y_test):
        """Print accuracy and a classification report for the test data."""
        y_pred = self.model.predict(X_test)
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print("Classification Report:\n", classification_report(y_test, y_pred))

    def cross_validation(self, X, y):
        """Print 5-fold cross-validation scores of ``self.model`` and their mean."""
        cv_scores = cross_val_score(self.model, X, y, cv=5)
        print("Cross-validation scores:", cv_scores)
        print(f"Mean CV score: {cv_scores.mean():.4f}")

    def grid_search(self, X_train, y_train):
        """Run a 5-fold grid search and adopt the best estimator.

        Prints the best parameter combination and replaces ``self.model``
        with the search's ``best_estimator_``.
        """
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
        # Named ``search`` so the local does not shadow this method's name.
        search = GridSearchCV(estimator=self.model, param_grid=param_grid, cv=5)
        search.fit(X_train, y_train)
        print("Best parameters found by GridSearchCV:", search.best_params_)
        self.model = search.best_estimator_

    def predict_random_data(self, X):
        """Predict the vacation city for one sample person.

        Salary and age are drawn at random; the categorical choices are a
        fixed example (beach holiday, auto transport, New York).

        Args:
            X: Feature matrix the model was trained on; only its column
                names and their order are used.

        Returns:
            The predicted label (it is also printed).
        """
        sample = pd.DataFrame({
            'salary': [random.randint(30000, 120000)],
            'age': [random.randint(30, 65)],
            'vacation_prefer_Beach holiday': [1],
            'vacation_prefer_Cultural': [0],
            'vacation_prefer_Shopping': [0],
            'vacation_prefer_Adventure': [0],
            'transport_prefer_auto': [1],
            'transport_prefer_plane': [0],
            'transport_prefer_train': [0],
            'transport_prefer_ship': [0],
            'city_Almaty': [0],
            'city_Bishkek': [0],
            'city_London': [0],
            'city_Moscow': [0],
            'city_New York': [1]  # example city
        })

        # Align with the training matrix in one step: columns the model does
        # not know are discarded, columns absent from the sample are filled
        # with 0, and the training column order is restored.
        sample = sample.reindex(columns=X.columns, fill_value=0)

        random_prediction = self.model.predict(sample)
        print(f"Predicted vacation city: {random_prediction[0]}")
        return random_prediction[0]

# Main block: runs the whole pipeline end to end when executed as a script.
if __name__ == "__main__":
    vacation_model = VacationModel()

    # Generate the synthetic dataset (default of 1000 samples)
    df = vacation_model.generate_data()

    # Preprocess: one-hot encode the features and separate the target
    X, y = vacation_model.preprocess_data(df)

    # Split into training and test subsets
    X_train, X_test, y_train, y_test = vacation_model.train_test_split(X, y)

    # Train the model on the training subset
    vacation_model.train_model(X_train, y_train)

    # Evaluate the trained model on the held-out test subset
    vacation_model.evaluate_model(X_test, y_test)

    # Cross-validate on the full dataset
    vacation_model.cross_validation(X, y)

    # Hyper-parameter search; this replaces the model with the best estimator found
    vacation_model.grid_search(X_train, y_train)

    # Predict for one sample person; X is passed only to supply the column layout
    vacation_model.predict_random_data(X)


Accuracy: 0.2500
Classification Report:
               precision    recall  f1-score   support

      London       0.23      0.27      0.25        75
      Moscow       0.32      0.27      0.29        82
    New York       0.29      0.25      0.27        79
       Paris       0.17      0.20      0.19        64

    accuracy                           0.25       300
   macro avg       0.25      0.25      0.25       300
weighted avg       0.26      0.25      0.25       300

Cross-validation scores: [0.25  0.275 0.245 0.22  0.24 ]
Mean CV score: 0.2460
Best parameters found by GridSearchCV: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Predicted vacation city: London
