In [1]:
!pip install catboost lightgbm xgboost scikit-learn pandas
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
import joblib

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


# education class 1:  personal_development

In [None]:
# Tune four classifiers for the 'personal development' target on the rows with
# education == 1, save every tuned model to disk and keep the one with the best
# F1-score on the held-out test split.
df = pd.read_csv('personal development edu.csv')

df_filtered = df[df['education'] == 1].copy()

if len(df_filtered) == 0:
    raise ValueError("Нет данных с education=1. Проверьте входные данные.")

# Features are everything except the target, the building id and the coordinates.
X = df_filtered.drop(['personal development', 'building_id', 'y', 'x'], axis=1)
y = df_filtered['personal development']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Negative/positive ratio offered to XGBoost as an imbalance weight. Fail early
# instead of dividing by zero when the training split holds no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("В обучающей выборке нет объектов класса 1. Проверьте входные данные.")

param_grids = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced', None]
        }
    },
    'XGBoost': {
        # `use_label_encoder` is dropped: the recorded run reports it as
        # "not used" by the installed XGBoost.
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'scale_pos_weight': [1, n_negative / n_positive]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1],
            'class_weight': ['balanced', None]
        }
    }
}

# The "no weighting" option is spelled as the string 'None'. GridSearchCV passes
# grid values through set_params, and a Python None there is presumably what
# made half of the CatBoost fits fail in the recorded run (36 of 72).
# TODO confirm on the next re-run.
catboost_params = {
    'iterations': [50, 100],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'auto_class_weights': ['Balanced', 'None']
}

results = {}
# Tuned estimators are kept in memory so the winner does not have to be
# re-read from disk after the comparison.
fitted_models = {}

for name, config in param_grids.items():
    print(f"\nПодбор параметров для {name}...")
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    joblib.dump(best_model, f'{name.lower().replace(" ", "_")}_best_model.pkl')
    fitted_models[name] = best_model

    results[name] = {
        'best_params': grid_search.best_params_,
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

    print(f"Лучшие параметры {name}: {grid_search.best_params_}")
    print(f"F1-score: {results[name]['f1_score']:.4f}")
    print(f"Accuracy: {results[name]['accuracy']:.4f}")


# CatBoost is tuned separately because it is told which columns are categorical.
print("\nПодбор параметров для CatBoost...")
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
grid_search_cb = GridSearchCV(
    estimator=CatBoostClassifier(
        random_state=42,
        cat_features=cat_features,
        verbose=0,
        eval_metric='F1'
    ),
    param_grid=catboost_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid_search_cb.fit(X_train, y_train)

best_cb = grid_search_cb.best_estimator_
y_pred_cb = best_cb.predict(X_test)

joblib.dump(best_cb, 'catboost_best_model.pkl')
fitted_models['CatBoost'] = best_cb

results['CatBoost'] = {
    'best_params': grid_search_cb.best_params_,
    'f1_score': f1_score(y_test, y_pred_cb),
    'accuracy': accuracy_score(y_test, y_pred_cb),
    'report': classification_report(y_test, y_pred_cb, zero_division=0)
}

print("\nРезультаты подбора параметров (оптимизация по F1-score):")
for name, res in results.items():
    print(f"\n{name}:")
    print(f"Лучшие параметры: {res['best_params']}")
    print(f"F1-score: {res['f1_score']:.4f}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print("Отчет классификации:\n", res['report'])

# The winner is the model with the highest test F1; ties go to the first one inserted.
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = fitted_models[best_model_name]
joblib.dump(best_model, 'best_overall_model.pkl')
print(f"\nЛучшая модель: {best_model_name} с F1-score {results[best_model_name]['f1_score']:.4f}")


Подбор параметров для Random Forest...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Лучшие параметры Random Forest: {'class_weight': None, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
F1-score: 0.9394
Accuracy: 0.8857

Подбор параметров для XGBoost...
Fitting 3 folds for each of 54 candidates, totalling 162 fits


Parameters: { "use_label_encoder" } are not used.



Лучшие параметры XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'scale_pos_weight': 1}
F1-score: 0.9394
Accuracy: 0.8857

Подбор параметров для LightGBM...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[LightGBM] [Info] Number of positive: 115, number of negative: 21
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 136, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.845588 -> initscore=1.700410
[LightGBM] [Info] Start training from score 1.700410
Лучшие параметры LightGBM: {'class_weight': None, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
F1-score: 0.9394
Accuracy: 0.8857

Подбор параметров для CatBoost...
Fitting 3 folds for each of 24 cand

36 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

In [None]:
# Score every education == 1 row with the tuned CatBoost model and export the
# rows, together with the predictions, for use in QGIS.
# NOTE(review): this cell loads the CatBoost model, whereas the later sections
# load their 'best_overall_model*.pkl' file - confirm that this is intended.
best_model = joblib.load('catboost_best_model.pkl')

# Same feature selection as in training: target, id and coordinates removed.
X_all = df_filtered.drop(columns=['personal development', 'building_id', 'y', 'x'])
predictions = best_model.predict(X_all)
probabilities = best_model.predict_proba(X_all)[:, 1]  # second predict_proba column

# Work on a copy so df_filtered itself stays free of prediction columns.
results_df = df_filtered.assign(
    predicted_class=predictions,
    prediction_probability=probabilities,
)

results_df.to_csv('qgis_full_features_export.csv', index=False)
print("Результаты со всеми признаками сохранены в qgis_full_features_export.csv")

Результаты со всеми признаками сохранены в qgis_full_features_export.csv


application to new data

In [None]:
# Apply the tuned CatBoost model to a new dataset. Only rows with
# education == 1 are scored; all other rows keep class 0 and probability 0.
new_data = pd.read_csv('data_high14.csv')

# One mask is used both for selecting the rows and for writing the predictions
# back, so the two can never get out of step.
education_mask = new_data['education'] == 1
new_data_filtered = new_data[education_mask].copy()

if len(new_data_filtered) == 0:
    raise ValueError("Нет данных с education=1. Проверьте входные данные.")

best_model = joblib.load('catboost_best_model.pkl')

# The target column may be missing from new data, so drop only what is present.
cols_to_drop = ['personal development', 'building_id', 'y', 'x']
X_new = new_data_filtered.drop([col for col in cols_to_drop if col in new_data_filtered.columns], axis=1)

predictions = best_model.predict(X_new)
probabilities = best_model.predict_proba(X_new)[:, 1]

full_results = new_data.copy()

full_results['predicted_class'] = 0
# Start as float: filling an integer column with probabilities is an
# incompatible-dtype assignment in recent pandas.
full_results['prediction_probability'] = 0.0
full_results.loc[education_mask, 'predicted_class'] = predictions
full_results.loc[education_mask, 'prediction_probability'] = probabilities

full_results.to_csv('full_predictions_with_all_data.csv', index=False)
print("Полные результаты сохранены в full_predictions_with_all_data.csv")

# The QGIS export lists id, coordinates and predictions first, then the rest.
if {'y', 'x'}.issubset(full_results.columns):
    qgis_cols = ['building_id', 'y', 'x', 'predicted_class', 'prediction_probability'] + \
                [col for col in new_data.columns if col not in cols_to_drop]
    full_results[qgis_cols].to_csv('qgis_full_export.csv', index=False)
    print("Данные для QGIS сохранены в qgis_full_export.csv")

#  education class 2: else_edu

In [None]:
# Tune four classifiers for the 'else' target on the rows with education == 1,
# save every tuned model to disk and keep the one with the best F1-score on the
# held-out test split.
df = pd.read_csv('else_edu.csv')

df_filtered = df[df['education'] == 1].copy()

if len(df_filtered) == 0:
    raise ValueError("Нет данных с education=1. Проверьте входные данные.")

# Features are everything except the target, the building id and the coordinates.
X = df_filtered.drop(['else', 'building_id', 'y', 'x'], axis=1)
y = df_filtered['else']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the training column list next to the models.
joblib.dump(X_train.columns.tolist(), 'train_features.pkl')

# Negative/positive ratio offered to XGBoost as an imbalance weight. Fail early
# instead of dividing by zero when the training split holds no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("В обучающей выборке нет объектов класса 1. Проверьте входные данные.")

param_grids = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced', None]
        }
    },
    'XGBoost': {
        # `use_label_encoder` is dropped: the recorded run reports it as
        # "not used" by the installed XGBoost.
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'scale_pos_weight': [1, n_negative / n_positive]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1],
            'class_weight': ['balanced', None]
        }
    }
}

# The "no weighting" option is spelled as the string 'None'. GridSearchCV passes
# grid values through set_params, and a Python None there is presumably what
# made half of the CatBoost fits fail in the recorded run (36 of 72).
# TODO confirm on the next re-run.
catboost_params = {
    'iterations': [50, 100],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'auto_class_weights': ['Balanced', 'None']
}

results = {}
# Tuned estimators are kept in memory so the winner does not have to be
# re-read from disk after the comparison.
fitted_models = {}

for name, config in param_grids.items():
    print(f"\nПодбор параметров для {name}...")
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    joblib.dump(best_model, f'{name.lower().replace(" ", "_")}_best_model1.pkl')
    fitted_models[name] = best_model

    results[name] = {
        'best_params': grid_search.best_params_,
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

    print(f"Лучшие параметры {name}: {grid_search.best_params_}")
    print(f"F1-score: {results[name]['f1_score']:.4f}")
    print(f"Accuracy: {results[name]['accuracy']:.4f}")

# CatBoost is tuned separately because it is told which columns are categorical.
print("\nПодбор параметров для CatBoost...")
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
grid_search_cb = GridSearchCV(
    estimator=CatBoostClassifier(
        random_state=42,
        cat_features=cat_features,
        verbose=0,
        eval_metric='F1'
    ),
    param_grid=catboost_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid_search_cb.fit(X_train, y_train)

best_cb = grid_search_cb.best_estimator_
y_pred_cb = best_cb.predict(X_test)

joblib.dump(best_cb, 'catboost_best_model1.pkl')
fitted_models['CatBoost'] = best_cb

results['CatBoost'] = {
    'best_params': grid_search_cb.best_params_,
    'f1_score': f1_score(y_test, y_pred_cb),
    'accuracy': accuracy_score(y_test, y_pred_cb),
    'report': classification_report(y_test, y_pred_cb, zero_division=0)
}

print("\nРезультаты подбора параметров (оптимизация по F1-score):")
for name, res in results.items():
    print(f"\n{name}:")
    print(f"Лучшие параметры: {res['best_params']}")
    print(f"F1-score: {res['f1_score']:.4f}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print("Отчет классификации:\n", res['report'])

# The winner is the model with the highest test F1; ties go to the first one inserted.
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = fitted_models[best_model_name]
joblib.dump(best_model, 'best_overall_model1.pkl')
print(f"\nЛучшая модель: {best_model_name} с F1-score {results[best_model_name]['f1_score']:.4f}")


Подбор параметров для Random Forest...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Лучшие параметры Random Forest: {'class_weight': None, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
F1-score: 0.9091
Accuracy: 0.9714

Подбор параметров для XGBoost...
Fitting 3 folds for each of 54 candidates, totalling 162 fits


Parameters: { "use_label_encoder" } are not used.



Лучшие параметры XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 1}
F1-score: 0.9091
Accuracy: 0.9714

Подбор параметров для LightGBM...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[LightGBM] [Info] Number of positive: 27, number of negative: 109
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 136, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Лучшие параметры LightGBM: {'class_weight': 'balanced', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
F1-score: 0.9091
Accuracy: 0.9714

Подбор параметров для CatBoost...
Fitting 3 folds for each o

36 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

In [None]:
# Score every education == 1 row with the selected best model for the 'else'
# target and export the rows, together with the predictions, for use in QGIS.
best_model = joblib.load('best_overall_model1.pkl')

# Same feature selection as in training: target, id and coordinates removed.
X_all = df_filtered.drop(columns=['else', 'building_id', 'y', 'x'])
predictions = best_model.predict(X_all)
probabilities = best_model.predict_proba(X_all)[:, 1]  # second predict_proba column

# Work on a copy so df_filtered itself stays free of prediction columns.
results_df = df_filtered.assign(
    predicted_class=predictions,
    prediction_probability=probabilities,
)

results_df.to_csv('qgis_full_features_export1.csv', index=False)
print("Результаты со всеми признаками сохранены в qgis_full_features_export1.csv")

# Read the file back to check which columns were actually written.
print("\nПроверка содержимого CSV:")
print(pd.read_csv('qgis_full_features_export1.csv').columns)

Результаты со всеми признаками сохранены в qgis_full_features_export1.csv

Проверка содержимого CSV:
Index(['personal_development', 'else', 'education', 'food', 'commerce',
       'finance', 'health', 'recreation', 'kindergarten', 'school', 'edu_buf',
       'food_buf', 'fin_buf', 'commerce_buf', 'health_buf', 'subway_buf',
       'bus_buf', 'tram_stop', 'building_id', 'y', 'x', 'predicted_class',
       'prediction_probability'],
      dtype='object')


application to new data

In [None]:
# Apply the best 'else' model to a new dataset. Only rows with education == 1
# are scored; all other rows keep class 0 and probability 0.
new_data = pd.read_csv('data_high15.csv')

# One mask is used both for selecting the rows and for writing the predictions
# back, so the two can never get out of step.
education_mask = new_data['education'] == 1
new_data_filtered = new_data[education_mask].copy()

if len(new_data_filtered) == 0:
    raise ValueError("Нет данных с education=1. Проверьте входные данные.")

best_model = joblib.load('best_overall_model1.pkl')

# The target column may be missing from new data, so drop only what is present.
cols_to_drop = ['else', 'building_id', 'y', 'x']
X_new = new_data_filtered.drop([col for col in cols_to_drop if col in new_data_filtered.columns], axis=1)
# Use exactly the columns, in the order, that the training cell saved to
# 'train_features.pkl'; a missing column surfaces here as a KeyError.
X_new = X_new[joblib.load('train_features.pkl')]

predictions = best_model.predict(X_new)
probabilities = best_model.predict_proba(X_new)[:, 1]

full_results = new_data.copy()

full_results['predicted_class'] = 0
# Start as float: filling an integer column with probabilities is an
# incompatible-dtype assignment in recent pandas.
full_results['prediction_probability'] = 0.0
full_results.loc[education_mask, 'predicted_class'] = predictions
full_results.loc[education_mask, 'prediction_probability'] = probabilities

full_results.to_csv('full_predictions_with_all_data1.csv', index=False)
print("Полные результаты сохранены в full_predictions_with_all_data1.csv")

# The QGIS export lists id, coordinates and predictions first, then the rest.
if {'y', 'x'}.issubset(full_results.columns):
    qgis_cols = ['building_id', 'y', 'x', 'predicted_class', 'prediction_probability'] + \
                [col for col in new_data.columns if col not in cols_to_drop]
    full_results[qgis_cols].to_csv('qgis_full_export1.csv', index=False)
    print("Данные для QGIS сохранены в qgis_full_export1.csv")

# Food|eat

In [None]:
# Tune four classifiers for the 'eat' target on the rows with food == 1, save
# every tuned model to disk and keep the one with the best F1-score on the
# held-out test split.
df = pd.read_csv('eat.csv')

df_filtered = df[df['food'] == 1].copy()

if len(df_filtered) == 0:
    raise ValueError("Нет данных с food=1. Проверьте входные данные.")

# Features are everything except the target, the building id and the coordinates.
X = df_filtered.drop(['eat', 'building_id', 'y', 'x'], axis=1)
y = df_filtered['eat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the training column list next to the models.
joblib.dump(X_train.columns.tolist(), 'train_features_eat.pkl')

# Negative/positive ratio offered to XGBoost as an imbalance weight. Fail early
# instead of dividing by zero when the training split holds no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("В обучающей выборке нет объектов класса 1. Проверьте входные данные.")

param_grids = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced', None]
        }
    },
    'XGBoost': {
        # `use_label_encoder` is dropped: the runs recorded earlier in this
        # notebook report it as "not used" by the installed XGBoost.
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'scale_pos_weight': [1, n_negative / n_positive]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1],
            'class_weight': ['balanced', None]
        }
    }
}

# The "no weighting" option is spelled as the string 'None'. GridSearchCV passes
# grid values through set_params, and a Python None there is presumably what
# made half of the CatBoost fits fail in the runs recorded earlier in this
# notebook (36 of 72). TODO confirm on the next re-run.
catboost_params = {
    'iterations': [50, 100],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'auto_class_weights': ['Balanced', 'None']
}

results = {}
# Tuned estimators are kept in memory so the winner does not have to be
# re-read from disk after the comparison.
fitted_models = {}

for name, config in param_grids.items():
    print(f"\nПодбор параметров для {name}...")
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    joblib.dump(best_model, f'{name.lower().replace(" ", "_")}_best_model_eat.pkl')
    fitted_models[name] = best_model

    results[name] = {
        'best_params': grid_search.best_params_,
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

    print(f"Лучшие параметры {name}: {grid_search.best_params_}")
    print(f"F1-score: {results[name]['f1_score']:.4f}")
    print(f"Accuracy: {results[name]['accuracy']:.4f}")

# CatBoost is tuned separately because it is told which columns are categorical.
print("\nПодбор параметров для CatBoost...")
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
grid_search_cb = GridSearchCV(
    estimator=CatBoostClassifier(
        random_state=42,
        cat_features=cat_features,
        verbose=0,
        eval_metric='F1'
    ),
    param_grid=catboost_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid_search_cb.fit(X_train, y_train)

best_cb = grid_search_cb.best_estimator_
y_pred_cb = best_cb.predict(X_test)

joblib.dump(best_cb, 'catboost_best_model_eat.pkl')
fitted_models['CatBoost'] = best_cb

results['CatBoost'] = {
    'best_params': grid_search_cb.best_params_,
    'f1_score': f1_score(y_test, y_pred_cb),
    'accuracy': accuracy_score(y_test, y_pred_cb),
    'report': classification_report(y_test, y_pred_cb, zero_division=0)
}

print("\nРезультаты подбора параметров (оптимизация по F1-score):")
for name, res in results.items():
    print(f"\n{name}:")
    print(f"Лучшие параметры: {res['best_params']}")
    print(f"F1-score: {res['f1_score']:.4f}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print("Отчет классификации:\n", res['report'])

# The winner is the model with the highest test F1; ties go to the first one inserted.
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = fitted_models[best_model_name]
joblib.dump(best_model, 'best_overall_model_eat.pkl')
print(f"\nЛучшая модель: {best_model_name} с F1-score {results[best_model_name]['f1_score']:.4f}")

In [None]:
# Score every food == 1 row with the selected best model for the 'eat' target
# and export the rows, together with the predictions, for use in QGIS.
best_model = joblib.load('best_overall_model_eat.pkl')

# Same feature selection as in training: target, id and coordinates removed.
X_all = df_filtered.drop(columns=['eat', 'building_id', 'y', 'x'])
predictions = best_model.predict(X_all)
probabilities = best_model.predict_proba(X_all)[:, 1]  # second predict_proba column

# Work on a copy so df_filtered itself stays free of prediction columns.
results_df = df_filtered.assign(
    predicted_class=predictions,
    prediction_probability=probabilities,
)

results_df.to_csv('qgis_full_features_eat.csv', index=False)
print("Результаты со всеми признаками сохранены в qgis_full_features_eat.csv")

# Read the file back to check which columns were actually written.
print("\nПроверка содержимого CSV:")
print(pd.read_csv('qgis_full_features_eat.csv').columns)

application to new data

In [None]:
# Apply the best 'eat' model to a new dataset. Only rows with food == 1 are
# scored; all other rows keep class 0 and probability 0.
new_data = pd.read_csv('data_high16.csv')

# One mask is used both for selecting the rows and for writing the predictions
# back. The write-back previously used an `education == 1` mask, which does
# not describe the rows that were actually scored.
food_mask = new_data['food'] == 1
new_data_filtered = new_data[food_mask].copy()

if len(new_data_filtered) == 0:
    raise ValueError("Нет данных с food=1. Проверьте входные данные.")

best_model = joblib.load('best_overall_model_eat.pkl')

# The target column may be missing from new data, so drop only what is present.
cols_to_drop = ['eat', 'building_id', 'y', 'x']
X_new = new_data_filtered.drop([col for col in cols_to_drop if col in new_data_filtered.columns], axis=1)
# Use exactly the columns, in the order, that the training cell saved to
# 'train_features_eat.pkl'; a missing column surfaces here as a KeyError.
X_new = X_new[joblib.load('train_features_eat.pkl')]

predictions = best_model.predict(X_new)
probabilities = best_model.predict_proba(X_new)[:, 1]

full_results = new_data.copy()

full_results['predicted_class'] = 0
# Start as float: filling an integer column with probabilities is an
# incompatible-dtype assignment in recent pandas.
full_results['prediction_probability'] = 0.0
full_results.loc[food_mask, 'predicted_class'] = predictions
full_results.loc[food_mask, 'prediction_probability'] = probabilities

full_results.to_csv('full_predictions_with_all_data_eat.csv', index=False)
print("Полные результаты сохранены в full_predictions_with_all_data_eat.csv")

# The QGIS export lists id, coordinates and predictions first, then the rest.
if {'y', 'x'}.issubset(full_results.columns):
    qgis_cols = ['building_id', 'y', 'x', 'predicted_class', 'prediction_probability'] + \
                [col for col in new_data.columns if col not in cols_to_drop]
    full_results[qgis_cols].to_csv('qgis_full_export_eat.csv', index=False)
    print("Данные для QGIS сохранены в qgis_full_export_eat.csv")

# Food|drink

In [None]:
# Tune four classifiers for the 'drink' target on the rows with food == 1, save
# every tuned model to disk and keep the one with the best F1-score on the
# held-out test split.
df = pd.read_csv('drink.csv')

df_filtered = df[df['food'] == 1].copy()

if len(df_filtered) == 0:
    raise ValueError("Нет данных с food=1. Проверьте входные данные.")

# Features are everything except the target, the building id and the coordinates.
X = df_filtered.drop(['drink', 'building_id', 'y', 'x'], axis=1)
y = df_filtered['drink']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the training column list next to the models.
joblib.dump(X_train.columns.tolist(), 'train_features_drink.pkl')

# Negative/positive ratio offered to XGBoost as an imbalance weight. Fail early
# instead of dividing by zero when the training split holds no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("В обучающей выборке нет объектов класса 1. Проверьте входные данные.")

param_grids = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced', None]
        }
    },
    'XGBoost': {
        # `use_label_encoder` is dropped: the runs recorded earlier in this
        # notebook report it as "not used" by the installed XGBoost.
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'scale_pos_weight': [1, n_negative / n_positive]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1],
            'class_weight': ['balanced', None]
        }
    }
}

# The "no weighting" option is spelled as the string 'None'. GridSearchCV passes
# grid values through set_params, and a Python None there is presumably what
# made half of the CatBoost fits fail in the runs recorded earlier in this
# notebook (36 of 72). TODO confirm on the next re-run.
catboost_params = {
    'iterations': [50, 100],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'auto_class_weights': ['Balanced', 'None']
}

results = {}
# Tuned estimators are kept in memory so the winner does not have to be
# re-read from disk after the comparison.
fitted_models = {}

for name, config in param_grids.items():
    print(f"\nПодбор параметров для {name}...")
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    joblib.dump(best_model, f'{name.lower().replace(" ", "_")}_best_model_drink.pkl')
    fitted_models[name] = best_model

    results[name] = {
        'best_params': grid_search.best_params_,
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

    print(f"Лучшие параметры {name}: {grid_search.best_params_}")
    print(f"F1-score: {results[name]['f1_score']:.4f}")
    print(f"Accuracy: {results[name]['accuracy']:.4f}")

# CatBoost is tuned separately because it is told which columns are categorical.
print("\nПодбор параметров для CatBoost...")
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
grid_search_cb = GridSearchCV(
    estimator=CatBoostClassifier(
        random_state=42,
        cat_features=cat_features,
        verbose=0,
        eval_metric='F1'
    ),
    param_grid=catboost_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid_search_cb.fit(X_train, y_train)

best_cb = grid_search_cb.best_estimator_
y_pred_cb = best_cb.predict(X_test)

joblib.dump(best_cb, 'catboost_best_model_drink.pkl')
fitted_models['CatBoost'] = best_cb

results['CatBoost'] = {
    'best_params': grid_search_cb.best_params_,
    'f1_score': f1_score(y_test, y_pred_cb),
    'accuracy': accuracy_score(y_test, y_pred_cb),
    'report': classification_report(y_test, y_pred_cb, zero_division=0)
}

print("\nРезультаты подбора параметров (оптимизация по F1-score):")
for name, res in results.items():
    print(f"\n{name}:")
    print(f"Лучшие параметры: {res['best_params']}")
    print(f"F1-score: {res['f1_score']:.4f}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print("Отчет классификации:\n", res['report'])

# The winner is the model with the highest test F1; ties go to the first one inserted.
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = fitted_models[best_model_name]
joblib.dump(best_model, 'best_overall_model_drink.pkl')
print(f"\nЛучшая модель: {best_model_name} с F1-score {results[best_model_name]['f1_score']:.4f}")

In [None]:
# Score every food == 1 row with the selected best model for the 'drink' target
# and export the rows, together with the predictions, for use in QGIS.
best_model = joblib.load('best_overall_model_drink.pkl')

# Drop the 'drink' target, exactly as the training cell does. This cell used to
# drop 'eat' (copied from the previous section), which left the target inside
# the feature matrix.
X_all = df_filtered.drop(['drink', 'building_id', 'y', 'x'], axis=1)
predictions = best_model.predict(X_all)
probabilities = best_model.predict_proba(X_all)[:, 1]  # second predict_proba column

# Work on a copy so df_filtered itself stays free of prediction columns.
results_df = df_filtered.copy()

results_df['predicted_class'] = predictions
results_df['prediction_probability'] = probabilities

results_df.to_csv('qgis_full_features_drink.csv', index=False)
print("Результаты со всеми признаками сохранены в qgis_full_features_drink.csv")

# Read the file back to check which columns were actually written.
print("\nПроверка содержимого CSV:")
print(pd.read_csv('qgis_full_features_drink.csv').columns)

application to new data

In [None]:
# Apply the best 'drink' model to a new dataset. Only rows with food == 1 are
# scored; all other rows keep class 0 and probability 0.
new_data = pd.read_csv('data_high17.csv')

# One mask is used both for selecting the rows and for writing the predictions
# back. The write-back previously used an `education == 1` mask, which does
# not describe the rows that were actually scored.
food_mask = new_data['food'] == 1
new_data_filtered = new_data[food_mask].copy()

if len(new_data_filtered) == 0:
    raise ValueError("Нет данных с food=1. Проверьте входные данные.")

best_model = joblib.load('best_overall_model_drink.pkl')

# The target column may be missing from new data, so drop only what is present.
cols_to_drop = ['drink', 'building_id', 'y', 'x']
X_new = new_data_filtered.drop([col for col in cols_to_drop if col in new_data_filtered.columns], axis=1)
# Use exactly the columns, in the order, that the training cell saved to
# 'train_features_drink.pkl'; a missing column surfaces here as a KeyError.
X_new = X_new[joblib.load('train_features_drink.pkl')]

predictions = best_model.predict(X_new)
probabilities = best_model.predict_proba(X_new)[:, 1]

full_results = new_data.copy()

full_results['predicted_class'] = 0
# Start as float: filling an integer column with probabilities is an
# incompatible-dtype assignment in recent pandas.
full_results['prediction_probability'] = 0.0
full_results.loc[food_mask, 'predicted_class'] = predictions
full_results.loc[food_mask, 'prediction_probability'] = probabilities

full_results.to_csv('full_predictions_with_all_data_drink.csv', index=False)
print("Полные результаты сохранены в full_predictions_with_all_data_drink.csv")

# The QGIS export lists id, coordinates and predictions first, then the rest.
if {'y', 'x'}.issubset(full_results.columns):
    qgis_cols = ['building_id', 'y', 'x', 'predicted_class', 'prediction_probability'] + \
                [col for col in new_data.columns if col not in cols_to_drop]
    full_results[qgis_cols].to_csv('qgis_full_export_drink.csv', index=False)
    print("Данные для QGIS сохранены в qgis_full_export_drink.csv")

The code for the remaining categories follows the same pattern: only the target variable has to be defined and the corresponding input data files specified. Which target to model depends on the goals of the study.