In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
from sklearn.tree import plot_tree, DecisionTreeClassifier as CART
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import numpy as np
from scipy import stats
from ydata_profiling import ProfileReport

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

In [13]:
# Wczytanie danych
train_data = pd.read_csv('IPA.csv')
test_data = pd.read_csv('IPA_test.csv')


In [14]:
# Przekształcenie kolumny IsIPA na wartości liczbowe (0/1)
train_data['IsIPA'] = train_data['IsIPA'].astype(int)

In [15]:
# Podział danych na zbiór treningowy i walidacyjny
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(['IsIPA', 'UserId'], axis=1),
    train_data['IsIPA'],
    test_size=0.2,
    random_state=42
)

In [16]:
dist = {'n_estimators': stats.randint(100, 400), 'subsample': stats.uniform()}
tuning_res_gbc = RandomizedSearchCV(GBC(random_state=42),
                                    param_distributions=dist,
                                    scoring='accuracy',
                                    n_iter=25,
                                    n_jobs=1,
                                    cv=3,
                                    verbose=2)
tuning_res_gbc.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END .....n_estimators=153, subsample=0.2596745008740101; total time=   0.0s
[CV] END .....n_estimators=153, subsample=0.2596745008740101; total time=   0.0s
[CV] END .....n_estimators=153, subsample=0.2596745008740101; total time=   0.0s
[CV] END .....n_estimators=323, subsample=0.8077107411409182; total time=   0.0s
[CV] END .....n_estimators=323, subsample=0.8077107411409182; total time=   0.0s
[CV] END .....n_estimators=323, subsample=0.8077107411409182; total time=   0.0s
[CV] END ....n_estimators=230, subsample=0.49791904504962026; total time=   0.0s
[CV] END ....n_estimators=230, subsample=0.49791904504962026; total time=   0.0s
[CV] END ....n_estimators=230, subsample=0.49791904504962026; total time=   0.0s
[CV] END .....n_estimators=336, subsample=0.7639755509613092; total time=   0.0s
[CV] END .....n_estimators=336, subsample=0.7639755509613092; total time=   0.0s
[CV] END .....n_estimators=336, subsample=0.7639

ValueError: 
All the 75 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\ensemble\_gb.py", line 429, in fit
    X, y = self._validate_data(
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\utils\validation.py", line 921, in check_array
    _assert_all_finite(
  File "C:\Users\aliak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\utils\validation.py", line 161, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:
print(tuning_res_gbc.best_params_)
Best_GBT = tuning_res_gbc.best_estimator_

In [None]:
# Plot feature importance
feature_importance = Best_GBT.feature_importances_
# Make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
num_feat = 6

plt.figure(figsize=[12, 8])
plt.barh(pos[-num_feat:],
         feature_importance[sorted_idx][-num_feat:],
         align='center',
         alpha=0.75)
plt.yticks(pos[-num_feat:], X_train.columns[sorted_idx][-num_feat:])
plt.xlabel('Relative Importance')
plt.title('Variable Importance');

In [None]:
# Plot feature importance
feature_importance = Best_GBT.feature_importances_
# Make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
num_feat = 6

plt.figure(figsize=[12, 8])
plt.barh(pos[-num_feat:],
         feature_importance[sorted_idx][-num_feat:],
         align='center',
         alpha=0.75)
plt.yticks(pos[-num_feat:], X_train.columns[sorted_idx][-num_feat:])
plt.xlabel('Relative Importance')
plt.title('Variable Importance');

In [None]:
plt.bar(['CART', 'Random Forest', 'Gradient Boosted Trees'],
        accuracies_test,
        color=['red', 'green', 'blue'],
        alpha=0.75)
plt.ylabel('Accuracy');

In [None]:
# Wczytanie danych
train_data = pd.read_csv('IPA.csv')
test_data = pd.read_csv('IPA_test.csv')

# Przekształcenie kolumny IsIPA na wartości liczbowe (0/1)
train_data['IsIPA'] = train_data['IsIPA'].astype(int)

# Podział danych na zbiór treningowy i walidacyjny
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(['IsIPA', 'UserId'], axis=1),
    train_data['IsIPA'],
    test_size=0.2,
    random_state=42
)

# Pipeline do obsługi brakujących danych, skalowania, i klasyfikacji
model_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

# Hiperparametry do optymalizacji
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

# Optymalizacja hiperparametrów
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Wydrukowanie najlepszych parametrów
print("Best Parameters:", grid_search.best_params_)

# Predykcja na zbiorze walidacyjnym
y_val_pred = grid_search.predict(X_val)

# Wartość F1 na zbiorze walidacyjnym
f1_val = f1_score(y_val, y_val_pred)
print("F1 Score on Validation Set:", f1_val)

# Predykcja na zbiorze testowym
test_predictions = grid_search.predict(test_data.drop(['UserId'], axis=1))

# Zapisanie predykcji do pliku CSV
test_data['IsIPA'] = test_predictions
test_data[['UserId', 'IsIPA']].to_csv('IPA_predictions.csv', index=False)