In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the full dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv('data/covtype.csv')

In [3]:
# The 'Cover_Type' column is the prediction target; every other column is a feature.
y = df['Cover_Type']
X = df.drop(columns=['Cover_Type'])

In [4]:
# Evaluate a previously saved CatBoost model on the held-out 20% of the data.
# The split uses the same arguments (test_size, random_state, stratify) as the
# training cell, so both cells score the same test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardization statistics are fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load from the relative path that the training cell writes with save_model,
# instead of a machine-specific absolute path under a user's home directory.
# NOTE(review): confirm this file is the same artifact that used to live in the
# absolute 'models/' directory; the reported metrics of both cells are identical.
model = CatBoostClassifier()
model.load_model('catboost_forest_cover_type.model')

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# The model is not re-saved here: it was just loaded and has not been modified,
# so writing it back would only risk clobbering a freshly trained file.
print(classification_report(y_test, y_pred))

Accuracy: 88.01%
              precision    recall  f1-score   support

           1       0.88      0.85      0.87     42368
           2       0.88      0.91      0.90     56661
           3       0.88      0.91      0.89      7151
           4       0.85      0.83      0.84       549
           5       0.87      0.58      0.69      1899
           6       0.85      0.77      0.80      3473
           7       0.93      0.88      0.90      4102

    accuracy                           0.88    116203
   macro avg       0.88      0.82      0.84    116203
weighted avg       0.88      0.88      0.88    116203



In [5]:
# Train a multiclass CatBoost classifier on standardized features, then report
# accuracy and a per-class breakdown on the stratified 20% hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=[],  # no categorical columns are declared to CatBoost
)

# verbose=200 prints a progress line every 200 iterations.
model.fit(X_train_scaled, y_train, verbose=200)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Persist the fitted model next to the notebook (path is relative to the working directory).
model.save_model('catboost_forest_cover_type.model')

print(classification_report(y_test, y_pred))

0:	learn: 1.6764241	total: 220ms	remaining: 3m 39s
200:	learn: 0.4727552	total: 21.2s	remaining: 1m 24s
400:	learn: 0.3970818	total: 41.8s	remaining: 1m 2s
600:	learn: 0.3548123	total: 1m 3s	remaining: 41.9s
800:	learn: 0.3244698	total: 1m 24s	remaining: 21s
999:	learn: 0.3009443	total: 1m 46s	remaining: 0us
Accuracy: 88.01%
              precision    recall  f1-score   support

           1       0.88      0.85      0.87     42368
           2       0.88      0.91      0.90     56661
           3       0.88      0.91      0.89      7151
           4       0.85      0.83      0.84       549
           5       0.87      0.58      0.69      1899
           6       0.85      0.77      0.80      3473
           7       0.93      0.88      0.90      4102

    accuracy                           0.88    116203
   macro avg       0.88      0.82      0.84    116203
weighted avg       0.88      0.88      0.88    116203



In [None]:
# Export the training partition and two halves of the test partition as CSV files.
# The raw (unscaled) feature frames are written, each with its 'Cover_Type' label
# appended as the last column.

# Halve the hold-out set, keeping class proportions equal in both halves.
X_test_1, X_test_2, y_test_1, y_test_2 = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=42,
)

# Labels are attached by index alignment with the corresponding feature rows.
train_df = pd.DataFrame(X_train, columns=X.columns).assign(Cover_Type=y_train)
test_1_df = pd.DataFrame(X_test_1, columns=X.columns).assign(Cover_Type=y_test_1)
test_2_df = pd.DataFrame(X_test_2, columns=X.columns).assign(Cover_Type=y_test_2)

# index=False keeps the original row index out of the written files.
train_df.to_csv('data/train.csv', index=False)
test_1_df.to_csv('data/test_1.csv', index=False)
test_2_df.to_csv('data/test_2.csv', index=False)