In [41]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the cover-type table from the CSV in the local data directory.
df = pd.read_csv(filepath_or_buffer='data/covtype.csv')

In [4]:
# Show the loaded frame; the notebook renders a truncated preview (first/last rows and columns).
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [5]:
# Separate the label column (Cover_Type) from the remaining feature columns.
y = df['Cover_Type']
X = df.drop(columns=['Cover_Type'])

In [4]:
# Evaluate a previously trained CatBoost model on the stratified 20% hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scaling statistics are estimated on the training part only and then applied to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = CatBoostClassifier()

# Load via the project-relative path (the same one the later inference cell uses)
# rather than a machine-specific absolute path, so the notebook runs on any checkout.
model.load_model('models/catboost_forest_cover_type.model')

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# The model is only read in this cell, so it is not written back to disk here.
print(classification_report(y_test, y_pred))

Accuracy: 88.01%
              precision    recall  f1-score   support

           1       0.88      0.85      0.87     42368
           2       0.88      0.91      0.90     56661
           3       0.88      0.91      0.89      7151
           4       0.85      0.83      0.84       549
           5       0.87      0.58      0.69      1899
           6       0.85      0.77      0.80      3473
           7       0.93      0.88      0.90      4102

    accuracy                           0.88    116203
   macro avg       0.88      0.82      0.84    116203
weighted avg       0.88      0.88      0.88    116203



In [5]:
# Train a CatBoost multi-class classifier on the standardized 80% split and report hold-out metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scaling statistics are estimated on the training part only and then applied to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass', cat_features=[])

# verbose=200 prints the training loss every 200 iterations.
model.fit(X_train_scaled, y_train, verbose=200)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Save to the location the inference cell loads from ('models/...'), creating the
# directory when it does not exist yet, so training and inference use the same file.
model_path = Path('models') / 'catboost_forest_cover_type.model'
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save_model(str(model_path))

print(classification_report(y_test, y_pred))

0:	learn: 1.6764241	total: 220ms	remaining: 3m 39s
200:	learn: 0.4727552	total: 21.2s	remaining: 1m 24s
400:	learn: 0.3970818	total: 41.8s	remaining: 1m 2s
600:	learn: 0.3548123	total: 1m 3s	remaining: 41.9s
800:	learn: 0.3244698	total: 1m 24s	remaining: 21s
999:	learn: 0.3009443	total: 1m 46s	remaining: 0us
Accuracy: 88.01%
              precision    recall  f1-score   support

           1       0.88      0.85      0.87     42368
           2       0.88      0.91      0.90     56661
           3       0.88      0.91      0.89      7151
           4       0.85      0.83      0.84       549
           5       0.87      0.58      0.69      1899
           6       0.85      0.77      0.80      3473
           7       0.93      0.88      0.90      4102

    accuracy                           0.88    116203
   macro avg       0.88      0.82      0.84    116203
weighted avg       0.88      0.88      0.88    116203



In [None]:
# Export the training split and two stratified halves of the hold-out split as CSV files.

# Halve the hold-out set, keeping the class proportions in both parts.
X_test_1, X_test_2, y_test_1, y_test_2 = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=42,
    stratify=y_test,
)

# Attach the label column to each feature frame. assign() returns a new frame
# (labels are aligned on the row index), so the split outputs are never modified in place.
train_df = pd.DataFrame(X_train, columns=X.columns).assign(Cover_Type=y_train)
test_1_df = pd.DataFrame(X_test_1, columns=X.columns).assign(Cover_Type=y_test_1)
test_2_df = pd.DataFrame(X_test_2, columns=X.columns).assign(Cover_Type=y_test_2)

# index=False keeps the row index out of the files.
train_df.to_csv('data/train.csv', index=False)
test_1_df.to_csv('data/test_1.csv', index=False)
test_2_df.to_csv('data/test_2.csv', index=False)

In [29]:
# Rebuild the scaler and the test inputs from the CSV splits stored on disk.
sc = StandardScaler()

# 'Unnamed: 0' is present only when a CSV was saved together with its index;
# files written with index=False do not have it, so that drop must tolerate its absence.
# The 'Cover_Type' drop stays strict: a missing target column should still fail loudly.
X_train = (
    pd.read_csv('data/train.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop(columns=['Cover_Type'])
)
df_test = pd.read_csv('data/test_1.csv').drop(columns=['Unnamed: 0'], errors='ignore')
X_test = df_test.drop(columns=['Cover_Type'])
y_test = df_test['Cover_Type']

# Fit on the training features only, then standardize the test features with those statistics.
sc.fit(X_train)
X_test_norm = sc.transform(X_test)

In [30]:
# Inspect the test features read from data/test_1.csv (target column already removed).
X_test

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2996,239,13,417,106,5314,196,251,191,994,...,0,0,0,0,0,0,0,0,0,0
1,2902,76,4,390,13,3974,225,232,143,3693,...,0,0,0,0,0,0,0,0,0,0
2,3005,170,4,513,56,2733,223,241,154,1501,...,0,1,0,0,0,0,0,0,0,0
3,3212,190,12,283,-4,3620,220,249,161,1652,...,0,1,0,0,0,0,0,0,0,0
4,2995,190,18,283,132,6340,217,251,160,4110,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58096,3262,270,19,201,39,3512,168,243,213,1806,...,0,0,0,0,0,0,0,0,0,0
58097,2784,232,27,242,158,713,166,252,212,1637,...,0,0,0,0,0,0,0,0,0,0
58098,2734,42,18,124,-4,1190,220,199,108,2253,...,0,0,0,0,0,0,0,0,0,0
58099,2345,130,22,295,30,391,250,221,88,1087,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Inspect the standardized test features returned by sc.transform.
X_test_norm

array([[ 0.13030754,  0.74273697, -0.14752728, ..., -0.16570694,
        -0.156514  , -0.12434232],
       [-0.20537704, -0.71288447, -1.34958427, ..., -0.16570694,
        -0.156514  , -0.12434232],
       [ 0.16244756,  0.12655366, -1.34958427, ..., -0.16570694,
        -0.156514  , -0.12434232],
       ...,
       [-0.80532395, -1.01651103,  0.52028216, ..., -0.16570694,
        -0.156514  , -0.12434232],
       [-2.19448675, -0.23065406,  1.05452971, ..., -0.16570694,
        -0.156514  , -0.12434232],
       [-0.15538146, -1.23083566, -0.14752728, ..., -0.16570694,
        -0.156514  , -0.12434232]])

In [34]:
# Load the stored model and score it on the first 64 standardized test rows.
model = CatBoostClassifier()
model.load_model('models/catboost_forest_cover_type.model')

preds = model.predict(X_test_norm[:64])

# accuracy_score is already imported in the notebook's import cell, so it is not re-imported here.
# Accuracy on the same 64 rows; shown as the cell's output.
accuracy_score(y_test[:64], preds)

0.921875

In [38]:
# Print the true labels of the first 64 test rows as a plain Python list.
print(y_test[:64].tolist())

[2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 7, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 5, 2, 1, 1, 2, 1, 2, 1, 4, 1, 3, 2, 2, 2, 1, 2, 2, 2, 6, 1, 2, 1]


In [44]:
# Print predictions and true labels for the first 64 rows as flat arrays, one per line.
print(preds.ravel(), np.asarray(y_test[:64]).ravel(), sep='\n')

[2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 1 2 1 1 2 7 2 1 1 2 1 1 1 2 2 1 2 2
 2 2 1 2 2 1 2 1 2 2 1 2 1 4 1 3 1 2 2 1 2 2 2 6 1 2 1]
[2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 2 7 2 1 1 2 1 1 1 2 2 1 1 2
 2 2 1 2 2 5 2 1 1 2 1 2 1 4 1 3 2 2 2 1 2 2 2 6 1 2 1]
