In [66]:
import pandas as pd #pandas for reading data and working with dataframes
import numpy as np #numpy for numerical computations and type casting
from sklearn.utils import shuffle #shuffle for picking random test data
from sklearn.preprocessing import MinMaxScaler #MinMaxScaler for scaling all data between zero and one
from sklearn.ensemble import RandomForestClassifier #RandomForestClassifier for our random forest model
from sklearn.metrics import accuracy_score #accuracy_score for calculating final accurancy of our models
from sklearn.pipeline import Pipeline #Pinpeline for easier cross validation
from sklearn.impute import SimpleImputer #SimpleImputer for pipeline
from sklearn.model_selection import cross_validate #cross_val_score for easier cross validation
from sklearn.tree import DecisionTreeClassifier #DecisionTreeClassifier for our decision tree model
from sklearn.metrics import confusion_matrix #confusion_matrix to Calculate confusion matrix

In [67]:
#Reading dataset from covtype.csv into df
df = pd.read_csv("../Dataset/covtype.csv")
print(df.head)

<bound method NDFrame.head of         Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0            2596      51      3                               258   
1            2590      56      2                               212   
2            2804     139      9                               268   
3            2785     155     18                               242   
4            2595      45      2                               153   
...           ...     ...    ...                               ...   
581007       2396     153     20                                85   
581008       2391     152     19                                67   
581009       2386     159     17                                60   
581010       2384     170     15                                60   
581011       2383     165     13                                60   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                                    0                

In [68]:
#Shuffeling data set so that picking last 30% of objects been equal to
#randomly picking 30% of dataset for test data
df = shuffle(df)
print(df.head)

<bound method NDFrame.head of         Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
513506       3232      75     23                               210   
460481       3035     259     20                               108   
15880        2601      27      5                               234   
488723       3210     205      4                               362   
139428       3175     220      4                               624   
...           ...     ...    ...                               ...   
386052       3312     210     11                               108   
130832       3062     335     14                               361   
244264       3012     359     12                               150   
308606       2840       8     10                               742   
516144       2838      51     16                               180   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
513506                              50                

In [69]:
#Dividing dataset into X and y
columns = df.columns
X = df.filter(items = columns[0:-1])
y = df.filter(items = columns[-1:])
print(X.head)
print(y.head)

<bound method NDFrame.head of         Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
513506       3232      75     23                               210   
460481       3035     259     20                               108   
15880        2601      27      5                               234   
488723       3210     205      4                               362   
139428       3175     220      4                               624   
...           ...     ...    ...                               ...   
386052       3312     210     11                               108   
130832       3062     335     14                               361   
244264       3012     359     12                               150   
308606       2840       8     10                               742   
516144       2838      51     16                               180   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
513506                              50                

In [81]:
#Dividing X into X_train and X_test 
#and dividing y into y_train and y_test
train_rows_count = int(df.shape[0] * 0.7)
test_rows_count = df.shape[0] - train_rows_count
X_train = X.head(train_rows_count)
X_test = X.tail(test_rows_count)
y_train = y.head(train_rows_count)
y_test = y.tail(test_rows_count)
print("X_train:", X_train.head)
print("y_train:", y_train.head)
print("X_test:", X_test.head)
print("y_test:", y_test.head)

X_train: <bound method NDFrame.head of         Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
513506   0.686843  0.208333  0.348485                          0.150322   
460481   0.588294  0.719444  0.303030                          0.077309   
15880    0.371186  0.075000  0.075758                          0.167502   
488723   0.675838  0.569444  0.060606                          0.259127   
139428   0.658329  0.611111  0.060606                          0.446671   
...           ...       ...       ...                               ...   
373707   0.650825  0.069444  0.257576                          0.768074   
99394    0.398699  0.016667  0.136364                          0.030064   
158144   0.509255  0.430556  0.106061                          0.068003   
570625   0.374687  0.000000  0.348485                          0.274875   
198830   0.531766  0.163889  0.181818                          0.042949   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_

In [71]:
#Cheking data for preprocessing
#First checking number of missing values
print("Number of missing values in total:", df.isnull().sum().sum())

Number of missing values in total: 0


In [72]:
#Second watching how our data varies for scaling
for column in columns[:-1]:
    print(column, "Max:", X[column].max(), "Min:", X[column].min())

Elevation Max: 3858 Min: 1859
Aspect Max: 360 Min: 0
Slope Max: 66 Min: 0
Horizontal_Distance_To_Hydrology Max: 1397 Min: 0
Vertical_Distance_To_Hydrology Max: 601 Min: -173
Horizontal_Distance_To_Roadways Max: 7117 Min: 0
Hillshade_9am Max: 254 Min: 0
Hillshade_Noon Max: 254 Min: 0
Hillshade_3pm Max: 254 Min: 0
Horizontal_Distance_To_Fire_Points Max: 7173 Min: 0
Wilderness_Area1 Max: 1 Min: 0
Wilderness_Area2 Max: 1 Min: 0
Wilderness_Area3 Max: 1 Min: 0
Wilderness_Area4 Max: 1 Min: 0
Soil_Type1 Max: 1 Min: 0
Soil_Type2 Max: 1 Min: 0
Soil_Type3 Max: 1 Min: 0
Soil_Type4 Max: 1 Min: 0
Soil_Type5 Max: 1 Min: 0
Soil_Type6 Max: 1 Min: 0
Soil_Type7 Max: 1 Min: 0
Soil_Type8 Max: 1 Min: 0
Soil_Type9 Max: 1 Min: 0
Soil_Type10 Max: 1 Min: 0
Soil_Type11 Max: 1 Min: 0
Soil_Type12 Max: 1 Min: 0
Soil_Type13 Max: 1 Min: 0
Soil_Type14 Max: 1 Min: 0
Soil_Type15 Max: 1 Min: 0
Soil_Type16 Max: 1 Min: 0
Soil_Type17 Max: 1 Min: 0
Soil_Type18 Max: 1 Min: 0
Soil_Type19 Max: 1 Min: 0
Soil_Type20 Max: 1 Min: 0

In [73]:
#Because some of them are binary but some of them vary
#much more, we will normalize all of them
#we use MinMax scaling and becuase we can't use it independently in X_train and X_test
#we do it on X and divide it again
scaler = MinMaxScaler() 
X.iloc[:,:] = scaler.fit_transform(X.iloc[:,:].to_numpy())
X_train = X.head(train_rows_count)
X_test = X.tail(test_rows_count)

#now we check how our data varies now
for column in columns[:-1]:
    print(column, "Max:", X[column].max(), "Min:", X[column].min())

  X.iloc[:,:] = scaler.fit_transform(X.iloc[:,:].to_numpy())


Elevation Max: 0.9999999999999999 Min: 0.0
Aspect Max: 1.0 Min: 0.0
Slope Max: 1.0 Min: 0.0
Horizontal_Distance_To_Hydrology Max: 1.0 Min: 0.0
Vertical_Distance_To_Hydrology Max: 1.0 Min: 0.0
Horizontal_Distance_To_Roadways Max: 1.0 Min: 0.0
Hillshade_9am Max: 1.0 Min: 0.0
Hillshade_Noon Max: 1.0 Min: 0.0
Hillshade_3pm Max: 1.0 Min: 0.0
Horizontal_Distance_To_Fire_Points Max: 1.0 Min: 0.0
Wilderness_Area1 Max: 1.0 Min: 0.0
Wilderness_Area2 Max: 1.0 Min: 0.0
Wilderness_Area3 Max: 1.0 Min: 0.0
Wilderness_Area4 Max: 1.0 Min: 0.0
Soil_Type1 Max: 1.0 Min: 0.0
Soil_Type2 Max: 1.0 Min: 0.0
Soil_Type3 Max: 1.0 Min: 0.0
Soil_Type4 Max: 1.0 Min: 0.0
Soil_Type5 Max: 1.0 Min: 0.0
Soil_Type6 Max: 1.0 Min: 0.0
Soil_Type7 Max: 1.0 Min: 0.0
Soil_Type8 Max: 1.0 Min: 0.0
Soil_Type9 Max: 1.0 Min: 0.0
Soil_Type10 Max: 1.0 Min: 0.0
Soil_Type11 Max: 1.0 Min: 0.0
Soil_Type12 Max: 1.0 Min: 0.0
Soil_Type13 Max: 1.0 Min: 0.0
Soil_Type14 Max: 1.0 Min: 0.0
Soil_Type15 Max: 1.0 Min: 0.0
Soil_Type16 Max: 1.0 Min: 0

In [74]:
#now it's time to try diffrent parameters
#to see which parameters are best
#for our random forest model

criterions = ["gini", "entropy", "log_loss"]
n_estimators = [75, 100, 125]
max_depths = [None, 10, 20]

best_criterion = "gini"
best_n_estimators = 75
best_max_depth = None
best_score = 0

for current_criterion in criterions:
    for current_n_estimator in n_estimators:
        for current_max_depth in max_depths:
            my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                                          ('model', RandomForestClassifier(criterion = current_criterion,
                                                                           n_estimators = current_n_estimator,
                                                                           max_depth = current_max_depth,
                                                                           n_jobs = -1))
                                         ])
            
            scores = cross_validate(my_pipeline, X_train, np.array(y_train["Cover_Type"]),
                                          cv=5,
                                          scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                                          return_train_score=True, return_estimator=True)
            accurancy_scores = scores['test_accuracy']
            precision_scores = scores['test_precision_macro']
            recall_scores = scores['test_recall_macro']
            f1_scores = scores['test_f1_macro']

            print("criterion =", current_criterion,
                  "n_estimators =", current_n_estimator,
                  "max_depth =", current_max_depth)
            print("accurancy_scores:\n", accurancy_scores)
            print("precision_scores:\n", precision_scores)
            print("recall_scores:\n", recall_scores)
            print("f1_scores:\n", f1_scores)
            
            score = np.mean(accurancy_scores)
            print("Average accurancy score:", score)
            if(score > best_score):
                best_criterion = current_criterion
                best_n_estimators = current_n_estimator
                best_max_depth = current_max_depth
                best_score = score

print("best_score =", best_score)
print("best_criterion =", best_criterion)
print("best_n_estimators =", best_n_estimators)
print("best_max_depth =", best_max_depth)

criterion = gini n_estimators = 75 max_depth = None
accurancy_scores:
 [0.94664503 0.94584593 0.94820634 0.94661979 0.94589444]
precision_scores:
 [0.93699264 0.94066574 0.93822729 0.93729405 0.93879804]
recall_scores:
 [0.89651113 0.88287419 0.89306632 0.88687983 0.88763417]
f1_scores:
 [0.91485166 0.90841917 0.91348016 0.90970223 0.91091535]
Average accurancy score: 0.9466423065924866


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


criterion = gini n_estimators = 75 max_depth = 10
accurancy_scores:
 [0.75515724 0.75129699 0.75078065 0.74956049 0.75236351]
precision_scores:
 [0.70223768 0.85070209 0.84485893 0.70475201 0.7084852 ]
recall_scores:
 [0.49766983 0.49800156 0.48989739 0.5030011  0.50562986]
f1_scores:
 [0.53912413 0.541811   0.53465536 0.5490386  0.55025458]
Average accurancy score: 0.7518317767549334
criterion = gini n_estimators = 75 max_depth = 20
accurancy_scores:
 [0.88526223 0.88677436 0.88445084 0.89037509 0.88746143]
precision_scores:
 [0.91441429 0.91908317 0.9152439  0.91716641 0.91791538]
recall_scores:
 [0.79779441 0.79221046 0.78754159 0.79713195 0.7878342 ]
f1_scores:
 [0.83764879 0.83697784 0.8312388  0.83932189 0.83254329]
Average accurancy score: 0.8868647877763405
criterion = gini n_estimators = 100 max_depth = None
accurancy_scores:
 [0.94691549 0.94759165 0.94922672 0.94845158 0.94654602]
precision_scores:
 [0.93524889 0.94108425 0.9407588  0.93796095 0.94091837]
recall_scores:
 [0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


criterion = gini n_estimators = 100 max_depth = 10
accurancy_scores:
 [0.75084212 0.75117406 0.75076836 0.75040877 0.75056859]
precision_scores:
 [0.70784143 0.70774998 0.70388617 0.84950977 0.70681397]
recall_scores:
 [0.50333179 0.50005706 0.49605787 0.50370445 0.49283056]
f1_scores:
 [0.54872033 0.54433834 0.53608166 0.54771986 0.53634009]
Average accurancy score: 0.7507523812479823
criterion = gini n_estimators = 100 max_depth = 20
accurancy_scores:
 [0.8890733  0.8863072  0.89315483 0.88995709 0.89202247]
precision_scores:
 [0.91763167 0.91715522 0.92021515 0.9179272  0.91925036]
recall_scores:
 [0.80285942 0.78738686 0.79866049 0.79415177 0.79614605]
f1_scores:
 [0.84326216 0.83098367 0.84117544 0.83851066 0.84007761]
Average accurancy score: 0.8901029775013338
criterion = gini n_estimators = 125 max_depth = None
accurancy_scores:
 [0.94734577 0.94721054 0.94809569 0.94767706 0.94681649]
precision_scores:
 [0.93675551 0.9410978  0.93915043 0.9376355  0.9393927 ]
recall_scores:
 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


criterion = gini n_estimators = 125 max_depth = 10
accurancy_scores:
 [0.75214526 0.75303041 0.75332547 0.74438475 0.75027354]
precision_scores:
 [0.71273729 0.70949781 0.7041634  0.84481278 0.71189134]
recall_scores:
 [0.50273631 0.50603019 0.50198104 0.49905705 0.48821654]
f1_scores:
 [0.54581645 0.55225009 0.54373827 0.54177514 0.5309689 ]
Average accurancy score: 0.750631886745569
criterion = gini n_estimators = 125 max_depth = 20
accurancy_scores:
 [0.88808979 0.88636867 0.88581545 0.88555587 0.88522393]
precision_scores:
 [0.91480132 0.91874551 0.91622495 0.91603237 0.91572522]
recall_scores:
 [0.79740763 0.78758248 0.79034051 0.78779484 0.78730599]
f1_scores:
 [0.83806144 0.83226606 0.8327554  0.83205038 0.83275007]
Average accurancy score: 0.8862107417565428
criterion = entropy n_estimators = 75 max_depth = None
accurancy_scores:
 [0.94949718 0.94900543 0.95116914 0.94952115 0.94807047]
precision_scores:
 [0.93768134 0.94318684 0.941765   0.93687897 0.93874635]
recall_scores:
 

In [77]:
#now that we've got our best parameters
#it's time to test it on test data and see
#how it works
rf = RandomForestClassifier(criterion = best_criterion,
                            n_estimators = best_n_estimators,
                            max_depth = best_max_depth,
                            n_jobs = -1)
rf.fit(X_train, np.array(y_train["Cover_Type"]))

y_pred = rf.predict(X_train)
confusion_mat = confusion_matrix(np.array(y_train["Cover_Type"]), y_pred)
print(confusion_mat)

#time for testing
y_pred = rf.predict(X_test)
confusion_mat = confusion_matrix(np.array(y_test["Cover_Type"]), y_pred)
print(confusion_mat)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[148306      0      0      0      0      0      0]
 [     0 198143      0      0      0      0      0]
 [     0      0  25142      0      0      0      0]
 [     0      0      0   1929      0      0      0]
 [     0      0      0      0   6702      0      0]
 [     0      0      0      0      0  12204      0]
 [     0      0      0      0      0      0  14282]]
[[59973  3372     2     0    22    15   150]
 [ 1908 82857   159     4    98   110    22]
 [    0   145 10215    50     7   195     0]
 [    0     0    90   704     0    24     0]
 [   32   564    39     0  2142    14     0]
 [    2   161   384    22     3  4591     0]
 [  309    36     0     0     0     0  5883]]
Accuracy: 0.9544531393427574


In [78]:
#now let's try it again for decision tree model
#finding best parameters
criterions = ["gini", "entropy", "log_loss"]
splitters = ["best", "random"]
max_depths = [None, 10, 20]

best_criterion = "gini"
best_splitter = "best"
best_max_depth = None
best_score = 0

for current_criterion in criterions:
    for current_splitter in splitters:
        for current_max_depth in max_depths:
            my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                                          ('model', DecisionTreeClassifier(criterion = current_criterion,
                                                                           splitter = current_splitter,
                                                                           max_depth = current_max_depth))
                                         ])
            
            scores = cross_validate(my_pipeline, X_train, np.array(y_train["Cover_Type"]),
                                          cv=5,
                                          scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
                                          return_train_score=True, return_estimator=True)
            accurancy_scores = scores['test_accuracy']
            precision_scores = scores['test_precision_macro']
            recall_scores = scores['test_recall_macro']
            f1_scores = scores['test_f1_macro']

            print("criterion =", current_criterion,
                  "splitter =", current_splitter,
                  "max_depth =", current_max_depth)
            print("accurancy_scores:\n", accurancy_scores)
            print("precision_scores:\n", precision_scores)
            print("recall_scores:\n", recall_scores)
            print("f1_scores:\n", f1_scores)
            
            score = np.mean(accurancy_scores)
            print("Average accurancy score:", score)
            if(score > best_score):
                best_criterion = current_criterion
                best_splitter = current_splitter
                best_max_depth = current_max_depth
                best_score = score

print("best_score =", best_score)
print("best_criterion =", best_criterion)
print("best_splitter =", best_splitter)
print("best_max_depth =", best_max_depth)

criterion = gini splitter = best max_depth = None
accurancy_scores:
 [0.92701188 0.92765115 0.92840107 0.92835102 0.92584306]
precision_scores:
 [0.88159831 0.89043323 0.88435838 0.88313691 0.88510367]
recall_scores:
 [0.88465455 0.88411772 0.88788656 0.8835823  0.87918956]
f1_scores:
 [0.8831004  0.88721694 0.8860978  0.88335333 0.88210588]
Average accurancy score: 0.9274516343194706
criterion = gini splitter = best max_depth = 10
accurancy_scores:
 [0.77594601 0.77701556 0.77790072 0.77328776 0.77374264]
precision_scores:
 [0.7620614  0.76976005 0.76968759 0.77646648 0.76955493]
recall_scores:
 [0.63103741 0.62843375 0.6174166  0.61766415 0.61179218]
f1_scores:
 [0.67318107 0.67075719 0.65291197 0.65867083 0.66201077]
Average accurancy score: 0.7755785376076391
criterion = gini splitter = best max_depth = 20
accurancy_scores:
 [0.90259644 0.90042045 0.90119495 0.90033317 0.89765309]
precision_scores:
 [0.87592326 0.8754746  0.86754204 0.8736955  0.86753639]
recall_scores:
 [0.8360032

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


criterion = gini splitter = random max_depth = 10
accurancy_scores:
 [0.75611615 0.72303361 0.74372403 0.72609139 0.74118833]
precision_scores:
 [0.73525244 0.61341249 0.73950106 0.6638698  0.73307791]
recall_scores:
 [0.57499021 0.56394931 0.60306545 0.56734648 0.55694313]
f1_scores:
 [0.59970392 0.58354256 0.62111644 0.58802393 0.59001481]
Average accurancy score: 0.7380307031539024
criterion = gini splitter = random max_depth = 20
accurancy_scores:
 [0.86125249 0.8551056  0.83382508 0.86498814 0.85834942]
precision_scores:
 [0.84993857 0.85816176 0.83351848 0.84106907 0.84732885]
recall_scores:
 [0.775803   0.79766582 0.74309209 0.80118555 0.79133077]
f1_scores:
 [0.80353136 0.82253813 0.77540032 0.81812768 0.81667308]
Average accurancy score: 0.8547041462898729
criterion = entropy splitter = best max_depth = None
accurancy_scores:
 [0.93181874 0.93250719 0.93352757 0.93176873 0.93205149]
precision_scores:
 [0.89223851 0.902612   0.89265008 0.88653309 0.89906072]
recall_scores:
 [0.

In [79]:
#testing best model on test data
dt = DecisionTreeClassifier(criterion = best_criterion,
                            splitter = best_splitter,
                            max_depth = best_max_depth)
dt.fit(X_train, np.array(y_train["Cover_Type"]))

y_pred = dt.predict(X_train)
confusion_mat = confusion_matrix(np.array(y_train["Cover_Type"]), y_pred)
print(confusion_mat)

y_pred = dt.predict(X_test)
confusion_mat = confusion_matrix(np.array(y_test["Cover_Type"]), y_pred)
print(confusion_mat)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[148306      0      0      0      0      0      0]
 [     0 198143      0      0      0      0      0]
 [     0      0  25142      0      0      0      0]
 [     0      0      0   1929      0      0      0]
 [     0      0      0      0   6702      0      0]
 [     0      0      0      0      0  12204      0]
 [     0      0      0      0      0      0  14282]]
[[59681  3517     6     0    44    15   271]
 [ 3526 80891   224     2   302   167    46]
 [    3   171  9939    91    29   379     0]
 [    0     0    73   719     0    26     0]
 [   46   378    36     0  2310    21     0]
 [    9   183   393    31    13  4534     0]
 [  309    40     0     0     0     0  5879]]
Accuracy: 0.9406152469249128
