### Loading Dataset

In [142]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
  
# fetch dataset 
covertype = fetch_ucirepo(id=31) 

In [143]:
# data (as pandas dataframes) 
X = covertype.data.features 
Y = covertype.data.targets 

### Creating Dataframe

In [144]:
df = pd.concat([X, Y], axis = 1)
df = df.sample(n = 50000, random_state = 42)

In [145]:
df['Cover_Type'].value_counts()

Cover_Type
2    24299
1    18401
3     3082
7     1737
6     1463
5      796
4      222
Name: count, dtype: int64

### Feature Scaling

In [146]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

def stan_scaler():
    scaler = StandardScaler()
    for feature in df:
        if feature == 'Cover_Type':
            continue
        else:
            if ((df[feature] < 0).any() or (df[feature] > 1).any()):
                df[feature] = scaler.fit_transform(df[[feature]])
                df[feature] = scaler.fit_transform(df[[feature]])

def mm_scaler():
    scaler = MinMaxScaler()
    for feature in df:
        if feature == 'Cover_Type':
            continue
        else:
            if ((df[feature] < 0).any() or (df[feature] > 1).any()):
                df[feature] = scaler.fit_transform(df[[feature]])
                df[feature] = scaler.fit_transform(df[[feature]])

def abs_scaler():
    scaler = MaxAbsScaler()
    for feature in df:
        if feature == 'Cover_Type':
            continue
        else:
            if ((df[feature] < 0).any() or (df[feature] > 1).any()):
                df[feature] = scaler.fit_transform(df[[feature]])
                df[feature] = scaler.fit_transform(df[[feature]])

In [147]:
mm_scaler()

### Splitting Positive-Negative Labels

In [148]:
df['Cover_Type'] = df['Cover_Type'].isin([1,2]).astype(int)

### Finding correlated features with correlation matrix

In [149]:
corr_mtx = df.corr()
print(f'Most Positively Correlated Features: {corr_mtx['Cover_Type'][corr_mtx['Cover_Type'] > 0.10].index.tolist()}')
print(f'Most Negatively Correlated Features: {corr_mtx['Cover_Type'][corr_mtx['Cover_Type'] < -0.10].index.tolist()}')

Most Positively Correlated Features: ['Elevation', 'Horizontal_Distance_To_Roadways', 'Hillshade_Noon', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Soil_Type23', 'Soil_Type29', 'Soil_Type32', 'Cover_Type']
Most Negatively Correlated Features: ['Slope', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type10', 'Soil_Type17', 'Soil_Type38', 'Soil_Type39', 'Wilderness_Area4']


In [150]:
X = df.drop('Cover_Type', axis = 1)
Y = df['Cover_Type']

In [151]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.25, random_state = 42)

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score


models = [SVC(kernel = 'linear', random_state = 42), LogisticRegression(random_state = 42), RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 42)]

for model in models:
    model.fit(X_train, Y_train)

    Y_train_hat = model.predict(X_train)
    Y_val_hat = model.predict(X_val)
    Y_test_hat = model.predict(X_test)

    train_acc = accuracy_score(Y_train, Y_train_hat)
    val_acc = accuracy_score(Y_val, Y_val_hat)
    test_acc = accuracy_score(Y_test, Y_test_hat)

    print(f'''Model: {model}\n Training Accuracy: {train_acc}\n Validation Accuracy: {val_acc}\n 
          Testing Accuracy: {test_acc}\n Average: {(train_acc + val_acc + test_acc) / 3}''')
    
    precision = precision_score(Y_test, Y_test_hat)
    recall = recall_score(Y_test, Y_test_hat)
    f1Score = 2 * precision * recall / (precision + recall)

    print(f' \n FOR TESTING ONLY: \n Precision: {precision}\n Recall: {recall}\n f1Score: {f1Score}\n')

Model: SVC(kernel='linear', random_state=42)
 Training Accuracy: 0.9307333333333333
 Validation Accuracy: 0.9352
 Testing Accuracy: 0.9316
 Average: 0.9325111111111112
 
 FOR TESTING ONLY: 
 Precision: 0.9466484268125855
 Recall: 0.9746478873239437
 f1Score: 0.9604441360166551

Model: LogisticRegression(random_state=42)
 Training Accuracy: 0.9313
 Validation Accuracy: 0.9371
 Testing Accuracy: 0.9334
 Average: 0.9339333333333334
 
 FOR TESTING ONLY: 
 Precision: 0.9480830670926518
 Recall: 0.9752347417840376
 f1Score: 0.9614672529507058

Model: RandomForestClassifier(max_depth=5, random_state=42)
 Training Accuracy: 0.9189666666666667
 Validation Accuracy: 0.9222
 Testing Accuracy: 0.9183
 Average: 0.9198222222222222
 
 FOR TESTING ONLY: 
 Precision: 0.9171450232860392
 Recall: 0.9938967136150235
 f1Score: 0.9539796090801554

