### Libraries

In [None]:
# Import necessary libraries and packages (dependencies)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
# Preprocessing helpers: feature standardisation and estimator pipelines
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Machine learning models for Classification:
# To implement kNN Classifier model, use scikit-learn and import KNeighborsClassifier from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
# To implement Extra Trees Classifier model, use scikit-learn and import ExtraTreesClassifier from sklearn.ensemble
from sklearn.ensemble import ExtraTreesClassifier
# To implement Gaussian Bayes Classifier model, use scikit-learn and import GaussianNB from sklearn.naive_bayes
from sklearn.naive_bayes import GaussianNB
# To implement three SVM Kernel's, use scikit-learn and import SVC from sklearn.svm
from sklearn.svm import SVC
# Validation results of the developed machine learning models
#(validation measurements, confusion matrices, decision limits in image format)
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.model_selection import cross_val_score

### Exploratory Data Analysis

In [None]:
# Class balance: number of rows per 'detection' class
class_dist = barreiro_ano.groupby('detection').size()
# Build the plotting frame from the per-class counts computed above; selecting a
# 'count' column from the raw frame does not give the class sizes.
class_label = class_dist.to_frame(name='count')
plt.figure(figsize=(8, 6))
sns.barplot(x=class_label.index, y='count', data=class_label)

In [None]:
# Share of each class, expressed as a percentage of all rows
total = class_dist.sum()
for label, number in class_dist.items():
    percent = (number / total) * 100
    print('Detection', label)
    print('%.2f' % percent, '%')

In [None]:
# Select the continuous feature(s) as a DataFrame (double brackets): single
# brackets return a Series, which has no `.columns` attribute, so the loop
# below would raise AttributeError.
cont_data = barreiro_ano[['value']]

# One figure per continuous column, showing its distribution
for i, col in enumerate(cont_data.columns):
    plt.figure(i)
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) when upgrading seaborn.
    sns.distplot(cont_data[col])

In [None]:
# The above plots more or less confirm the skewness that I saw earlier.
# Let's dig into bivariate and multivariate analysis.
# Let's check the distribution with respect to our target.
# First, I want to check the shape of the continuous features with respect to the target class.
# Hence I'll use the continuous data (cont_data) and draw a boxplot against the target.
# A violin plot would also work here; it is visually appealing.

In [None]:
# Cast the target class to a categorical dtype before grouping the boxplots by it
barreiro_ano['detection'] = barreiro_ano['detection'].astype('category')

# One boxplot per continuous feature, split by target class
for fig_num, feature in enumerate(cont_data.columns):
    plt.figure(fig_num, figsize=(8, 4))
    sns.boxplot(
        data=barreiro_ano,
        x=barreiro_ano['detection'],
        y=feature,
        palette="coolwarm",
    )

In [None]:
# Annotated correlation heatmap of the continuous features
corr_matrix = cont_data.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(corr_matrix, annot=True, cmap='magma', linewidths=1, linecolor='white')

In [None]:
# Scatter plot for every pair of continuous features (PairGrid.map returns the grid itself)
g = sns.PairGrid(cont_data).map(plt.scatter)
g

In [None]:
# This gives us the relation between each pair of features and its shape.
# Various inferences can be drawn from it.
# The PairGrid plot is very useful, and even more so when combined with KDE clusters.
# But for considerably heavy data, it is time consuming.

### Data Modelling

In [None]:
# X = input (independent) variables: every column from 'date' through 'value'
# y = target (dependent) variable: the 'detection' class
# The frame used throughout the exploratory analysis is `barreiro_ano`; the
# modelling step reads from the same frame instead of the leftover name `covtype`.
X = barreiro_ano.loc[:, 'date':'value']
y = barreiro_ano['detection']

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Optional alternative (not used in this notebook): encode the target classes as
# integers with scikit-learn's LabelEncoder.
# The original line fused two statements together (a SyntaxError) and referenced
# names that are not defined in this notebook, so it is kept only as a corrected,
# commented-out snippet. Leaving it disabled also avoids overwriting `y` after
# the train/test split has already been made.
#
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y = le.fit_transform(barreiro_ano['detection'])

### kNN

In [None]:
%%time
# Setup arrays to store training and test accuracies
neighbors = np.arange(1,7)
train_accuracy =np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

for i,k in enumerate(neighbors):
    # Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # Fit the model
    knn.fit(X_train, y_train)
    
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    
    # Compute accuracy on the test set
    test_accuracy[i] = knn.score(X_test, y_test)

In [None]:
# Let's visualize the change in accuracies with respect to train and test data at different neighbors

In [None]:
# Plot train and test accuracy as a function of the number of neighbours
plt.figure(figsize=(10, 6))
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.title('k-NN Varying number of neighbors')
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
# plt.show()

In [None]:
# Neighbor value = 5 yields the best result. Let's go with that for now

In [None]:
# Final kNN classifier with the chosen k = 5
# (scikit-learn's default metric, Minkowski with p=2, is the Euclidean distance)
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train the final kNN model on the training split
knn.fit(X_train, y_train)

In [None]:
# For classifiers, `score` returns the mean accuracy on the given data
accuracy_knn = knn.score(X_test, y_test)
print('KNN Accuracy: ', accuracy_knn)

In [None]:
# 96.61%
# KNN works well here: it does a good job of differentiating the detection classes.

In [None]:
# These results can be improved through Cross-Validation

In [None]:
# Predicted class for every sample of the test split
y_pred_knn = knn.predict(X_test)

# Show the predictions
y_pred_knn

In [None]:
# Per-class precision / recall / F1, followed by the raw confusion matrix, for kNN
print('KNN (Nearest Neighbors) confusion matrix:')
print(classification_report(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))

In [None]:
# These metrics are calculated from the True Positive, True Negative,
# False Positive and False Negative counts

In [None]:
# precision - fraction of predicted positives that are actually positive: TP / (TP + FP)
# recall - fraction of actual positives that were correctly identified: TP / (TP + FN)
# f1 score - harmonic mean of the two: 2*(Recall*Precision)/(Recall+Precision)

### Extra Trees (ensemble of randomized decision trees)

In [None]:
# Set up an ExtraTreesClassifier. The seed is fixed because the trees are built
# with randomised splits; without it the reported accuracy changes on every run.
etc = ExtraTreesClassifier(random_state=101)
# Fit the model
etc.fit(X_train, y_train)

# Accuracy on the held-out test data, expressed as a percentage
accuracy_etc = etc.score(X_test, y_test)*100
# Equivalent: accuracy_score(y_test, etc.predict(X_test)) * 100

print('ETC Acurracy: ',accuracy_etc)

In [None]:
# the model correctly classified 92.64% of the test samples

In [None]:
# Predicted class for every sample of the test split
y_pred_etc = etc.predict(X_test)

# Show the predictions
y_pred_etc

In [None]:
# Per-class precision / recall / F1, followed by the raw confusion matrix, for Extra Trees
print('DT (ExtraDecisionTrees) confusion matrix:')

print(classification_report(y_test, y_pred_etc))
print(confusion_matrix(y_test, y_pred_etc))

In [None]:
# True labels of three test samples (slice 400:403), used as a spot check
y_test[400:403]

In [None]:
# Predict the same three test samples with the Extra Trees model
predictions = etc.predict(X_test[400:403])

# Show the predicted labels for comparison with the true ones
predictions

In [None]:
# the prediction was correct

### Naive Bayes - Gaussian Naive Bayes (GB)

In [None]:
# Gaussian Naive Bayes classifier trained on the training split
gb = GaussianNB()
gb.fit(X_train, y_train)

In [None]:
# Predict the test split and report the accuracy as a percentage
y_pred_gb = gb.predict(X_test)
accuracy_gb = metrics.accuracy_score(y_test, y_pred_gb) * 100

print('GB Acurracy: ', accuracy_gb)

In [None]:
# Re-split with a 10% test share and a different seed, then refit and rescore GaussianNB.
# Note: this overwrites the X_train / X_test / y_train / y_test used by the earlier models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=200
)
y_pred_gb = gb.fit(X_train, y_train).predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb) * 100
accuracy_gb

In [None]:
# with 10% for test data, the above statement is verified, where the main advantage of this model is requiring a
# small number of training data samples to perform classification efficiently

In [None]:
# Per-class precision / recall / F1, followed by the raw confusion matrix, for GaussianNB
print('NB (GaussianNB) confusion matrix:')
print(classification_report(y_test, y_pred_gb))
print(confusion_matrix(y_test, y_pred_gb))

### SVM - Polynomial, Sigmoid and Radial Basis Function Kernel

In [None]:
# NOTE(review): the original cell was tagged 'INC' (meaning unclear - possibly "incomplete").
# Fresh 70/30 split for the SVM experiments. The seed is fixed (same value as the
# first split) so the SVM results below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
# Feature scaling: fit the scaler on the training split only,
# then apply the same transformation to both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

#### Polynomial Kernel

In [None]:
# Polynomial kernel: `degree` is the degree of the polynomial kernel function
# (it is unrelated to math.degrees / angle conversion).
# Standardisation is chained in front of the SVC so the classifier is actually
# trained on scaled features: the scaler is fitted on the training data here and
# re-applied automatically whenever `svclassifier.predict` is called.
svclassifier = make_pipeline(StandardScaler(), SVC(kernel='poly', degree=8))
svclassifier.fit(X_train, y_train)

In [None]:
# Predictions of the polynomial-kernel SVM on the test split
y_pred_poly = svclassifier.predict(X_test)

In [None]:
# Confusion matrix first, then per-class precision / recall / F1 (polynomial kernel)
print(metrics.confusion_matrix(y_test, y_pred_poly))
print(metrics.classification_report(y_test, y_pred_poly))

In [None]:
# Labels present in the test split that the model never predicted
set(y_test).difference(set(y_pred_poly))

In [None]:
# This means that there is no F-score to calculate for this label, and thus
# the F-score for this case is considered to be 0.0. Since you requested
# an average of the score, you must take into account that a score of 0 was
# included in the calculation, and this is why scikit-learn is showing you
# that warning.

#### Sigmoid Kernel

In [None]:
# Sigmoid kernel. Standardisation is chained in front of the SVC so the classifier
# is trained on scaled features; the scaler is fitted on the training data here and
# re-applied automatically whenever `svclassifier.predict` is called.
svclassifier = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
svclassifier.fit(X_train, y_train)

In [None]:
# Predictions of the sigmoid-kernel SVM on the test split
y_pred_sigmoid = svclassifier.predict(X_test)

# Confusion matrix first, then per-class precision / recall / F1
print(metrics.confusion_matrix(y_test, y_pred_sigmoid))
print(metrics.classification_report(y_test, y_pred_sigmoid))

#### RBF Kernel

In [None]:
# RBF kernel. Standardisation is chained in front of the SVC so the classifier
# is trained on scaled features; the scaler is fitted on the training data here and
# re-applied automatically whenever `svclassifier.predict` is called.
svclassifier = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svclassifier.fit(X_train, y_train)

In [None]:
# Predictions of the RBF-kernel SVM on the test split
y_pred_rbf = svclassifier.predict(X_test)

# Confusion matrix first, then per-class precision / recall / F1
print(metrics.confusion_matrix(y_test, y_pred_rbf))
print(metrics.classification_report(y_test, y_pred_rbf))