# Classificação:

Algoritmos: KNN, Decision Tree, Random Forest e Logistic Regression

Métricas de performance: Accuracy, Precision, Recall e F1-Score

### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection as ms

from matplotlib import pyplot as plt



### Reading the data

In [2]:
X_train = pd.read_csv('../data/X_training_classification.csv')
X_test = pd.read_csv('../data/X_test_classification.csv')
X_val = pd.read_csv('../data/X_validation_classification.csv')

y_train  = pd.read_csv('../data/y_training_classification.csv')
y_test = pd.read_csv('../data/y_test_classification.csv')
y_val = pd.read_csv('../data/y_validation_classification.csv')


## KNN 

### Train

In [3]:
for i in range (3, 12, 2):
    # model 
    knn_classifier = KNeighborsClassifier(n_neighbors=i)
    # fit 
    knn_classifier.fit(X_train, y_train)
    # predict 
    yhat_train = knn_classifier.predict(X_train)

    f1 = f1_score( y_train, yhat_train )
    accuracy = accuracy_score(y_train, yhat_train)
    precision = precision_score(y_train, yhat_train)
    recall = recall_score(y_train, yhat_train)

    print(" Neighbors = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )


  return self._fit(X, y)


 Neighbors = 3
Accuracy: 0.8321864441839619
Precision: 0.8120079059067492
Recall: 0.7974099529082347
F1 Score: 0.8046427252733139


  return self._fit(X, y)


 Neighbors = 5
Accuracy: 0.7815624353582018
Precision: 0.7558933613500558
Recall: 0.7325633193330788
F1 Score: 0.744045503021685


  return self._fit(X, y)


 Neighbors = 7
Accuracy: 0.7563124870716403
Precision: 0.731248949403261
Recall: 0.6920898561792033
F1 Score: 0.7111307276085855


  return self._fit(X, y)


 Neighbors = 9
Accuracy: 0.7399986209749707
Precision: 0.7138144470140116
Recall: 0.6678439607992872
F1 Score: 0.6900644397685429


  return self._fit(X, y)


 Neighbors = 11
Accuracy: 0.7295594015031373
Precision: 0.7046517266461155
Recall: 0.6473208603792796
F1 Score: 0.6747707258826846


### Validation

In [4]:
for i in range (3, 12, 2):
    # model 
    knn_classifier = KNeighborsClassifier(n_neighbors=i)
    # fit 
    knn_classifier.fit(X_train, y_train)
    # predict 
    yhat_val = knn_classifier.predict(X_val)

    f1 = f1_score( y_val, yhat_val )
    accuracy = accuracy_score(y_val, yhat_val)
    precision = precision_score(y_val, yhat_val)
    recall = recall_score(y_val, yhat_val)

    print(" Neighbors = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )


  return self._fit(X, y)


 Neighbors = 3
Accuracy: 0.6762765854757231
Precision: 0.6278511404561825
Recall: 0.6212784913505086
F1 Score: 0.6245475239765645


  return self._fit(X, y)


 Neighbors = 5
Accuracy: 0.6756652401943435
Precision: 0.631775410218524
Recall: 0.6031628183235578
F1 Score: 0.6171376481312671


  return self._fit(X, y)


 Neighbors = 7
Accuracy: 0.6669777019852634
Precision: 0.6227083169407507
Recall: 0.5875714603905264
F1 Score: 0.6046298418519367


  return self._fit(X, y)


 Neighbors = 9
Accuracy: 0.6664950609736477
Precision: 0.6242395132885047
Recall: 0.5789590912465662
F1 Score: 0.6007472747582913


  return self._fit(X, y)


 Neighbors = 11
Accuracy: 0.6644357926574214
Precision: 0.6234768480909829
Recall: 0.5698270101715049
F1 Score: 0.5954459055820629


### Test

In [5]:
best_k = 3 

knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
knn_classifier.fit(X_train, y_train)


yhat_test = knn_classifier.predict(X_test)

f1 = f1_score( y_test, yhat_test )
accuracy = accuracy_score(y_test, yhat_test)
precision = precision_score(y_test, yhat_test)
recall = recall_score(y_test, yhat_test)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  return self._fit(X, y)


Accuracy: 0.6722280152937087
Precision: 0.6304623753399818
Recall: 0.6118785745710514
F1 Score: 0.6210314802411252


In [6]:
# Concatenate train and validation data 
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)

best_k = 3 

knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
knn_classifier.fit(X, y)


yhat_test = knn_classifier.predict(X_test)

f1 = f1_score( y_test, yhat_test )
accuracy = accuracy_score(y_test, yhat_test)
precision = precision_score(y_test, yhat_test)
recall = recall_score(y_test, yhat_test)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  return self._fit(X, y)


Accuracy: 0.6884486154559147
Precision: 0.6480251346499102
Recall: 0.6351957765068191
F1 Score: 0.6415463230393246


# Decision Tree

### Train

In [7]:
acuracy_list = list()
precision_list = list()
recall_list = list() 
f1_list = list()

In [8]:
for i in range (2, 50):
    tree_classifier = DecisionTreeClassifier(max_depth=i)

    # fit
    tree_classifier.fit(X_train, y_train)
    # predict
    yhat_train = tree_classifier.predict(X_train)

    f1 = f1_score( y_train, yhat_train )
    accuracy = accuracy_score(y_train, yhat_train)
    precision = precision_score(y_train, yhat_train)
    recall = recall_score(y_train, yhat_train)

    print(" Max Depth = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )

    f1_list.append(f1)
    acuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)

 Max Depth = 2
Accuracy: 0.8570916362131973
Precision: 0.8297485989793683
Recall: 0.8432926053200968
F1 Score: 0.8364657797976929
 Max Depth = 3
Accuracy: 0.8841618975384403
Precision: 0.8395058088105207
Recall: 0.9059119256713758
F1 Score: 0.8714456245600073
 Max Depth = 4
Accuracy: 0.8934427359856582
Precision: 0.9233077335238435
Recall: 0.8224513172966781
F1 Score: 0.8699661747134948
 Max Depth = 5
Accuracy: 0.9067641177687376
Precision: 0.9077324870243645
Recall: 0.8736795214458445
F1 Score: 0.890380530829969
 Max Depth = 6
Accuracy: 0.9228159691098393
Precision: 0.9018637792090607
Recall: 0.9222667684866998
F1 Score: 0.9119511696320418
 Max Depth = 7
Accuracy: 0.9344825208577535
Precision: 0.9526589065734551
Recall: 0.8932162402952781
F1 Score: 0.9219804581656951
 Max Depth = 8
Accuracy: 0.9427015100324071
Precision: 0.9503318912849642
Recall: 0.9156484663357516
F1 Score: 0.9326678442366592
 Max Depth = 9
Accuracy: 0.9494863131765842
Precision: 0.9615653156897297
Recall: 0.9202303

### Validation

In [9]:
for i in range (2, 50):
    tree_classifier = DecisionTreeClassifier(max_depth=i)

    # fit
    tree_classifier.fit(X_train, y_train)
    # predict
    yhat_val = tree_classifier.predict(X_val)

    f1 = f1_score( y_val, yhat_val )
    accuracy = accuracy_score(y_val, yhat_val)
    precision = precision_score(y_val, yhat_val)
    recall = recall_score(y_val, yhat_val)

    print(" Max Depth = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )

    f1_list.append(f1)
    acuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)

 Max Depth = 2
Accuracy: 0.8564947392129734
Precision: 0.8300241775954282
Recall: 0.8411166382062514
F1 Score: 0.8355335939228556
 Max Depth = 3
Accuracy: 0.885453199909907
Precision: 0.8403984884919272
Recall: 0.9081594773182864
F1 Score: 0.8729660291178989
 Max Depth = 4
Accuracy: 0.8934650407027254
Precision: 0.9241690329046267
Recall: 0.8215903185091692
F1 Score: 0.8698659749243407
 Max Depth = 5
Accuracy: 0.9066894044209917
Precision: 0.9065312716362797
Recall: 0.8748979137278194
F1 Score: 0.8904337312981714
 Max Depth = 6
Accuracy: 0.9225843817368641
Precision: 0.9001084990958409
Recall: 0.9238993243744895
F1 Score: 0.9118487579687844
 Max Depth = 7
Accuracy: 0.933685125004022
Precision: 0.9526265672115537
Recall: 0.8913059618382954
F1 Score: 0.9209466456982854
 Max Depth = 8
Accuracy: 0.9399272820875832
Precision: 0.9477462179685088
Recall: 0.911648971712822
F1 Score: 0.9293472090823084
 Max Depth = 9
Accuracy: 0.9443354033270054
Precision: 0.9559543230016313
Recall: 0.913653574

### Test

In [10]:
best_max_depth = 49 

tree_classifier = DecisionTreeClassifier(max_depth=best_max_depth)

tree_classifier.fit(X, y)

yhat_test = tree_classifier.predict(X_test)

f1 = f1_score( y_test, yhat_test )
accuracy = accuracy_score(y_test, yhat_test)
precision = precision_score(y_test, yhat_test)
recall = recall_score(y_test, yhat_test)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

Accuracy: 0.9463947785115668
Precision: 0.9368596199316928
Recall: 0.941311042674879
F1 Score: 0.9390800561797753




# Random Forest 


## Train

In [None]:
for i in range (3, 50):
    rd_classifier = RandomForestClassifier(max_depth=i)

    rd_classifier.fit(X_train, y_train)

    yhat_train = rd_classifier.predict(X_train)

    f1 = f1_score( y_train, yhat_train )
    accuracy = accuracy_score(y_train, yhat_train)
    precision = precision_score(y_train, yhat_train)
    recall = recall_score(y_train, yhat_train)

    print(" Max Depth = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )

## Validation

In [None]:
for i in range (10, 50):
    rd_classifier = RandomForestClassifier(max_depth=i)

    rd_classifier.fit(X_train, y_train)

    yhat_val = rd_classifier.predict(X_val)

    f1 = f1_score( y_val, yhat_val )
    accuracy = accuracy_score(y_val, yhat_val)
    precision = precision_score(y_val, yhat_val)
    recall = recall_score(y_val, yhat_val)

    print(" Max Depth = {}".format(i))
    print( "Accuracy: {}".format( accuracy ) )
    print( "Precision: {}".format( precision ) )
    print( "Recall: {}".format( recall ) )
    print( "F1 Score: {}".format( f1 ) )

## Test

In [40]:
best_max_depth = 23

rd_classifier = RandomForestClassifier(max_depth=best_max_depth)

rd_classifier.fit(X, y)

yhat_test = rd_classifier.predict(X_test)

f1 = f1_score( y_test, yhat_test )
accuracy = accuracy_score(y_test, yhat_test)
precision = precision_score(y_test, yhat_test)
recall = recall_score(y_test, yhat_test)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.965009848221527
Precision: 0.9738153483736522
Recall: 0.9457105147382314
F1 Score: 0.9595571823944291


# Logistic Regression


In [7]:
model = LogisticRegression()

model.fit(X_train, y_train)

yhat = model.predict(X_train)

f1 = f1_score( y_train, yhat )
accuracy = accuracy_score(y_train, yhat)
precision = precision_score(y_train, yhat)
recall = recall_score(y_train, yhat)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  y = column_or_1d(y, warn=True)


Accuracy: 0.8726056677928704
Precision: 0.8609538681761988
Recall: 0.8420516736667939
F1 Score: 0.8513978702184474


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
model = LogisticRegression()

model.fit(X_train, y_train)

yhat = model.predict(X_val)

f1 = f1_score( y_val, yhat )
accuracy = accuracy_score(y_val, yhat)
precision = precision_score(y_val, yhat)
recall = recall_score(y_val, yhat)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  y = column_or_1d(y, warn=True)


Accuracy: 0.869976511470768
Precision: 0.8597923981071592
Recall: 0.8363649862647561
F1 Score: 0.8479169018855143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
model = LogisticRegression()

model.fit(X_train, y_train)

yhat = model.predict(X_test)

f1 = f1_score( y_test, yhat )
accuracy = accuracy_score(y_test, yhat)
precision = precision_score(y_test, yhat)
recall = recall_score(y_test, yhat)

print( "Accuracy: {}".format( accuracy ) )
print( "Precision: {}".format( precision ) )
print( "Recall: {}".format( recall ) )
print( "F1 Score: {}".format( f1 ) )

  y = column_or_1d(y, warn=True)


Accuracy: 0.8687676205924381
Precision: 0.8598139282810947
Recall: 0.8375714914210295
F1 Score: 0.8485469780709574


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
