# MODEL EVALUATION AND VALIDATION

    * We use 3 different data samples
    * We load the data with pandas
    * We convert the samples to NumPy arrays in order to apply
      the classifiers in scikit-learn (sklearn)
    * We train models using several scikit-learn algorithms:
        * Logistic Regression
        * Neural Networks
        * Decision Tree
        * Support Vector Machines
    * We test different Evaluation Metrics


In [1]:
# Imports for the handling of data
import pandas as pd
import numpy as np

# import for the classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Import for testing models 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Imports for Regression Metrics Evaluation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Sample 1

### Handle Data

In [2]:
# Load the data with pandas and inspect it
# Dataset: '2_class_data.csv'
sample_1 = pd.read_csv("2_class_data.csv")
# Preview the first rows of the DataFrame
sample_1.head()

Unnamed: 0,x1,x2,y
0,0.78051,-0.063669,0
1,0.28774,0.29139,0
2,0.40714,0.17878,0
3,0.2923,0.4217,0
4,0.50922,0.35256,0


In [3]:
# Separate the features and the labels into NumPy arrays X_1 and y_1
X_1 = np.array(sample_1[['x1', 'x2']])
y_1 = np.array(sample_1['y'])

# Preview the first five entries of each array.
# The colon belongs to the label line, matching the "Showing X:" line.
print("Showing X:\n", X_1[0:5])
print("\nShowing y_1:\n", y_1[0:5])

Showing X:
 [[ 0.78051  -0.063669]
 [ 0.28774   0.29139 ]
 [ 0.40714   0.17878 ]
 [ 0.2923    0.4217  ]
 [ 0.50922   0.35256 ]]

Showing y_1
: [0 0 0 0 0]


### Golden Rule: Thou shalt never use your testing data for training

In [4]:
# Split the data for training and testing
# Use a test size of 25% and a fixed random state of 42 so the split is repeatable
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.25, random_state=42)

# Models

## Logistic Regression Model

#### Training model

In [5]:
# Instantiate the classifier (default parameters) and fit it on the
# training features and labels
model_1 = LogisticRegression()
model_1.fit(X_train_1, y_train_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Testing model

In [6]:
# Predict labels for the held-out test features
y_pred_1 = model_1.predict(X_test_1)

#### Evaluating Model

In [7]:
# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_1, y_pred_1)
print("** Accuracy for the Logistic Regression Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_1, y_pred_1)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_1, y_pred_1)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_1, y_pred_1)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 84.0%

** The Mean Absolute Error is: 0.16

** The Mean Squared Error is: 0.16

** The R2 Score is: 0.35064935064935077


## Neural Networks Model

#### Training model

In [8]:
# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# model_1 is rebound here, replacing the Logistic Regression model.
# NOTE: no random_state is passed, so the fitted network (and the scores
# computed from it) may vary from run to run.
model_1 = MLPClassifier()
model_1.fit(X_train_1, y_train_1)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### Testing model

In [9]:
# Predict labels for the held-out test features with the neural network
y_pred_1 = model_1.predict(X_test_1)

#### Evaluating Model

In [10]:
# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_1, y_pred_1)
print("** Accuracy for the Neural Network Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_1, y_pred_1)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_1, y_pred_1)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_1, y_pred_1)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 88.0%

** The Mean Absolute Error is: 0.12

** The Mean Squared Error is: 0.12

** The R2 Score is: 0.5129870129870131


## Decision Tree Model

#### Training model

In [11]:
# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# model_1 is rebound here, replacing the previously trained model.
# NOTE: no random_state is passed, so the fitted tree may differ between runs.
model_1 = DecisionTreeClassifier()
model_1.fit(X_train_1, y_train_1)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### Testing Model

In [12]:
# Predict labels for the held-out test features with the decision tree
y_pred_1 = model_1.predict(X_test_1)

#### Evaluating Model

In [13]:
# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_1, y_pred_1)
print("** Accuracy for the Decision Tree Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_1, y_pred_1)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_1, y_pred_1)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_1, y_pred_1)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 64.0%

** The Mean Absolute Error is: 0.36

** The Mean Squared Error is: 0.36

** The R2 Score is: -0.4610389610389607


## Support Vector Machines

#### Training model

In [14]:
# Build a Support Vector classifier and fit it on the training split.
# The kernel and gamma are set explicitly; alternatives worth trying:
#   SVC(kernel='rbf', gamma=200)  or  SVC(kernel='poly', gamma=2)
model_1 = SVC(kernel='poly', gamma=2)
model_1.fit(X_train_1, y_train_1)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=2, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Testing Model

In [15]:
# Predict labels for the held-out test features with the SVM
y_pred_1 = model_1.predict(X_test_1)

#### Evaluating Model

In [16]:
# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_1, y_pred_1)
print("** Accuracy for the Support Vector Machine Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_1, y_pred_1)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_1, y_pred_1)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_1, y_pred_1)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 92.0%

** The Mean Absolute Error is: 0.08

** The Mean Squared Error is: 0.08

** The R2 Score is: 0.6753246753246753


# Sample 2

In [17]:
# Load the data with pandas and inspect it
# Dataset: 'data1.csv'
sample_2 = pd.read_csv("data1.csv")
# Preview the first rows of the DataFrame
sample_2.head()

Unnamed: 0,x1,x2,y
0,0.336494,-0.985951,0.0
1,-0.011043,-0.105529,1.0
2,0.23816,-0.617417,1.0
3,-0.366783,-0.713819,1.0
4,1.221923,-1.039399,0.0


In [18]:
# Separate the features and the labels into NumPy arrays X_2 and y_2
X_2 = np.array(sample_2[['x1', 'x2']])
y_2 = np.array(sample_2['y'])

# Preview the first five entries of each array.
# The label names the array actually shown (y_2), and the colon belongs to
# the label line, matching the "Showing X:" line.
print("Showing X:\n", X_2[0:5])
print("\nShowing y_2:\n", y_2[0:5])

Showing X:
 [[ 0.33649358 -0.98595099]
 [-0.01104253 -0.10552856]
 [ 0.23815951 -0.61741666]
 [-0.36678288 -0.71381872]
 [ 1.22192307 -1.03939899]]

Showing y_1
: [0. 1. 1. 1. 0.]


In [19]:
# Split the data for training and testing
# Use a test size of 30% and a fixed random state of 42 so the split is repeatable
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.3, random_state=42)

### Logistic Regression Model

In [20]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels
model_2 = LogisticRegression()
model_2.fit(X_train_2, y_train_2)

# TESTING

# Predict labels for the held-out test features
y_pred_2 = model_2.predict(X_test_2)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_2, y_pred_2)
print("** Accuracy for the Logistic Regression Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_2, y_pred_2)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_2, y_pred_2)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_2, y_pred_2)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 56.666666666666664%

** The Mean Absolute Error is: 0.43333333333333335

** The Mean Squared Error is: 0.43333333333333335

** The R2 Score is: -0.7410714285714284


### Neural Network Model

In [21]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# NOTE: no random_state is passed, so the fitted network (and the scores
# below) may vary from run to run.
model_2 = MLPClassifier()
model_2.fit(X_train_2, y_train_2)

# TESTING

# Predict labels for the held-out test features
y_pred_2 = model_2.predict(X_test_2)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_2, y_pred_2)
print("** Accuracy for the Neural Network Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_2, y_pred_2)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_2, y_pred_2)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_2, y_pred_2)
print("\n** The R2 Score is:", r2)

** Accuracy for the Neural Network Model: 93.33333333333333%

** The Mean Absolute Error is: 0.06666666666666667

** The Mean Squared Error is: 0.06666666666666667

** The R2 Score is: 0.7321428571428572




### Decision Tree Model

In [22]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# NOTE: no random_state is passed, so the fitted tree (and the scores
# below) may differ between runs.
model_2 = DecisionTreeClassifier()
model_2.fit(X_train_2, y_train_2)

# TESTING

# Predict labels for the held-out test features
y_pred_2 = model_2.predict(X_test_2)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_2, y_pred_2)
print("** Accuracy for the Decision Tree Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_2, y_pred_2)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_2, y_pred_2)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_2, y_pred_2)
print("\n** The R2 Score is:", r2)

** Accuracy for the Decision Tree Model: 80.0%

** The Mean Absolute Error is: 0.2

** The Mean Squared Error is: 0.2

** The R2 Score is: 0.1964285714285715


### Support Vector Machine

In [23]:
# TRAINING

# Instantiate the classifier and fit it on the training features and labels.
# The default SVC parameters are used here; alternatives worth trying:
#   SVC(kernel='rbf', gamma=200)  or  SVC(kernel='poly', gamma=2)
model_2 = SVC()
model_2.fit(X_train_2, y_train_2)

# TESTING

# Predict labels for the held-out test features
y_pred_2 = model_2.predict(X_test_2)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_2, y_pred_2)
print("** Accuracy for the Support Vector Machine Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_2, y_pred_2)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_2, y_pred_2)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_2, y_pred_2)
print("\n** The R2 Score is:", r2)

** Accuracy for the Support Vector Machine Model: 96.66666666666667%

** The Mean Absolute Error is: 0.03333333333333333

** The Mean Squared Error is: 0.03333333333333333

** The R2 Score is: 0.8660714285714286


# Sample 3

In [24]:
# Load the data with pandas and inspect it
# Dataset: 'data2.csv'
sample_3 = pd.read_csv("data2.csv")
# Preview the first rows of the DataFrame
sample_3.head()

Unnamed: 0,x1,x2,y
0,0.24539,0.81725,0
1,0.21774,0.76462,0
2,0.20161,0.69737,0
3,0.20161,0.58041,0
4,0.2477,0.49561,0


In [25]:
# Separate the features and the labels into NumPy arrays X_3 and y_3
X_3 = np.array(sample_3[['x1', 'x2']])
y_3 = np.array(sample_3['y'])

# Preview the first five entries of each array.
# The label names the array actually shown (y_3), and the colon belongs to
# the label line, matching the "Showing X:" line.
print("Showing X:\n", X_3[0:5])
print("\nShowing y_3:\n", y_3[0:5])

Showing X:
 [[0.24539 0.81725]
 [0.21774 0.76462]
 [0.20161 0.69737]
 [0.20161 0.58041]
 [0.2477  0.49561]]

Showing y_1
: [0 0 0 0 0]


In [26]:
# Split the data for training and testing
# Use a test size of 25% and a fixed random state of 42 so the split is repeatable
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, test_size=0.25, random_state=42)

### Logistic Regression Model

In [27]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels
model_3 = LogisticRegression()
model_3.fit(X_train_3, y_train_3)

# TESTING

# Predict labels for the held-out test features
y_pred_3 = model_3.predict(X_test_3)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_3, y_pred_3)
print("** Accuracy for the Logistic Regression Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_3, y_pred_3)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_3, y_pred_3)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_3, y_pred_3)
print("\n** The R2 Score is:", r2)

** Accuracy for the Logistic Regression Model: 58.333333333333336%

** The Mean Absolute Error is: 0.4166666666666667

** The Mean Squared Error is: 0.4166666666666667

** The R2 Score is: -0.7142857142857142


### Neural Network Model

In [28]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# NOTE: no random_state is passed, so the fitted network (and the scores
# below) may vary from run to run.
model_3 = MLPClassifier()
model_3.fit(X_train_3, y_train_3)

# TESTING

# Predict labels for the held-out test features
y_pred_3 = model_3.predict(X_test_3)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_3, y_pred_3)
print("** Accuracy for the Neural Network Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_3, y_pred_3)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_3, y_pred_3)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_3, y_pred_3)
print("\n** The R2 Score is:", r2)

** Accuracy for the Neural Network Model: 58.333333333333336%

** The Mean Absolute Error is: 0.4166666666666667

** The Mean Squared Error is: 0.4166666666666667

** The R2 Score is: -0.7142857142857142




### Decision Tree Model

In [29]:
# TRAINING

# Instantiate the classifier (default parameters) and fit it on the
# training features and labels.
# NOTE: no random_state is passed, so the fitted tree (and the scores
# below) may differ between runs.
model_3 = DecisionTreeClassifier()
model_3.fit(X_train_3, y_train_3)

# TESTING

# Predict labels for the held-out test features
y_pred_3 = model_3.predict(X_test_3)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_3, y_pred_3)
print("** Accuracy for the Decision Tree Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_3, y_pred_3)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_3, y_pred_3)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_3, y_pred_3)
print("\n** The R2 Score is:", r2)

** Accuracy for the Decision Tree Model: 95.83333333333334%

** The Mean Absolute Error is: 0.041666666666666664

** The Mean Squared Error is: 0.041666666666666664

** The R2 Score is: 0.8285714285714286


### Support Vector Machines Model

In [30]:
# TRAINING

# Instantiate the classifier and fit it on the training features and labels.
# The default SVC parameters are used here; alternatives worth trying:
#   SVC(kernel='rbf', gamma=200)  or  SVC(kernel='poly', gamma=2)
model_3 = SVC()
model_3.fit(X_train_3, y_train_3)

# TESTING

# Predict labels for the held-out test features
y_pred_3 = model_3.predict(X_test_3)

# EVALUATING

# Calculate the accuracy on the Test Data
# (fraction of test labels that were predicted exactly)
acc = accuracy_score(y_test_3, y_pred_3)
print("** Accuracy for the Support Vector Machine Model: {}%".format(acc*100))

# Calculate some Regression Metrics to further evaluate the Model
# NOTE(review): if the labels are only 0/1, the absolute and squared errors
# coincide and both equal the misclassification rate (1 - accuracy) --
# confirm the label values in the dataset.

# Mean Absolute Error
mae = mean_absolute_error(y_test_3, y_pred_3)
print("\n** The Mean Absolute Error is:", mae)

# Mean Squared Error
mse = mean_squared_error(y_test_3, y_pred_3)
print("\n** The Mean Squared Error is:", mse)

# R2 Score
# The closer to one, the better the model
r2 = r2_score(y_test_3, y_pred_3)
print("\n** The R2 Score is:", r2)

** Accuracy for the Support Vector Machine Model: 58.333333333333336%

** The Mean Absolute Error is: 0.4166666666666667

** The Mean Squared Error is: 0.4166666666666667

** The R2 Score is: -0.7142857142857142
