# MODEL EVALUATION AND VALIDATION

    * We use 3 different data samples
    * We load the data with Panda
    * We prepare the samples to Numpy Array in order to apply
      the classifiers in Scikit Learn (sklearn)
    * We train Models using some algorithms with scikit:
        * Logistic Regression
        * Neural Networks
        * Decision Tree
        * Support Vector Machines
    * We test different Evaluation Metrics


In [1]:
# Imports for the handling of data
import pandas as pd
import numpy as np

# import for the classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Import for testing models 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Sample 1

### Handle Data

In [2]:
# Load data with Panda and analize it
# Use dataset '2_class_data.csv'
sample_1 = pd.read_csv("2_class_data.csv")
sample_1.head()

Unnamed: 0,x1,x2,y
0,0.78051,-0.063669,0
1,0.28774,0.29139,0
2,0.40714,0.17878,0
3,0.2923,0.4217,0
4,0.50922,0.35256,0


In [3]:
# Separate the features and the labels into arrays X_1 and y_1
X_1 = np.array(sample_1[['x1', 'x2']])
y_1 = np.array(sample_1['y'])

print("Showing X:\n", X_1[0:5])
print("\nShowing y_1\n:", y_1[0:5])

Showing X:
 [[ 0.78051  -0.063669]
 [ 0.28774   0.29139 ]
 [ 0.40714   0.17878 ]
 [ 0.2923    0.4217  ]
 [ 0.50922   0.35256 ]]

Showing y_1
: [0 0 0 0 0]


### Golden Rule: Thou shalt never use your testing data for training

In [4]:
# Split data for trining and testing
# Use a test size of 25% and a random state of 42 (standard)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.25, random_state=42)

# Models

### Logistic Regression Model

#### Training model

In [5]:
# Initiate classifier and feed the features and labels
model_1 = LogisticRegression()
model_1.fit(X_train_1, y_train_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Testing model

In [6]:
# Make predictions on the Test Data
y_pred_1 = model_1.predict(X_test_1)

In [7]:
# Calculate the accuracy on the Test Data
acc = accuracy_score(y_test_1, y_pred_1)

print("Accuracy for the Logistic Regression Model: {}%".format(acc*100))

Accuracy for the Logistic Regression Model: 84.0%


### Neural Networks Model

#### Training model

In [8]:
# Initiate classifier and feed the features and labels
model_1 = MLPClassifier()
model_1.fit(X_train_1, y_train_1)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### Testing model

In [9]:
# Make predictions on the Test Data
y_pred_1 = model_1.predict(X_test_1)

In [10]:
# Calculate the accuracy on the Test Data
acc = accuracy_score(y_test_1, y_pred_1)

print("Accuracy for the Neural Network Model: {}%".format(acc*100))

Accuracy for the Neural Network Model: 88.0%


### Decision Tree Model

#### Training model

In [11]:
# Initiate classifier and feed the features and labels
model_1 = DecisionTreeClassifier()
model_1.fit(X_train_1, y_train_1)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
# Make predictions on the Test Data
y_pred_1 = model_1.predict(X_test_1)

In [13]:
# Calculate the accuracy on the Test Data
acc = accuracy_score(y_test_1, y_pred_1)

print("Accuracy for the Decision Tree Model: {}%".format(acc*100))

Accuracy for the Decision Tree Model: 64.0%


### Support Vector Machines

#### Training model

In [14]:
# Initiate classifier and feed the features and labels
# Use parameters in the SVC classifier
# Try (kernel = 'rbf', gamma = 200) or (kernel = 'poly', gamma = 2)
model_1 = SVC(kernel = 'poly', gamma = 2)
model_1.fit(X_train_1, y_train_1)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=2, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
# Make predictions on the Test Data
y_pred_1 = model_1.predict(X_test_1)

In [16]:
# Calculate the accuracy on the Test Data
acc = accuracy_score(y_test_1, y_pred_1)

print("Accuracy for the Support Vector Machines Model: {}%".format(acc*100))

Accuracy for the Support Vector Machines Model: 92.0%


# Sample 2