### MACHINE LEARNING MODEL SELECTION

* Author: Ali Baran Tasdemir

In [2]:
%matplotlib inline
import numpy as np
from sklearn import preprocessing, neighbors
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read data files
df = pd.read_csv('features_final.csv').drop(columns='Unnamed: 0')
model_test = pd.read_csv('ENZYMES_g300_final.csv').drop(columns='Unnamed: 0')

In [4]:
# Prepare data to train. 
# X: Independent values
# y: Target values (Categories)
def prepareData(df, normalize=False):
    X = np.array(df.drop(['Name', 'Categories'], 1))
    if normalize:
        return (X - X.mean()) / (X.max() - X.min()), np.array(df['Categories'])
    return X, np.array(df['Categories'])

In [5]:
# Split data with a ratio to perform validation
def trainTestSplit(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    return X_train, X_test, y_train, y_test

In [6]:
# Train a model and return the validation accuracy with trained model
def trainAndScore(clf, X_train, y_train, X_test, y_test, out=True):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    if out:
        print(score)
    return clf, score

In [55]:
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm

In [47]:
# Prepare the real graph for prediction with our trained models.
# The graph used here is ENZYMES-g300 graph.
# http://networkrepository.com/ENZYMES-g300.php
test_measure = model_test.drop(columns=['Name'])
test_graph = np.array(test_measure.values)
test_graph_normalized = normalize(test_graph)

# 1. K Neighbors Classifier (without Normalizing)

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [7]:
X, y = prepareData(df)

In [8]:
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [9]:
clf_kn = neighbors.KNeighborsClassifier()

In [10]:
clf_kn, _ = trainAndScore(clf_kn, X_train, y_train, X_test, y_test)

0.9125


In [59]:
# Make prediction with real graph
prediction = clf_kn.predict(test_graph)
print(prediction)

['PA']


### With Normalized Features

In [14]:
X, y = prepareData(df, normalize=True)

In [15]:
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [16]:
clf_kn_norm = neighbors.KNeighborsClassifier()

In [17]:
clf_kn_norm, _ = trainAndScore(clf_kn_norm, X_train, y_train, X_test, y_test)

0.95


In [60]:
# Make prediction with real graph
prediction = clf_kn_norm.predict(test_graph_normalized)
print(prediction)

['PA']


# 2. SVM (Without Normalizing)

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [18]:
from sklearn import svm

In [19]:
X, y = prepareData(df)

In [20]:
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [21]:
clf_svm = svm.SVC()

In [22]:
clf_svm, _ = trainAndScore(clf_svm, X_train, y_train, X_test, y_test)

0.2125




In [61]:
# Make prediction with real graph
prediction = clf_svm.predict(test_graph)
print(prediction)

['CNFG']


### With Normalized Features

In [23]:
X, y = prepareData(df, normalize=True)

In [24]:
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [25]:
clf_svm_norm = svm.SVC()

In [26]:
clf_svm_norm, _ = trainAndScore(clf_svm_norm, X_train, y_train, X_test, y_test)

0.1625




In [62]:
# Make prediction with real graph
prediction = clf_svm_norm.predict(test_graph_normalized)
print(prediction)

['CNFG']


# 3.1. SVM FROM LINEAR MODEL (WITHOUT NORMALIZE - L1 LOSS)

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

In [27]:
from sklearn import linear_model

In [28]:
X, y = prepareData(df)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [29]:
clf_svm_l1 = linear_model.SGDClassifier(penalty='l1')

In [30]:
clf_svm_l1, _ = trainAndScore(clf_svm_l1, X_train, y_train, X_test, y_test)

0.75


In [63]:
# Make prediction with real graph
prediction = clf_svm_l1.predict(test_graph)
print(prediction)

['PA']


### With Normalized Features

In [31]:
X, y = prepareData(df, normalize=True)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [32]:
clf_norm_svm_l1 = linear_model.SGDClassifier(penalty='l1')

In [33]:
clf_norm_svm_l1, _ = trainAndScore(clf_norm_svm_l1, X_train, y_train, X_test, y_test)

0.75


In [64]:
# Make prediction with real graph
prediction = clf_norm_svm_l1.predict(test_graph_normalized)
print(prediction)

['PA']


# 3.2. SVM FROM LINEAR MODEL (WITHOUT NORMALIZE - L2 LOSS)

In [34]:
X, y = prepareData(df)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [35]:
clf_svm_l2 = linear_model.SGDClassifier(penalty='l2')

In [36]:
clf_svm_l2, _ = trainAndScore(clf_svm_l2, X_train, y_train, X_test, y_test)

0.7375


In [65]:
# Make prediction with real graph
prediction = clf_svm_l2.predict(test_graph)
print(prediction)

['PA']


### With Normalized Features

In [37]:
X, y = prepareData(df, normalize=True)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [38]:
clf_norm_svm_l2 = linear_model.SGDClassifier(penalty='l2')

In [39]:
clf_norm_svm_l2, _ = trainAndScore(clf_norm_svm_l2, X_train, y_train, X_test, y_test)

0.8125


In [66]:
# Make prediction with real graph
prediction = clf_norm_svm_l2.predict(test_graph_normalized)
print(prediction)

['PA']


# 4. RANDOM FOREST CLASSIFIER

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
X, y = prepareData(df)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [42]:
clf_rf = RandomForestClassifier(n_estimators=100)

In [43]:
clf_rf, _ = trainAndScore(clf_rf, X_train, y_train, X_test, y_test)

0.95


In [67]:
# Make prediction with real graph
prediction = clf_rf.predict(test_graph)
print(prediction)

['PA']


### With Normalized Features

In [44]:
X, y = prepareData(df, normalize=True)
X_train, X_test, y_train, y_test = trainTestSplit(X, y, test_size=0.2)

In [45]:
clf_norm_rf = RandomForestClassifier(n_estimators=100)

In [46]:
clf_norm_rf, _ = trainAndScore(clf_norm_rf, X_train, y_train, X_test, y_test)

0.95


In [68]:
# Make prediction with real graph
prediction = clf_norm_rf.predict(test_graph_normalized)
print(prediction)

['GNP']
