# Machine Learning Algorithms Testing

In [43]:
import os
import pandas as pd
from os import path


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


#LR libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

#KNN libraries
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

#RF libraries
# source - https://www.datacamp.com/tutorial/random-forests-classifier-python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

#SVM libraries
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [44]:
# Directory the notebook is executed from; the dataset is read from this folder.
# Note: this name shadows the `path` module imported via `from os import path`,
# which is why the join below goes through `os.path` explicitly.
path = os.getcwd()

# Columns used by the rest of the notebook: every entry except the last is a
# model feature, and the last entry ("Age Classification") is the target label.
text_features_header = ["Word Count", "Sentence Count", "Ave. Word Length", "Ave. Sentence Length", "Total Syllables", "Noun-Token Ratio",
                        "Verb-Token Ratio", "Type-Token Ratio", "Root TTR", "Corrected TTR", "Bilogarithmic TTR", "Lexical Density",
                        "FW-Token Ratio", "Age Classification"]

# Build the file path with os.path.join rather than concatenating a hard-coded
# "/" so the separator is right on every platform.
text_features = pd.read_csv(os.path.join(path, "extracted_TRAD_LEX.csv"))

# Fail at load time, with a readable message, if the CSV lacks an expected
# column; otherwise the problem only shows up later during feature selection.
missing_columns = [col for col in text_features_header if col not in text_features.columns]
if missing_columns:
    raise KeyError(f"extracted_TRAD_LEX.csv is missing expected columns: {missing_columns}")

# NOTE(review): the displayed frame also contains an "Unnamed: 0" column, which
# looks like a saved row index. It is kept as-is here; confirm whether it
# should be dropped at the source.
text_features

Unnamed: 0,Word Count,Sentence Count,Ave. Word Length,Ave. Sentence Length,Total Syllables,Noun-Token Ratio,Verb-Token Ratio,Type-Token Ratio,Root TTR,Corrected TTR,Bilogarithmic TTR,Lexical Density,FW-Token Ratio,Age Classification
0,973,85,5.21638,13.823529,2096,0.212914,0.113438,0.009599,0.324938,0.229766,0.340415,0.403141,0.017452,10
1,937,95,5.048168,12.105263,2018,0.171946,0.103167,0.009955,0.330911,0.23399,0.342185,0.41267,0.021719,10
2,900,72,4.793478,15.097222,1844,0.136999,0.105312,0.010252,0.335809,0.237453,0.343626,0.367195,0.033551,9
3,1279,97,4.959199,16.319588,2596,0.192617,0.1,0.007383,0.28497,0.201504,0.328185,0.405369,0.015436,12
4,876,75,4.849498,14.053333,1824,0.195192,0.103846,0.010577,0.341096,0.241191,0.345171,0.388462,0.001923,9
5,1177,137,5.007531,10.883212,2491,0.178177,0.104972,0.007597,0.289074,0.204406,0.329475,0.381906,0.016575,12
6,1601,120,5.328449,15.816667,3610,0.175055,0.143326,0.006018,0.257279,0.181924,0.319252,0.407549,0.004376,9
7,1365,132,4.945166,12.393939,2896,0.191083,0.134395,0.007006,0.277615,0.196303,0.325853,0.407643,0.012102,12
8,975,92,4.995984,12.347826,2089,0.145889,0.139699,0.009726,0.327086,0.231284,0.341053,0.416446,0.002653,10
9,552,69,4.984211,10.231884,1177,0.129851,0.129851,0.016418,0.424967,0.300497,0.368494,0.374627,0.007463,9


In [45]:
# Feature matrix: every listed column except the final one (the target).
feature_columns = text_features_header[:-1]
X = text_features.loc[:, feature_columns].to_numpy()

# Target vector: the age labels as a flat 1-D array.
y = text_features['Age Classification'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

[[9.75000000e+02 9.20000000e+01 4.99598394e+00 1.23478261e+01
  2.08900000e+03 1.45888594e-01 1.39699381e-01 9.72590628e-03
  3.27085568e-01 2.31284423e-01 3.41053034e-01 4.16445623e-01
  2.65251989e-03]
 [1.17700000e+03 1.37000000e+02 5.00753138e+00 1.08832117e+01
  2.49100000e+03 1.78176796e-01 1.04972376e-01 7.59668508e-03
  2.89073582e-01 2.04405890e-01 3.29474514e-01 3.81906077e-01
  1.65745856e-02]]


### Logistic Regression

In [46]:
# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression (iteration cap raised to 1000) on the scaled data.
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predict the held-out rows and score them against the true labels.
y_pred = logreg.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Predictions: ", y_pred)
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.5
Confusion Matrix:
[[0 0 0]
 [1 0 0]
 [0 0 1]]


### K-Nearest Neighbors

In [47]:
# k-NN classifies by distance between rows, so large-magnitude columns such as
# the word and syllable counts would dominate the small ratio features unless
# everything is standardized first (the Logistic Regression cell does the same).
# The scaler is fitted on the training split only so no test data leaks in.
knn_scaler = StandardScaler()
X_train_knn = knn_scaler.fit_transform(X_train)
X_test_knn = knn_scaler.transform(X_test)

# Classify each test row by majority vote among its 3 nearest training rows.
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train_knn, y_train)

y_pred = knn.predict(X_test_knn)
print("Predictions: ", y_pred)

# Compare predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 1.0
Confusion Matrix:
[[1 0]
 [0 1]]


### Random Forest

In [48]:
# Random forests are built with randomness, so an unseeded model gives
# different predictions on every run. Seed it with the same value used for the
# train/test split so the whole notebook is reproducible.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, y_train)

# Tree-based models are insensitive to feature scale, so the raw features are
# used here without standardization.
y_pred = rf.predict(X_test)
print("Predictions: ", y_pred)

# Compare predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 1.0
Confusion Matrix:
[[1 0]
 [0 1]]


### Support Vector Machine

In [49]:
# SVMs are not scale invariant: with raw inputs the large count columns would
# dominate the decision boundary over the small ratio features. Standardize
# first (as the Logistic Regression cell does), fitting the scaler on the
# training split only so no test data leaks in.
svm_scaler = StandardScaler()
X_train_svm = svm_scaler.fit_transform(X_train)
X_test_svm = svm_scaler.transform(X_test)

# Support vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear')

clf.fit(X_train_svm, y_train)

y_pred = clf.predict(X_test_svm)
print("Predictions: ", y_pred)

# Compare predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'Confusion Matrix:\n{conf_matrix}')

[10 12]
Accuracy: 1.0
Confusion Matrix:
[[1 0]
 [0 1]]
