<h5 style="color:green"> All models' features must be 2D arrays, even when there is only one feature </h5>
<ul>
<li> No documentation
<li> Better OOP design and removal of redundancy
<li> To be implemented :
    <ul>
        <li> Gaussian Naive Bayes
        <li> Logistic Regression
        <li> Support Vector Machine
        <li> Gradient Boosting
        <li> DBSCAN and HDBSCAN
        <li> K-Means
        <li> PCA - UMAP
        <li> Reinforcement Learning
        <li> AlphaZero
        <li> Factorization Methods
        <li> Convolutional Neural Networks
        <li> RNN + LSTM
        <li> Transformers
    </ul>
<li> Needs Better Implementations :
    <ul>
        <li> Faster BallTree / KDTree Algorithms for KNN
    </ul>
</ul>

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

import decision_tree
import random_forest
import linear_regression
import naive_bayes
import nearest_neighbors

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pickle

import time
import matplotlib.pyplot as plt

In [2]:
def load_dataset(filename):
    """Read a pickled object from disk and return it.

    Parameters
    ----------
    filename : path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object the pickle stream deserialises to.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files
    that come from a trusted source.
    """
    with open(filename, "rb") as handle:
        payload = pickle.Unpickler(handle).load()
    return payload

In [3]:
def compareModels(models, X, y, metric):
    """Fit every model, report its fit time, and print its evaluation score.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. They are
        reported as "Model 0", "Model 1", ... in iteration order.
    X : sequence
        ``X[0]`` holds the training features, ``X[1]`` the evaluation features.
    y : sequence
        ``y[0]`` holds the training targets, ``y[1]`` the evaluation targets.
    metric : callable
        Scoring function invoked as ``metric(y_true, y_pred)``, which is the
        argument order scikit-learn metrics expect.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for i, model in enumerate(models):
        print(f"Model {i}")

        # Only the call to fit() is timed; prediction and scoring are excluded.
        start = time.perf_counter()
        model.fit(X[0], y[0])
        end = time.perf_counter()
        print(f"Time to fit : {end - start} s")

        predictions = model.predict(X[1])
        # Ground truth goes first: symmetric metrics are unaffected, but
        # asymmetric ones (e.g. precision, R^2) would be wrong if swapped.
        print(f"Performance : {metric(y[1], predictions)}")


In [4]:
# Load the pickled dataset and take its 'X' and 'y' entries.
df = load_dataset("citiesSmall.pkl")
X, y = df['X'], df['y']

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=19,
)
X_train.shape

(300, 2)

In [5]:
# Tree depths to benchmark: 1 through 23.
depth = list(range(1, 24))
# Kept from the original cell; nothing in this cell fills these lists in.
accuracy = [[], []]
execution_time = [[], []]

for i in depth:
    print("==========================================")
    # Model 0 is scikit-learn's tree, Model 1 is the local implementation.
    # random_state fixes scikit-learn's feature permutation at each split so
    # that re-running the benchmark rebuilds the same reference tree.
    models = [
        DecisionTreeClassifier(criterion='gini', max_depth=i, random_state=19),
        decision_tree.DecisionTreeClassifier(criterion='gini', max_depth=i),
    ]

    compareModels(models, (X_train, X_test), (y_train, y_test), accuracy_score)
    

Model 0
Time to fit : 0.00178829999640584 s
Performance : 0.67
Model 1
Time to fit : 0.038663299987092614 s
Performance : 0.67
Model 0
Time to fit : 0.0006539000023622066 s
Performance : 0.75
Model 1
Time to fit : 0.05067850000341423 s
Performance : 0.75
Model 0
Time to fit : 0.0006664000102318823 s
Performance : 0.8
Model 1
Time to fit : 0.06851959999767132 s
Performance : 0.79
Model 0
Time to fit : 0.0007370999956037849 s
Performance : 0.81
Model 1
Time to fit : 0.09217530000023544 s
Performance : 0.8
Model 0
Time to fit : 0.0007800000021234155 s
Performance : 0.84
Model 1
Time to fit : 0.09844999999040738 s
Performance : 0.85
Model 0
Time to fit : 0.0008074000070337206 s
Performance : 0.83
Model 1
Time to fit : 0.10471769998548552 s
Performance : 0.82
Model 0
Time to fit : 0.0007814000127837062 s
Performance : 0.85
Model 1
Time to fit : 0.11174260001280345 s
Performance : 0.85
Model 0
Time to fit : 0.0007993000035639852 s
Performance : 0.84
Model 1
Time to fit : 0.11728199999197386 

In [6]:
# Model 0 is scikit-learn's forest, Model 1 is the local implementation.
# random_state seeds scikit-learn's bootstrap sampling and feature selection
# so the reference score is the same on every run.
models = [
    RandomForestClassifier(max_depth=20, random_state=19),
    random_forest.RandomForestClassifier(max_depth=20),
]
compareModels(models, (X_train, X_test), (y_train, y_test), accuracy_score)

Model 0
Time to fit : 0.11607519999961369 s
Performance : 0.88
Model 1
Time to fit : 3.556402899994282 s
Performance : 0.51


In [9]:
# Synthetic regression problem: 10,000 samples, 10 features, seeded generator.
X, y = datasets.make_regression(
    n_samples=10000,
    n_features=10,
    noise=1,
    bias=19,
    random_state=4,
)

# 80/20 train/evaluation split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

# Model 0 is scikit-learn's regressor, Model 1 is the local implementation.
models = [
    LinearRegression(),
    linear_regression.LinearRegression(),
]
compareModels(models, (X_train, X_test), (y_train, y_test), mean_squared_error)

# NOTE(review): disabled plotting snippet, kept as written. It refers to
# `model`, which this cell does not define.
# fig = plt.figure(figsize=(10, 8))
# plt.scatter(X, y, marker='o')
# plt.plot(X, model.predict(X), marker = 'o')
# plt.show()

Model 0
Time to fit : 0.004353800002718344 s
Performance : 1.0001200963464363
Model 1
Time to fit : 0.6487676999822725 s
Performance : 5780.283023414392


In [5]:
# Synthetic binary classification problem with 5 informative features.
X, y = datasets.make_classification(
    n_samples=18000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_classes=2,
    random_state=19,
)

# Discretise each continuous column into 5 bins labelled 0..4, giving one
# categorical column per original feature.
X_categorical = pd.DataFrame({
    f'feature_{col}': pd.cut(X[:, col], bins=5, labels=list(range(5)))
    for col in range(X.shape[1])
})

# From here on X is the binned feature matrix, not the continuous one.
X = X_categorical.values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=19,
)

# Model 0 is scikit-learn's MultinomialNB, Model 1 is the local implementation.
models = [
    MultinomialNB(),
    naive_bayes.NaiveBayes(),
]
compareModels(models, (X_train, X_test), (y_train, y_test), accuracy_score)

Model 0
Time to fit : 0.005711899982998148 s
Performance : 0.7068888888888889
Model 1
Time to fit : 0.05556569999316707 s
Performance : 0.7586666666666667


In [6]:
# Synthetic 3-class problem: 1,000 samples, 3 informative + 2 redundant features.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=3,
    random_state=19,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=19,
)

# Sweep the neighbour count from 5 to 100 inclusive.
# NOTE(review): the saved output of this cell stops with a TypeError right
# after the local KNN reports its fit time - presumably raised while it
# predicts; confirm inside nearest_neighbors.
for k in range(5, 101):
    print(f"\n{k} neighbors ============================\n")
    # Model 0 is scikit-learn's KNN, Model 1 is the local implementation.
    models = [
        KNeighborsClassifier(n_neighbors=k),
        nearest_neighbors.KNN(k),
    ]
    compareModels(models, (X_train, X_test), (y_train, y_test), accuracy_score)

Model 0
Time to fit : 0.010806499980390072 s
Performance : 0.7833333333333333
Model 1
Time to fit : 0.0002756999747361988 s


TypeError: wrong number of positional arguments: expected 1, got 2