In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from machine_learning import linear
from machine_learning import preprocessing, validation, multiclass_classifier
import pandas as pd
import numpy as np
import cufflinks as cf
from scipy import stats
cf.go_offline()

In [None]:
# Wisconsin breast-cancer data (UCI). The file has no header row, so column
# names are supplied here. Missing values are written as "?" in the raw file;
# they are parsed as NaN at read time via `na_values` instead of a later
# `.replace('?', np.NaN)` (the `np.NaN` alias was removed in NumPy 2.0).
# Columns are then cast to float where possible and every row containing a
# missing value is dropped.
breast_cancer = (
    pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
        header=None,
        names=[
            "id_number",
            "clump_thickness",
            'uniformity_cell_size',
            'uniformity_cell_shape',
            "marginal_adhesion",
            "single_epithelial_cell_size",
            "bare_nuclei",
            "bland_chromatin", "normal_nucleoli", "mitosis", "class"
        ],
        na_values="?",
    )
    .astype('float', errors='ignore')
    .dropna(how='any', axis=0)
)

In [None]:
# Preview the first five cleaned rows.
breast_cancer.head(5)

In [None]:
# Feature matrix: every column except the id and the label, passed through
# `ss.fit_transform`. Target: the `class` column as integer category codes.
# NOTE(review): `ss` is not defined in any visible cell of this notebook --
# presumably a scaler instance created in a since-deleted cell; confirm and
# define it explicitly so the notebook survives Restart & Run All.
# NOTE(review): the transform is fit on all rows here, before the
# cross-validation split further down.
breast_X = ss.fit_transform(
    breast_cancer.drop(columns=['id_number', 'class']).values
)
breast_y = breast_cancer['class'].astype('category').cat.codes.values

In [None]:
from collections import Counter

In [None]:
# Most frequent class code in breast_y (the label a majority-class baseline would predict).
top_label, _top_count = Counter(breast_y).most_common(1)[0]
top_label

In [None]:
# 5-fold stratified cross-validation on the breast-cancer data. For every fold
# an Adaline network and a logistic-regression classifier are fit on the
# training split and scored on the test split; a majority-class predictor is
# scored alongside them as the baseline. One accuracy value per fold is
# appended to each list.
kfold = validation.KFoldStratifiedCV(num_folds=5)
accuracy_adaline = []
accuracy_lr = []
baseline = []
for train, test in kfold.split(X=breast_X, y=breast_y):
    X_train, y_train = breast_X[train], breast_y[train]
    X_test, y_test = breast_X[test], breast_y[test]

    sweet_adaline = linear.AdalineNetwork(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=10000,
        learning_rate=.0001
    )

    logistic_regression = linear.LogisticRegressionClassifier(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=10000,
        learning_rate=.0001
    )

    # `stats.mode(...).mode` is a length-1 array on older SciPy but a scalar on
    # SciPy >= 1.11 (where `.mode[0]` raises); atleast_1d handles both.
    majority_class = np.atleast_1d(stats.mode(y_train).mode)[0]
    baseline.append(np.mean(majority_class == y_test))

    sweet_adaline.fit(X_train, y_train)
    logistic_regression.fit(X_train, y_train)
    accuracy_adaline.append(np.mean(sweet_adaline.predict(X_test) == y_test))
    accuracy_lr.append(np.mean(logistic_regression.predict(X_test) == y_test))
    

In [None]:
# Evenly spaced grid over [-10, 10) with a step of 0.05.
# The original cell read `.linspace(-10, 10, .05)`, which is a syntax error
# (missing `np`), and `linspace` takes a sample count rather than a step as its
# third argument; `arange` is the step-based equivalent.
# NOTE(review): the intent of this cell is inferred from the 0.05 argument -- confirm.
np.arange(-10, 10, .05)

In [734]:
# Mean accuracy over the folds, in the order: Adaline, logistic regression, baseline.
[np.mean(fold_scores) for fold_scores in (accuracy_adaline, accuracy_lr, baseline)]

[0.954506623411733, 0.9648121113814545, 0.6500783995674506]

Soybean Data

In [815]:
from toolz import pipe


# Soybean (small) data from UCI. The file has no header row, so the column
# names are supplied explicitly. The loaded frame then flows through two
# steps: drop every column that has a single distinct value, and replace the
# class label with its integer category code.
soybean_data = pipe(
    pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
        header=None,
        names=[
            "date", "plant-stand", "precip", "temp", "hail", "crop-hist",
            "area-damaged", "severity", "seed-tmt", "germination",
            "plant-growth", "leaves", "leafspots-halo", "leafspots-marg",
            "leafspot-size", "leaf-shread", "leaf-malf", "leaf-mild", "stem",
            "lodging", "stem-cankers", "canker-lesion", "fruiting-bodies",
            "external decay", "mycelium", "int-discolor", "sclerotia",
            "fruit-pods", "fruit spots", "seed", "mold-growth",
            "seed-discolor", "seed-size", "shriveling", "roots",
            "instance_class",
        ],
    ),
    # keep only the columns with more than one distinct value
    lambda frame: frame.loc[:, frame.nunique() > 1],
    # encode the class label as integer category codes
    lambda frame: frame.assign(
        instance_class=frame["instance_class"].astype("category").cat.codes
    ),
)

In [822]:
# One-hot encode every soybean feature (dropping the first level of each) and
# use the integer-coded class as the target.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default,
# while the recorded results were produced with the older uint8 output.
soybean_features = soybean_data.drop('instance_class', axis=1)
X, y = (
    pd.get_dummies(
        soybean_features,
        columns=soybean_features.columns,
        drop_first=True,
        dtype=np.uint8,
    ).values,
    soybean_data['instance_class'].values
)

# 5-fold stratified cross-validation: a MulticlassClassifier wrapping Adaline
# networks and a logistic-regression classifier are fit per fold and compared
# with a majority-class baseline. One accuracy value per fold is appended to
# each list.
classes = np.unique(y)
kfold = validation.KFoldStratifiedCV(num_folds=5)
accuracy_adaline = []
accuracy_lr = []
baseline = []
for train, test in kfold.split(X=X, y=y):
    sweet_adaline = multiclass_classifier.MulticlassClassifier(
        # factory ignores its positional arguments and always builds the same Adaline
        model_cls=lambda *args: linear.AdalineNetwork(
            convergence_tolerance=.0001,
            fit_intercept=True,
            max_iter=1000,
            learning_rate=.001
        ),
        classes=classes,
        cls_kwargs={i: {} for i in classes}
    )

    logistic_regression = linear.LogisticRegressionClassifier(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=1000,
        learning_rate=.001
    )

    sweet_adaline.fit(X[train], y[train])
    logistic_regression.fit(X[train], y[train])

    # `stats.mode(...).mode` is a length-1 array on older SciPy but a scalar on
    # SciPy >= 1.11 (where `.mode[0]` raises); atleast_1d handles both.
    majority_class = np.atleast_1d(stats.mode(y[train]).mode)[0]
    baseline.append(np.mean(majority_class == y[test]))
    accuracy_adaline.append(np.mean(sweet_adaline.predict(X[test]) == y[test]))
    accuracy_lr.append(np.mean(logistic_regression.predict(X[test]) == y[test]))
    


In [830]:
# Mean accuracy over the folds, in the order: baseline, Adaline, logistic regression.
[np.mean(fold_scores) for fold_scores in (baseline, accuracy_adaline, accuracy_lr)]

[0.36, 1.0, 1.0]

In [745]:
# Glass identification data (UCI). The file has no header row, so the eleven
# column names are supplied explicitly.
glass_data = pd.read_csv(
    filepath_or_buffer="https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
    names=[
        "id_number", "refractive_index", "sodium", "magnesium", "aluminum",
        "silicon", "potassium", "calcium", "barium", "iron", "class",
    ],
    header=None,
)

In [804]:
# Features: every glass column except the id and the label.
# Target: the class label as integer category codes (kept as a pandas Series).
X, y = (
    glass_data.drop(['id_number', 'class'], axis=1).values,
    glass_data['class'].astype('category').cat.codes
)

# 5-fold stratified cross-validation: a MulticlassClassifier wrapping Adaline
# networks and a logistic-regression classifier are fit per fold on
# MaxScaler-transformed features and compared with a majority-class baseline.
# The scaler is fit on the training split only.
classes = np.unique(y)
kfold = validation.KFoldStratifiedCV(num_folds=5)
accuracy_adaline = []
accuracy_lr = []
baseline = []
for train, test in kfold.split(X=X, y=y):
    sweet_adaline = multiclass_classifier.MulticlassClassifier(
        # factory ignores its positional arguments and always builds the same Adaline
        model_cls=lambda *args: linear.AdalineNetwork(
            convergence_tolerance=.0001,
            fit_intercept=True,
            max_iter=5000,
            learning_rate=.005
        ),
        classes=classes,
        cls_kwargs={i: {} for i in classes}
    )

    logistic_regression = linear.LogisticRegressionClassifier(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=15000,
        learning_rate=.005
    )

    ms = preprocessing.MaxScaler()
    ms.fit(X[train])
    # transform each split once and reuse it for both models
    X_train_scaled = ms.transform(X[train])
    X_test_scaled = ms.transform(X[test])

    sweet_adaline.fit(X_train_scaled, y[train])
    logistic_regression.fit(X_train_scaled, y[train])

    # `stats.mode(...).mode` is a length-1 array on older SciPy but a scalar on
    # SciPy >= 1.11 (where `.mode[0]` raises); atleast_1d handles both.
    majority_class = np.atleast_1d(stats.mode(np.asarray(y[train])).mode)[0]
    baseline.append(np.mean(majority_class == y[test]))
    accuracy_adaline.append(np.mean(sweet_adaline.predict(X_test_scaled) == y[test]))
    accuracy_lr.append(np.mean(logistic_regression.predict(X_test_scaled) == y[test]))
    


In [805]:
# Mean accuracy over the folds, in the order: baseline, Adaline, logistic regression.
[np.mean(fold_scores) for fold_scores in (baseline, accuracy_adaline, accuracy_lr)]

[0.35548894258196584, 0.5050456391154066, 0.44838182489345285]

In [452]:
def add_ones(X):
    """Return a copy of the 2-D array ``X`` with a leading column of ones.

    The new first column has one entry per row of ``X``; the remaining
    columns are the columns of ``X`` in their original order.
    """
    row_count = X.shape[0]
    ones_column = np.ones((row_count, 1))
    return np.hstack((ones_column, X))


# Fit a one-vs-rest logistic regression on glass_X / glass_y and display its
# accuracy on those same rows (no held-out split is used in this cell).
# NOTE(review): `LogisticRegression`, `glass_X` and `glass_y` are not defined or
# imported in any visible cell -- presumably scikit-learn's LogisticRegression
# and the glass feature matrix / labels from a since-deleted cell; confirm and
# define them explicitly so the notebook survives Restart & Run All.
lr = LogisticRegression(multi_class='ovr')
lr.fit(glass_X, glass_y)
np.mean(lr.predict(glass_X) == glass_y)





0.6962616822429907

In [9]:
# Iris data (UCI). The file has no header row, so column names are supplied.
iris_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    header=None, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
)

# Features: the four measurement columns. Target: the class label as integer
# category codes (kept as a pandas Series; `.values` is taken where needed).
X, y = (
    iris_data.drop(['class'], axis=1).values,
    iris_data['class'].astype('category').cat.codes
)

# Column layout of the exported prediction files: raw features, true class, prediction.
prediction_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class', 'prediction']

# Stratified 5-fold split, but only the FIRST fold is processed (see the
# `break` at the end of the loop body): both models are fit on
# MaxScaler-transformed features, their test-split predictions are written to
# CSV, and a single accuracy value is recorded per model.
classes = np.unique(y)
kfold = validation.KFoldStratifiedCV(num_folds=5)
accuracy_adaline = []
accuracy_lr = []
baseline = []
for train, test in kfold.split(X=X, y=y):
    sweet_adaline = multiclass_classifier.MulticlassClassifier(
        # factory ignores its positional arguments and always builds the same Adaline
        model_cls=lambda *args: linear.AdalineNetwork(
            convergence_tolerance=.0001,
            fit_intercept=True,
            max_iter=1000,
            learning_rate=.005
        ),
        classes=classes,
        cls_kwargs={i: {} for i in classes}
    )

    logistic_regression = linear.LogisticRegressionClassifier(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=1000,
        learning_rate=.005
    )

    ms = preprocessing.MaxScaler()
    ms.fit(X[train])
    # transform each split once and reuse it for both models
    X_train_scaled = ms.transform(X[train])
    X_test_scaled = ms.transform(X[test])

    print("Fitting Adaline")
    sweet_adaline.fit(X_train_scaled, y[train])

    print("Fitting LR")
    logistic_regression.fit(X_train_scaled, y[train])

    # predict once per model; the same arrays feed both the CSV export and the accuracy
    lr_predictions = np.array(logistic_regression.predict(X_test_scaled))
    adaline_predictions = np.array(sweet_adaline.predict(X_test_scaled))

    for csv_path, predictions in (
        ("logistic_regression_iris_predictions.csv", lr_predictions),
        ("adaline_iris_predictions.csv", adaline_predictions),
    ):
        pd.DataFrame(
            np.hstack(
                [
                    X[test],
                    y[test].values.reshape(-1, 1),
                    predictions.reshape(-1, 1),
                ]
            ), columns=prediction_columns
        ).to_csv(csv_path, index=False)

    # `stats.mode(...).mode` is a length-1 array on older SciPy but a scalar on
    # SciPy >= 1.11 (where `.mode[0]` raises); atleast_1d handles both.
    majority_class = np.atleast_1d(stats.mode(np.asarray(y[train])).mode)[0]
    baseline.append(np.mean(majority_class == y[test]))
    accuracy_adaline.append(np.mean(adaline_predictions == y[test]))
    accuracy_lr.append(np.mean(lr_predictions == y[test]))
    # stop after the first fold so the CSV files hold a single fold's predictions
    break
    

Fitting Adaline
Fitting LR


In [837]:
# 1984 House voting records (UCI). The file has no header row, so column names
# are supplied. "?" entries become NaN, "y"/"n" become 1/0, and then every
# column (the party label included) is one-hot encoded with the first level
# dropped and an extra NaN-indicator column per original column.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default,
# while older releases returned uint8.
house_votes_data = pipe(
    pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data",
        header=None,
        names=[
            "instance_class",
            "handicapped-infants",
            "water-project-cost-sharing",
            "adoption-of-the-budget-resolution",
            "physician-fee-freeze",
            "el-salvador-aid",
            "religious-groups-in-schools",
            "anti-satellite-test-ban",
            "aid-to-nicaraguan-contras",
            "mx-missile",
            "immigration",
            "synfuels-corporation-cutback",
            "education-spending",
            "superfund-right-to-sue",
            "crime",
            "duty-free-exports",
            "export-administration-act-south-africa",
        ],
    )
    .replace("?", np.nan)
    .replace("y", 1)
    .replace("n", 0),
    lambda df: pd.get_dummies(
        df, columns=df.columns, drop_first=True, dummy_na=True, dtype=np.uint8
    )
)

In [850]:
# Features: every dummy column except the two derived from the party label.
# Target: the "republican" indicator column.
X = house_votes_data.drop(
    columns=['instance_class_republican', 'instance_class_nan']
).values
y = house_votes_data['instance_class_republican'].values

In [852]:
# 5-fold stratified cross-validation on the house-votes data. For every fold
# an Adaline network and a logistic-regression classifier are fit on the
# training split and scored on the test split; a majority-class predictor is
# scored alongside them as the baseline. One accuracy value per fold is
# appended to each list.
kfold = validation.KFoldStratifiedCV(num_folds=5)
accuracy_adaline = []
accuracy_lr = []
baseline = []
for train, test in kfold.split(X=X, y=y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    sweet_adaline = linear.AdalineNetwork(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=1000,
        learning_rate=.0001
    )

    logistic_regression = linear.LogisticRegressionClassifier(
        convergence_tolerance=.0001,
        fit_intercept=True,
        max_iter=1000,
        learning_rate=.0001
    )

    # `stats.mode(...).mode` is a length-1 array on older SciPy but a scalar on
    # SciPy >= 1.11 (where `.mode[0]` raises); atleast_1d handles both.
    majority_class = np.atleast_1d(stats.mode(y_train).mode)[0]
    baseline.append(np.mean(majority_class == y_test))

    sweet_adaline.fit(X_train, y_train)
    logistic_regression.fit(X_train, y_train)
    accuracy_adaline.append(np.mean(sweet_adaline.predict(X_test) == y_test))
    accuracy_lr.append(np.mean(logistic_regression.predict(X_test) == y_test))

In [853]:
# Mean accuracy over the folds, in the order: baseline, Adaline, logistic regression.
[np.mean(fold_scores) for fold_scores in (baseline, accuracy_adaline, accuracy_lr)]

[0.6138052538212924, 0.953965274233919, 0.947147699934388]