# Homework 3 &mdash; Digit Classifier

### Connor Hornibrook

### Data Mining II &mdash; Dr. Breitzman

## Classifier and source data

For this script, I used the `sklearn.svm.SVC` classifier to make predictions. All source data was pulled from [Kaggle](https://www.kaggle.com/c/digit-recognizer/data). The trained model included in this repository has an accuracy of roughly 97%.

In [1]:
# import all needed libraries, configure basic logging
import pathlib
import pandas
import pickle
import logging
import csv
import numpy
from statistics import mean
from sklearn import linear_model, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold

# Set the root logger threshold to DEBUG so the INFO progress messages
# logged by the functions below are emitted.
logging.basicConfig(level=logging.DEBUG)

In [2]:
# Read the training CSV into a pandas DataFrame.
def get_data(directory=pathlib.Path(".", "data"), train_pattern="train.csv"):
    """Load the training set as a DataFrame.

    Args:
        directory: Folder (a ``pathlib.Path``) that holds the CSV file.
        train_pattern: File name of the training CSV inside ``directory``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for that file.
    """
    csv_location = str(directory.joinpath(train_pattern))
    frame = pandas.read_csv(csv_location)
    return frame

In [3]:
# Function that retrieves a fitted classifier. If there is a model saved locally under models/,
# then this will simply unpack the bytes using pickle and return the model object.
# If not, then it will create a new model from scratch, using the training data provided from Kaggle.
def get_model(train_data, directory=pathlib.Path(".", "models")):
    """Return a fitted digit classifier, training and caching it if needed.

    If ``<directory>/model.bin`` exists it is unpickled and returned as-is.
    Otherwise a ``MinMaxScaler`` + ``svm.SVC`` pipeline is evaluated with
    10-fold cross-validation, refitted on every training row, pickled to
    ``<directory>/model.bin`` and returned.

    The scaler lives inside the returned pipeline on purpose: the same
    rescaling that was learned during training is then applied automatically
    by ``predict``, so callers can pass raw pixel rows.

    Args:
        train_data: DataFrame with a ``label`` column; every other column is
            used as a feature. Not read when a cached model is found.
        directory: Folder (``pathlib.Path`` or ``str``) holding ``model.bin``.
            It is created when a freshly trained model has to be saved.

    Returns:
        The unpickled object, or the newly fitted pipeline. Both are used
        through ``predict``.
    """
    directory = pathlib.Path(directory)
    path = directory / "model.bin"
    if path.exists():
        logging.info("Loading model from memory.")
        # NOTE: pickle.load can execute arbitrary code; only point `directory`
        # at model files that were produced by this script.
        with open(str(path), "rb") as f:
            return pickle.load(f)

    logging.info("No model found in memory, creating one now.")
    model = make_pipeline(
        MinMaxScaler(feature_range=(0, 255)),
        svm.SVC(gamma="scale"),
    )
    y = train_data.label.to_numpy()
    x = train_data.drop(columns="label").to_numpy(dtype=float)

    logging.info("Training model...")
    scores = []
    for train_i, test_i in KFold(n_splits=10).split(x):
        # Fitting the whole pipeline per fold keeps the scaler from seeing
        # the held-out rows, so the reported score is not optimistic.
        model.fit(x[train_i], y[train_i])
        scores.append(model.score(x[test_i], y[test_i]))
    # The folds above only estimate accuracy; the model that gets saved is
    # refitted on all rows instead of whatever the last fold left behind.
    model.fit(x, y)
    logging.info("Done.")
    logging.info(f"Accuracy: {mean(scores)}")

    directory.mkdir(parents=True, exist_ok=True)
    with open(str(path), "wb") as f:
        pickle.dump(model, f)
    return model

In [4]:
# Function that makes predictions against the test.csv file from Kaggle,
# and outputs it to output/output.csv
def make_predictions(
    model,
    in_path=pathlib.Path(".", "data", "test.csv"),
    out_path=pathlib.Path(".", "output", "output.csv"),
    batch_size=1000,
):
    """Predict a label for every row of a pixel CSV and write a submission CSV.

    The input file is expected to start with a header row followed by one
    row of numeric pixel values per image. The output file gets the header
    ``ImageId,Label`` and one row per image, with ids counting up from 1 in
    input order.

    Args:
        model: Any object with a ``predict`` method that accepts a 2-D float
            array and returns one label per row.
        in_path: CSV file to read (``pathlib.Path`` or ``str``).
        out_path: CSV file to write; its parent directory is created if it
            does not exist yet.
        batch_size: Number of rows handed to ``model.predict`` per call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    in_path = pathlib.Path(in_path)
    out_path = pathlib.Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    logging.info("Making predictions...")
    # newline="" is what the csv module asks for on both ends; without it the
    # writer emits blank lines between rows on Windows.
    with open(str(in_path), "r", newline="") as in_data_file, \
            open(str(out_path), "w", newline="") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["ImageId", "Label"])
        reader = csv.reader(in_data_file)
        next(reader, None)  # skip the header; tolerate a completely empty file
        pixel_rows = (row for row in reader if row)  # ignore blank lines
        next_id = 1
        while True:
            # Predicting in batches avoids one predict() call per image
            # while keeping memory bounded by batch_size rows.
            batch = list(islice(pixel_rows, batch_size))
            if not batch:
                break
            # csv yields strings; convert once so the model sees numbers.
            labels = model.predict(numpy.asarray(batch, dtype=float))
            writer.writerows(enumerate(labels, start=next_id))
            next_id += len(batch)
    logging.info("Done.")

In [5]:
# Entry point: load the training data, obtain a model, write the predictions
def main():
    """Run the whole workflow: read data, get the model, predict."""
    training_frame = get_data()
    classifier = get_model(training_frame)
    make_predictions(classifier)
    

In [6]:
# run the program
# The guard keeps main() from running when this file is imported as a module.
if __name__ == "__main__":
    main()

INFO:root:Loading model from memory.
INFO:root:Making predictions...
INFO:root:Done.
