In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [2]:
# Number of cross-validation folds used by evaluate() and tuning().
NUM_FOLDS = 10

import csv
# Kaggle input locations of the labelled training CSV and the test CSV.
train_csv_path = '/kaggle/input/digit-recognizer/train.csv'
test_csv_path = '/kaggle/input/digit-recognizer/test.csv'
# Cap on the number of training rows read_data() returns; None disables the cap.
max_tam = 1000

_USE_MAX_TAM = object()  # sentinel meaning "fall back to the module-level max_tam"


def read_data(path=None, limit=_USE_MAX_TAM):
    """Load the labelled training CSV into feature and label arrays.

    The first line of the file is treated as a header and skipped. In every
    other row the first column is the label and the remaining columns are
    integer features.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the module-level ``train_csv_path``.
    limit : int or None, optional
        Maximum number of rows to return. Defaults to the module-level
        ``max_tam``. ``None`` means "return every row".

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` holds the integer features, one row per sample. ``y`` holds the
        labels exactly as they appear in the file, i.e. as strings.
    """
    if path is None:
        path = train_csv_path
    if limit is _USE_MAX_TAM:
        limit = max_tam

    # With a non-negative cap there is no point parsing rows that would be
    # sliced away afterwards, so stop as soon as enough rows were collected.
    stop_early = limit is not None and limit >= 0

    X = []
    y = []
    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the headers
        for row in csv_reader:
            if stop_early and len(X) >= limit:
                break
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            X.append(list(map(int, row[1:])))
            y.append(row[0])

    if limit is not None:
        # Still slice so that a negative cap keeps its list-slicing meaning.
        X = X[:limit]
        y = y[:limit]
    return np.array(X), np.array(y)

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold


def evaluate(model):
    """Return the mean cross-validated accuracy of ``model`` on the training data.

    The data comes from ``read_data()`` and is split by a single repetition
    of ``NUM_FOLDS``-fold cross-validation with a fixed ``random_state`` so
    that every model is scored on the same folds.

    Parameters
    ----------
    model : estimator
        Any object accepted by ``sklearn.model_selection.cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold accuracy scores.
    """
    # read the data
    X, y = read_data()

    # define the evaluation procedure
    cv = RepeatedKFold(n_splits=NUM_FOLDS, n_repeats=1, random_state=1)

    # evaluate the model and collect the scores (n_jobs=-1: use all cores)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

    # Use the array's own mean() so this cell does not depend on a bare
    # `mean` name that is only imported in a later cell.
    return scores.mean()

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Full catalogue of candidate estimators with default settings. It used to be
# bound to `models` and was immediately overwritten by the shortlist below;
# it now has its own name so that it is not silently discarded.
all_models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "MLPClassifier": MLPClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis()
}

# Shortlist that try_all_models() actually evaluates. random_state is fixed
# on the estimators constructed with it so repeated runs are comparable.
models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "SVC": SVC(random_state=0),
    "Random Forest": RandomForestClassifier(random_state=0),
    "MLPClassifier": MLPClassifier(random_state=0),
}

In [5]:
from numpy import mean

def try_all_models():
    """Score every estimator in ``models`` with ``evaluate`` and print the results.

    A progress line is printed before each evaluation; once all estimators
    are done, the ``{name: score}`` dictionary is printed. Nothing is returned.
    """
    results = {}
    for label, estimator in models.items():
        print(f"Evaluating {label}")
        results[label] = evaluate(estimator)
    print(results)

In [6]:
# Cross-validate every estimator registered in `models` and print their scores.
try_all_models()

Evaluating KNeighborsClassifier
Evaluating SVC
Evaluating Random Forest
Evaluating MLPClassifier
{'KNeighborsClassifier': 0.8459999999999999, 'SVC': 0.9, 'Random Forest': 0.8870000000000001, 'MLPClassifier': 0.784}


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

def tuning():
    """Grid-search the SVC kernel and print the best setting and its score.

    Uses the training data from ``read_data()``, accuracy as the metric and
    one repetition of ``NUM_FOLDS``-fold cross-validation with a fixed seed.
    Prints ``best_params_`` followed by ``best_score_``; returns nothing.
    """
    features, labels = read_data()

    # Only the kernel is searched; every other SVC setting stays at its default.
    kernel_grid = {'kernel': ['linear', 'poly', 'rbf']}
    folds = RepeatedKFold(n_splits=NUM_FOLDS, n_repeats=1, random_state=1)

    search = GridSearchCV(
        estimator=SVC(random_state=0),
        param_grid=kernel_grid,
        scoring='accuracy',
        n_jobs=-1,
        cv=folds,
    )
    search.fit(features, labels)

    for outcome in (search.best_params_, search.best_score_):
        print(outcome)

In [8]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
import numpy as np

class MyOwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that scores each class by its mean dot product with a sample.

    For a query row ``x`` and a class ``c`` the score is the average of
    ``sum(x * s)`` over every training row ``s`` labelled ``c``. Because that
    average equals ``x . (sum of the class rows) / (number of class rows)``,
    ``fit`` stores one summed vector and one count per class, and prediction
    is a single matrix product instead of a loop over all training rows.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray
        Sorted unique labels seen in ``y``.
    samples : dict
        Maps each label to the list of training rows carrying that label.
    class_sums_ : ndarray, shape (n_classes, n_features)
        Element-wise sum of the training rows of each class.
    class_counts_ : ndarray, shape (n_classes,)
        Number of training rows of each class.

    Note: the values returned by ``predict_proba`` are the class scores
    divided by their sum. They are only proper probabilities when all scores
    are non-negative (presumably the case for non-negative features such as
    pixel intensities; verify for other data).
    """

    def fit(self, X, y):
        """Memorise the training rows per class and their per-class sums.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.samples = {}
        sums = []
        counts = []
        for label in self.classes_:
            members = X[y == label]
            self.samples[label] = list(members)
            sums.append(members.sum(axis=0))
            counts.append(len(members))
        self.class_sums_ = np.array(sums)
        self.class_counts_ = np.array(counts)
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the label with the highest score.

        The result is indexed out of ``classes_``, so it keeps the dtype of
        the training labels rather than being coerced through a float array.
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)]

    def predict_proba(self, X):
        """Return the normalised class scores, shape ``(len(X), n_classes)``.

        Columns follow the order of ``classes_``. A row whose scores sum to
        zero (for example an all-zero sample) cannot be normalised and gets a
        uniform distribution instead of NaNs.
        """
        X = np.asarray(X)
        n_classes = len(self.classes_)
        if X.size == 0:
            # Nothing to score; keep a well-formed 2-D result.
            return np.empty((0, n_classes))

        # Mean dot product with each class: (x . class_sum) / class_count.
        scores = (X @ self.class_sums_.T) / self.class_counts_

        totals = scores.sum(axis=1, keepdims=True)
        zero_total = totals == 0
        probas = scores / np.where(zero_total, 1, totals)
        probas[zero_total.ravel()] = 1.0 / n_classes
        return probas

In [9]:
def read_test_data(path=None):
    """Load the test CSV into an integer feature array.

    The first line of the file is treated as a header and skipped; every
    column of every remaining row is parsed as an integer.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the module-level ``test_csv_path``.

    Returns
    -------
    numpy.ndarray
        One row per CSV data row, in file order.
    """
    if path is None:
        path = test_csv_path
    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the headers
        # csv.reader yields [] for blank lines; skip them so the array stays rectangular.
        return np.array([list(map(int, row)) for row in csv_reader if row])

In [10]:
# Load the arrays used to fit the submission model and to predict with it.
# read_data() applies the module-level max_tam cap, so X_train/y_train hold at
# most that many rows. NOTE(review): confirm the cap is intended for the final
# model and not only for the quick cross-validation experiments.
X_train,y_train = read_data()
X_test = read_test_data()

In [11]:
# Fit an SVC (only random_state is set) on the loaded training arrays, then
# label every test row with it.
model = SVC(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
# Build the submission table: one row per prediction, in test-set order.
predictions = y_pred
submission = pd.DataFrame()
# ImageId is 1-based in the Digit Recognizer submission format, and its length
# is taken from the predictions instead of a hard-coded row count.
submission['ImageId'] = range(1, predictions.shape[0] + 1)
submission['Label'] = predictions.reshape((predictions.shape[0]))
# index=False keeps the DataFrame index out of the file (columns: ImageId, Label).
submission.to_csv('sub.csv', index=False)

In [13]:
# Write the submission table to sub.csv without the index column. The previous
# cell already performs this same write, so this only rewrites the file.
submission.to_csv('sub.csv', index=False)