In [8]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import norm
df = pd.read_csv("usps-2cls.csv", header= None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,0,0,95,107,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,78,95,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,42,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
dir(df)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',

In [127]:
class Bayes:
    def fit(self,X,y):
        classes = np.unique(y)
        cls_dict={}
        for cls in classes:
            x_cls = X[y==cls]
            loc = x_cls.mean(axis=0)
            scale = np.std(x_cls, axis=0)
            dist = norm(loc,scale)
            cls_dict.update({cls:{"dist":dist}})
        self.cls_dict = cls_dict
    def pdf(self, dist, x):
        return (dist.pdf(x)*1000).prod() #*1000 for numerical instability
    def pred(self, x):
        scores=[self.pdf(x=x, **v) for v in self.cls_dict.values()]
        return np.argmax(scores)
    def predict(self, X):
        return list(map(self.pred, X))
        

In [164]:
from sklearn.model_selection import train_test_split
def get_score(X,y, model, p):
    test_size = 1-p
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    test = accuracy_score(y_test, y_pred_test)
    train = accuracy_score(y_train, y_pred_train)
    return train,test


In [129]:
n = 100

In [165]:
X = df.iloc[:,:-1].to_numpy()
y = df.iloc[:,-1].to_numpy()
score_dict = {"p":[], "train":[], "test":[]}
for p in [.1,.2,.5,.8,.9]:
    train_sc = []
    test_sc = []
    for i in range(n):
        bayes = Bayes() 
        train,test = get_score(X,y, bayes, p)
        train_sc.append(train)
        test_sc.append(test)
    train = np.round(sum(train_sc)/len(train_sc), 2)
    test = np.round(sum(test_sc)/len(test_sc), 2)
    score_dict["p"].append(p)
    score_dict["train"].append(1-train)
    score_dict["test"].append(1-test)


In [166]:
print(pd.DataFrame(score_dict).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &    p &  train &  test \\
\midrule
0 &  0.1 &   0.50 &  0.50 \\
1 &  0.2 &   0.50 &  0.50 \\
2 &  0.5 &   0.47 &  0.47 \\
3 &  0.8 &   0.29 &  0.29 \\
4 &  0.9 &   0.19 &  0.19 \\
\bottomrule
\end{tabular}



In [172]:
# knn
from sklearn.neighbors import KNeighborsClassifier
kneighbor_dict = {}
for k in [5,10,15,30]:
    print(f"scores for k = {k}")
    score_dict = {"p":[], "train":[], "test":[]}
    for p in [.1,.2,.5,.8,.9]:
        train_sc = []
        test_sc = []
        for i in range(n):
            knn = KNeighborsClassifier(n_neighbors = k)  
            train,test = get_score(X,y, knn, p)
            train_sc.append(train)
            test_sc.append(test)
        train = np.round(sum(train_sc)/len(train_sc), 2)
        test = np.round(sum(test_sc)/len(test_sc), 2)
        score_dict["p"].append(p)
        score_dict["train"].append(1-train)
        score_dict["test"].append(1-test)
    kneighbor_dict.update({k:score_dict})


scores for k = 5
scores for k = 10
scores for k = 15
scores for k = 30


In [173]:
new_dict ={}
for key, value in kneighbor_dict.items():
    for k,v in value.items():
        if k =="p":
            continue
        new_dict.update({(f"k={key}", k): v})

In [174]:
kneighbor_dict

{5: {'p': [0.1, 0.2, 0.5, 0.8, 0.9],
  'train': [0.050000000000000044,
   0.030000000000000027,
   0.020000000000000018,
   0.020000000000000018,
   0.020000000000000018],
  'test': [0.08999999999999997,
   0.06000000000000005,
   0.040000000000000036,
   0.030000000000000027,
   0.030000000000000027]},
 10: {'p': [0.1, 0.2, 0.5, 0.8, 0.9],
  'train': [0.08999999999999997,
   0.06000000000000005,
   0.040000000000000036,
   0.030000000000000027,
   0.030000000000000027],
  'test': [0.13,
   0.08999999999999997,
   0.050000000000000044,
   0.040000000000000036,
   0.030000000000000027]},
 15: {'p': [0.1, 0.2, 0.5, 0.8, 0.9],
  'train': [0.08999999999999997,
   0.06000000000000005,
   0.040000000000000036,
   0.030000000000000027,
   0.030000000000000027],
  'test': [0.10999999999999999,
   0.07999999999999996,
   0.050000000000000044,
   0.040000000000000036,
   0.030000000000000027]},
 30: {'p': [0.1, 0.2, 0.5, 0.8, 0.9],
  'train': [0.14,
   0.09999999999999998,
   0.06000000000000005

In [157]:
kneighbor_dict[5]["p"]

[0.1, 0.2, 0.5, 0.8, 0.9]

In [175]:
r = pd.DataFrame(new_dict)
r.index = kneighbor_dict[5]["p"]
r.index.name = "p"
print(r.to_latex())

\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{2}{l}{k=5} & \multicolumn{2}{l}{k=10} & \multicolumn{2}{l}{k=15} & \multicolumn{2}{l}{k=30} \\
{} & train &  test & train &  test & train &  test & train &  test \\
p   &       &       &       &       &       &       &       &       \\
\midrule
0.1 &  0.05 &  0.09 &  0.09 &  0.13 &  0.09 &  0.11 &  0.14 &  0.16 \\
0.2 &  0.03 &  0.06 &  0.06 &  0.09 &  0.06 &  0.08 &  0.10 &  0.11 \\
0.5 &  0.02 &  0.04 &  0.04 &  0.05 &  0.04 &  0.05 &  0.06 &  0.07 \\
0.8 &  0.02 &  0.03 &  0.03 &  0.04 &  0.03 &  0.04 &  0.05 &  0.05 \\
0.9 &  0.02 &  0.03 &  0.03 &  0.03 &  0.03 &  0.03 &  0.05 &  0.05 \\
\bottomrule
\end{tabular}

