In [2]:
import pandas as pd
import numpy as np
import numpy.linalg as la

In [3]:
# load in raw dataset
df = pd.read_csv("./lab3_data/auto-mpg.tsv", sep='\t')
# standardize all of the continuous data
cont = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]
for str in cont:
    df[str] = (df[str] - df[str].mean())/df[str].std()
# drop car name column b/c hard to make use of categorical data
df = df.drop(columns=["car_name"])
# one hot encode origin
df = pd.get_dummies(df, columns=["origin"], dtype=int)

df


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_1,origin_2,origin_3
0,-1,1.482053,-0.278900,-0.946947,2.065470,1.072424,-1.623241,1,0,0
1,-1,1.482053,-0.268980,-0.925994,1.646352,-0.196214,-1.623241,1,0,0
2,-1,1.482053,-0.093734,-0.881094,1.927726,-0.558682,-1.623241,1,0,0
3,-1,1.482053,-0.232609,-0.896061,1.653416,-0.739916,-1.623241,1,0,0
4,-1,1.482053,-0.126800,-0.985860,0.808116,-1.646086,-0.808850,1,0,0
...,...,...,...,...,...,...,...,...,...,...
387,1,-0.862911,1.691792,-0.087867,-0.756513,2.957258,1.091394,0,1,0
388,1,-0.862911,1.923249,0.031865,-0.997859,3.283479,1.634321,0,1,0
389,1,-0.862911,1.691792,-0.087867,-1.050838,2.232322,1.091394,0,1,0
390,1,-0.862911,1.724858,0.480861,-1.327503,-0.631175,1.091394,0,0,1


In [11]:
data = df.to_numpy()
# randomly shuffling data for training/test purposes
np.random.shuffle(data)
labels = np.array([data[:, 0]])
data = np.delete(data, 0, 1).T

labels.shape

(1, 392)

In [5]:
def averaged_perceptron(data, labels, T):
    d, n = data.shape
    theta = np.zeros((d,1))
    theta_0 = np.zeros(1)
    ths = np.zeros((d,1))
    th0s = np.zeros(1)
  
    for t in range(T):     
      for i in range(n):
        y = labels[0,i]
        x = data[:,i]
        
        a = np.dot(x,theta)+theta_0
        if np.sign(y*a)[0] <=0: # update the thetas
          theta[:,0] = theta[:,0]+ y*x
          theta_0 = theta_0 + y
        
        ths += theta
        th0s += theta_0
    return (ths/(n*T),th0s/(n*T))


In [6]:
def score(data_test, labels_test, th, th0):
    d, n = data_test.shape
    ret = 0
    for i in range(n):
        x = data_test[:, i]
        y = labels_test[0, i]
        a = np.dot(x, th)+th0
        if np.sign(a)[0]==y:
            ret+=1
    return ret

In [7]:

def eval_classifier(learner, data_train, labels_train, data_test, labels_test):
    th, th0 = learner(data_train, labels_train, 100)
    return score(data_test, labels_test, th, th0)/labels_test.shape[1]

In [8]:
def xval_learning_alg(learner, data, labels, k):
  d, n = data.shape
  a = np.array_split(data, k, axis=1)
  b = np.array_split(labels, k, axis=1)
  score = 0
  for i in range(k):
    # initialize testing data/labels
    data_test = a[i]
    label_test = b[i]

    data_train = np.zeros((d, 1))
    label_train = np.zeros((1, 1))
    for j in range(0, k):
      if i==j:
        continue
      data_train = np.concatenate((data_train, a[j]), axis=1)
      label_train = np.concatenate((label_train, b[j]), axis=1)
    data_train = np.delete(data_train, 0, 1)
    label_train = np.delete(label_train, 0, 1)
    score += eval_classifier(learner, data_train, label_train, data_test, label_test)
  return score/k


In [9]:
xval_learning_alg(averaged_perceptron, data, labels, 10)

0.8901282051282051