# Preprocessing

In [1]:
# import libraries
import numpy as np
from sklearn.model_selection import cross_val_score

In [2]:
# clone github repo
# NOTE: this cell is not idempotent. When the checkout already exists,
# `git clone` fails with "fatal: destination path ... already exists" and the
# notebook continues with whatever is already in that directory.
!git clone https://github.com/ch-zheng/cchs-prediction.git
# Move into the checkout so the relative 'data/...' paths used by the
# following cells resolve against the repository root.
%cd cchs-prediction/

fatal: destination path 'cchs-prediction' already exists and is not an empty directory.
/content/cchs-prediction


In [3]:
# Full dataset: feature matrix and its label vector, read from the checkout.
# These two arrays are the ones passed to cross-validation further down.
samples, labels = np.load('data/samples.npy'), np.load('data/labels.npy')

In [4]:
# Pre-split data shipped with the repository: each split is a
# (samples, labels) pair stored under its own sub-directory.

# training split -- used to fit the models before extracting coefficients
X_train, y_train = (
    np.load('data/training/samples.npy'),
    np.load('data/training/labels.npy'),
)

# test split
X_test, y_test = (
    np.load('data/test/samples.npy'),
    np.load('data/test/labels.npy'),
)

# Create models

In [5]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [6]:
# initialize models
# Decision-tree fitting permutes the candidate features at random when it
# searches for splits, so the seed is pinned; without it the tree's accuracy
# changes from one run of the notebook to the next.
tree = DecisionTreeClassifier(random_state=0)
logreg = LogisticRegression(max_iter=1000)  # iteration cap raised explicitly
# TODO: lasso model (Lasso is imported but not instantiated yet)
# TODO: polynomial regression model (PolynomialFeatures / LinearRegression
#       are imported but not instantiated yet)
ridge = RidgeClassifier()
knearest = neighbors.KNeighborsClassifier(n_jobs=-1)  # -1: use all CPU cores
svm_model = svm.SVC()
naive = GaussianNB()

In [8]:
# Registry mapping a display name to its estimator instance. The later cells
# iterate over it, so the order here is the order in which the models are
# cross-validated and fitted.
# TODO: register lasso and polyreg here once they are initialized
models = dict([
    ("Decision Tree", tree),
    ("Logistic Regression", logreg),
    ("Ridge Regression", ridge),
    ("K-Nearest Neighbors", knearest),
    ("SVM", svm_model),
    ("Naive Bayes", naive),
])

# Evaluate models

### Average accuracy

In [9]:
# Mean accuracy per model name, stored as a fraction (the display cell
# multiplies by 100). It starts out holding only the multilayer perceptron's
# score, a hard-coded figure attributed to mlp.py rather than computed here.
avg_accuracy = {"Multilayer Perceptron": 0.900}

In [10]:
# Score every registered model with k-fold cross-validation on the full
# dataset and record the mean fold accuracy under the model's display name.
N_FOLDS = 100  # number of cross-validation folds
for name, m in models.items():
  # n_jobs=-1 evaluates the folds in parallel on all available cores
  scores = cross_val_score(m, samples, labels, cv=N_FOLDS, n_jobs=-1)
  avg_accuracy[name] = scores.mean()

In [11]:
# Print one line per model, highest mean accuracy first: the name in a
# 30-character left-aligned column, then the accuracy as a percentage with
# two decimals in a 20-character left-aligned column.
ranked = sorted(avg_accuracy.items(), key=lambda pair: pair[1], reverse=True)
for model_name, accuracy in ranked:
  percent = "{:.2f}".format(accuracy * 100)
  print("%-30s%-20s" % (model_name, percent))

Multilayer Perceptron         90.00               
Ridge Regression              81.94               
Logistic Regression           78.05               
K-Nearest Neighbors           71.52               
Decision Tree                 70.45               
SVM                           68.68               
Naive Bayes                   53.77               


# Generate coefficients

In [12]:
# Train every registered estimator in place on the training split so that
# fitted attributes are available to the cells that follow.
for estimator in models.values():
  estimator.fit(X_train, y_train)

In [52]:
# Map each model name to its coefficient list, for the models that expose one.
# NOTE: first two coeffs are for race, age
coeffs = {}
for label, estimator in models.items():
  try:
    # first element of the nested list built from `coef_`
    # (presumably the single row of a 2-D coefficient matrix -- TODO confirm)
    first_row = estimator.coef_.tolist()[0]
  except AttributeError:
    # this model has no usable `coef_`; leave it out of the export
    continue
  coeffs[label] = first_row

In [53]:
print(coeffs.keys())

dict_keys(['Logistic Regression', 'Ridge Regression'])


In [55]:
# write coefficient arrays to csv
import csv

OUTPUT = "data/coefficients.csv"
N_POINTS = 68  # number of x/y column pairs that follow "race" and "age"

# header: model name, the two leading covariates, then x0, y0, x1, y1, ...
# NOTE(review): this yields 2 + 2 * N_POINTS coefficient columns; presumably
# that equals the length of every coefficient array -- confirm against the data.
header = ["model", "race", "age"]
for i in range(N_POINTS):
  header += ['x' + str(i), 'y' + str(i)]

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' on write end up with an
# extra blank line after every row.
with open(OUTPUT, 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(header)

  # store coefficients: one row per model, its name followed by its values
  for name, coeff_arr in coeffs.items():
    writer.writerow([name] + list(coeff_arr))