In [None]:
import sklearn
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
BASE_URL = "drive/MyDrive/Citadel Women's Datathon/code"

In [None]:
zip_pcpark_income_df = pd.read_csv(f"{BASE_URL}/zip_pcpark_income.csv")

In [None]:
zip_pcpark_income_df.head()

Unnamed: 0.1,Unnamed: 0,ZIPCODE,PC_PARK,MEDIAN_INCOME
0,0,35903,3.411571,3
1,1,35904,2.7071,3
2,2,35906,9.7415,3
3,3,35972,0.0,3
4,4,35954,0.154,3


In [None]:
del zip_pcpark_income_df[zip_pcpark_income_df.columns[0]]

In [None]:
zip_pcpark_income_df.head()

Unnamed: 0,ZIPCODE,PC_PARK,MEDIAN_INCOME
0,35903,3.411571,3
1,35904,2.7071,3
2,35906,9.7415,3
3,35972,0.0,3
4,35954,0.154,3


In [None]:
# Treat all columns as ints
zip_pcpark_income_df["PC_PARK"] = zip_pcpark_income_df["PC_PARK"].astype(int)

In [None]:
data = {"training": [], "validation": [], "test": []}
labels = {"training": [], "validation": [], "test": []}

In [None]:
# Set aside 70% for training, 10% for validation, 20% for test
training_size = int(len(zip_pcpark_income_df) * 0.7)
validation_size = int(len(zip_pcpark_income_df) * 0.1)
test_size = int(len(zip_pcpark_income_df) * 0.2)

In [None]:
from sklearn.utils import shuffle

# Shuffle training data
zip_pcpark_income_df = shuffle(zip_pcpark_income_df)

In [None]:
data["training"] = zip_pcpark_income_df["PC_PARK"][:training_size].values.reshape(-1, 1)
labels["training"] = zip_pcpark_income_df["MEDIAN_INCOME"][:training_size].values.reshape(-1, 1)

data["validation"] = zip_pcpark_income_df["PC_PARK"][training_size: training_size + validation_size].values.reshape(-1, 1)
labels["validation"] = zip_pcpark_income_df["MEDIAN_INCOME"][training_size: training_size + validation_size].values.reshape(-1, 1)

data["test"] = zip_pcpark_income_df["PC_PARK"][training_size + validation_size: training_size + validation_size + test_size].values.reshape(-1, 1)
labels["test"] = zip_pcpark_income_df["MEDIAN_INCOME"][training_size + validation_size: training_size + validation_size + test_size].values.reshape(-1, 1)

# Sanity checks
assert len(data["training"]) == len(labels["training"])
assert len(data["validation"]) == len(labels["validation"])
assert len(data["test"]) == len(labels["test"])
assert len(data["validation"]) < len(data["test"]) and len(data["test"]) < len(data["training"])

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Tune C hyperparameter
c_accuracies = {}
c = 0.001
for _ in range(7):
  svc = SVC(max_iter=-1, C = c, verbose=False)
  svc.fit(data["training"], labels["training"])

  y_train_true = labels["training"]
  y_train_pred = svc.predict(data["training"])

  y_val_true = labels["validation"]
  y_val_pred = svc.predict(data["validation"])

  train_accuracy = accuracy_score(y_train_true, y_train_pred)
  validation_accuracy = accuracy_score(y_val_true, y_val_pred)

  c_accuracies[c] = validation_accuracy

  c *= 10

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
c_accuracies

{0.001: 0.6790281329923273,
 0.01: 0.6790281329923273,
 0.1: 0.6790281329923273,
 1.0: 0.6790281329923273,
 10.0: 0.6790281329923273,
 100.0: 0.6790281329923273,
 1000.0: 0.6790281329923273}

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Tune training size
training_size_accuracies = {}
pct = 0.6
for _ in range(5):
  training_size = int(len(data["training"]) * pct)
  svc = SVC(max_iter=-1, C = 1, verbose=False)
  training_data = data["training"][:training_size]
  training_labels = labels["training"][:training_size]
  svc.fit(training_data, training_labels)

  y_train_true = training_labels
  y_train_pred = svc.predict(training_data)

  y_val_true = labels["validation"]
  y_val_pred = svc.predict(data["validation"])

  train_accuracy = accuracy_score(y_train_true, y_train_pred)
  validation_accuracy = accuracy_score(y_val_true, y_val_pred)

  training_size_accuracies[pct] = validation_accuracy

  pct += .1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
training_size_accuracies

{0.6: 0.6790281329923273,
 0.7: 0.6790281329923273,
 0.7999999999999999: 0.6790281329923273,
 0.8999999999999999: 0.6790281329923273,
 0.9999999999999999: 0.6790281329923273}

In [None]:
svc = SVC(max_iter=-1, C = 1, verbose=False)
svc.fit(data["training"], labels["training"])

y_train_true = labels["test"]
y_train_pred = svc.predict(data["test"])
test_accuracy = accuracy_score(y_val_true, y_val_pred)

  y = column_or_1d(y, warn=True)


In [None]:
test_accuracy

0.6790281329923273