### Packages

In [14]:
import numpy as np
import csv
import string
import collections
import random

### Functions

In [64]:
def load_data(file):
    X_raw = []
    y = []

    with open(file, 'rt', encoding='utf8') as f:
        dict_reader = csv.DictReader(f)

        for observation in dict_reader:
            y.append(observation['gross_profit'])
            observation.pop('gross_profit', None)

            for key in ['store','month', 'longitude', 'latitude', 'city', 'county', 'bottles', 'pop_city', 'fips', 'pop_county']:
                observation.pop(key, None)

            X_raw.append(observation)

    return X_raw, y

def convert_X_raw_to_array(X_raw):
    feature_keys = list(X_raw[0].keys())
    X = []

    for obs in X_raw:
        row = []

        for key in feature_keys:
            value = obs[key]
            row.append(value)

        X.append(row)

    return np.array(X)

def train_test_valid_split(X, y):
    random.seed(32)

    X_train = []
    X_test = []
    X_valid = []
    y_train = []
    y_test = []
    y_valid = []

    for index in range(len(X)):
        z = random.uniform(0, 1)

        if z < 0.7:
          X_train.append(X[index])
          y_train.append(y[index])

        elif z < 0.85:
            X_test.append(X[index])
            y_test.append(y[index])

        else:
            X_valid.append(X[index])
            y_valid.append(y[index])
                
    X_train = np.array(X_train, dtype=float)
    y_train = np.array(y_train, dtype=float)
    X_test = np.array(X_test, dtype=float)
    y_test = np.array(y_test, dtype=float)
    X_valid = np.array(X_valid, dtype=float)
    y_valid = np.array(y_valid, dtype=float)

    return X_train, X_test, X_valid, y_train, y_test, y_valid

def add_ones(X):
  beta_zero_column = np.ones((X.shape[0], 1))
  X_ones = np.hstack((beta_zero_column, X))

  return X_ones

def ols(X, y):
    X = X.astype(float)
    y = y.astype(float)

    # delete zero rows
    X = X[~np.all(X == 0, axis=1)]

    # delete zero columns
    X = X[:, ~np.all(X == 0, axis=0)]

    # ols
    Xt = X.transpose()
    Xt_X = np.dot(Xt, X)
    Xt_X_inv = np.linalg.pinv(Xt_X)
    Xt_X_inv_Xt = np.dot(Xt_X_inv, Xt)
    beta_hat = np.dot(Xt_X_inv_Xt, y)

    return beta_hat

def recenter(X, y):
    X = X - np.mean(X, 0)
    y = y - np.mean(y)

    return X, y

def ridge(X, y, fLambda):
    X = np.array(X)
    y = np.array(y)
    
    # delete zero rows
    X = X[~np.all(X == 0, axis=1)]
    
    # delete zero columns
    X = X[:, ~np.all(X == 0, axis=0)]
    
    # ridge
    Xt = X.transpose()
    Xt_X = np.dot(Xt, X)
    lambda_I = fLambda * np.identity(X.shape[1])
    Xt_X_lambda_I_inv = np.linalg.inv(Xt_X + lambda_I)
    Xt_X_lambda_I_inv_Xt = np.dot(Xt_X_lambda_I_inv, Xt)
    beta_hat = np.dot(Xt_X_lambda_I_inv_Xt, y)

    return beta_hat

# new function to get the cols removed, got code from ols
def get_removed_col_indices(X):
    X = np.array(X)
    original_cols = np.arange(X.shape[1])
    non_zero_cols = ~np.all(X  == 0, axis=0)
    removed_cols = original_cols[~non_zero_cols]
    return removed_cols, original_cols

# new function to remove features
def remove_features(feat_list, removed_cols):
  return [feature for i, feature in enumerate(feat_list) if i not in removed_cols]

# prediction
def predict(X, beta):
  X = np.array(X)
  X = X[:, ~np.all(X==0, axis=0)]
  beta = np.array(beta)
  y_hat = np.dot(X, beta)

  return y_hat

# calculate mse
def mean_squared_error(y_hat, y):
  mse = np.mean((y.flatten() - y_hat) ** 2)

  return mse

# 1. OLS Regression

## 1-1. load_data()

In [16]:
X_raw, y = load_data(file="data/iowa.csv")
y

['4499.84',
 '4598.18',
 '4840.74',
 '3828.55',
 '4691.36',
 '2410.09',
 '3672.24',
 '2836.75',
 '2938.72',
 '2105.74',
 '1793.33',
 '3507.85',
 '486.98',
 '544.34',
 '831.8399999999999',
 '102.0',
 '294.0',
 '1563.84',
 '1202.16',
 '1525.2',
 '1256.16',
 '495.12',
 '984.6',
 '450.48',
 '877.8000000000002',
 '9494.44',
 '14529.21',
 '19482.74',
 '17362.5',
 '13655.67',
 '21574.42',
 '14837.93',
 '16636.43',
 '19035.71',
 '14874.43',
 '17111.48',
 '22558.8',
 '9904.46',
 '7945.599999999999',
 '5346.0',
 '7236.809999999999',
 '6814.63',
 '6366.48',
 '10028.69',
 '7121.56',
 '8110.29',
 '6354.69',
 '7392.8',
 '8375.86',
 '1025.14',
 '1092.56',
 '2010.66',
 '2292.18',
 '2352.6',
 '1737.84',
 '2580.4700000000003',
 '1977.49',
 '3923.7',
 '2645.84',
 '1638.03',
 '4271.68',
 '335.16',
 '341.52',
 '419.15',
 '440.2000000000001',
 '408.18',
 '427.25',
 '416.2000000000001',
 '355.28000000000003',
 '349.33',
 '410.04',
 '558.59',
 '693.22',
 '369.77',
 '617.04',
 '733.85',
 '355.1',
 '377.72',
 '

In [25]:
X_raw[0]

{'cat_convenience_store': '0',
 'cat_distillery_brewery': '0',
 'cat_gas_station': '0',
 'cat_general_store': '0',
 'cat_grocery': '0',
 'cat_liquor_store_bar': '1',
 'cat_other': '0',
 'cat_pharmacy': '0',
 'cat_unknown': '0',
 'female_18_24': '351',
 'female_25_34': '445',
 'female_35_44': '464',
 'female_45_64': '1051',
 'female_65_over': '1093',
 'male_18_24': '406',
 'male_25_34': '491',
 'male_35_44': '571',
 'male_45_64': '1071',
 'male_65_over': '872',
 'annual_income': '65080',
 'excessive_drinking': '26',
 'gassale_x_gasstore': '0',
 'l_10th_mountain_american_single_malt': '0',
 'l_10th_mountain_bourbon': '0',
 'l_10th_mountain_brandy': '0',
 'l_10th_mountain_cordial': '0',
 'l_10th_mountain_rye': '0',
 'l_10th_mountain_vodka': '0',
 'l_135_east_hyogo_japanese_gin': '0',
 'l_15_stars_private_stock_8_and_15yr': '0',
 'l_173_craft_distillery_barrel_kane': '0',
 'l_173_craft_distillery_bourbon_whiskey': '0',
 'l_173_craft_distillery_broken_beaker_silver_rum': '0',
 'l_173_craft_

## 1-2. convert_X_raw_to_array()

In [17]:
X = convert_X_raw_to_array(X_raw)

In [44]:
X.shape

(23183, 4533)

## 1-3. train_test_valid_split()

In [18]:
X_train, X_test, X_valid, y_train, y_test, y_valid = train_test_valid_split(X, y)

In [11]:
print(X_train.shape)
print(y_train.shape[0])

(16199, 4533)
16199


In [48]:
print(X_test.shape)
print(y_test.shape[0])

(3549, 4533)
3549


In [46]:
print(X_valid.shape)
print(y_valid.shape[0])

(3435, 4533)
3435


## 1-4. add_ones()

In [19]:
X_ones = add_ones(X)

In [20]:
X_ones[0]

array(['1.0', '0', '0', ..., '0', '0', '0'], shape=(4534,), dtype='<U32')

## 1-5. ols()

In [23]:
beta_ols = ols(X_train, y_train)

In [24]:
beta_ols

array([-160.89194834,   -3.33339453,  -56.67157546, ...,    0.42340152,
          3.21451152,    0.56079671], shape=(4299,))

# 2. Redge Regression

## 2-1. Training Data

In [74]:
X_train, y_train = recenter(X_train, y_train)
beta_hat_train = ridge(X_train, y_train, fLambda=1)

In [75]:
removed_cols, original_cols = get_removed_col_indices(X_train)
feat_list = list(X_raw[0].keys())
final_cols_train = remove_features(feat_list, removed_cols)
beta_hat_train_labeled = np.column_stack((final_cols_train, beta_ridge))
np.savetxt("beta_hat_train_labeled.csv", beta_ridge_labeled, fmt="%s", delimiter=",")

In [76]:
y_hat_train = predict(X_train, beta_hat_train)
mse_train = mean_squared_error(y_hat_train, y_train)
print('MSE in sample:', mse_train)

MSE in sample: 264841.8511735275


## 2-2. Testing Data

In [69]:
X_test, y_test = recenter(X_test, y_test)
beta_hat_test = ridge(X_test, y_test, fLambda=1)

In [70]:
removed_cols, original_cols = get_removed_col_indices(X_test)
feat_list = list(X_raw[0].keys())
final_cols_test = remove_features(feat_list, removed_cols)
beta_hat_test_labeled = np.column_stack((final_cols_test, beta_hat_test))
np.savetxt("beta_hat_test_labeled.csv", beta_hat_test_labeled, fmt="%s", delimiter=",")

In [71]:
y_hat_test = predict(X_test, beta_hat_test)
mse_test = mean_squared_error(y_hat_test, y_test)
print('MSE out sample:', mse_test)

MSE out sample: 19563.101172821785
