In [66]:
# Imports
import numpy as np
import pandas as pd
import time
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Data Preprocessing 

In [67]:
# Configuration constants (everything a reader might want to tune)

# Dataset year; selects which pair of CSV files is loaded (2016 or 2017).
YEAR = 2017
# Columns whose missing-value count exceeds this fraction of the rows are dropped.
THRESHOLD = 0.60
# Value written into any entries that are still missing after preprocessing.
FILL_NA = 0
# Strategy for turning transactiondate strings into numbers.
DATE_CONVERSION = 'timestamps'
# Feature-scaling scheme applied to the merged frame.
PREPROCESSING = 'MeanNormalization'
# Number of folds used for K-Fold cross-validation.
KFOLD_SPLITS = 10

In [68]:
# Functions
def check_na(train):
    """Report the proportion of missing values in each column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``train`` with two columns: ``prop_na`` (the
        fraction of rows that are null in that column) and ``column`` (the
        column label), sorted by ``prop_na`` in ascending order.
    """
    row_count = train.shape[0]
    # Null count of each column divided by the number of rows gives the proportion
    missing_fraction = [train[col].isnull().sum() / row_count for col in train]
    report = pd.DataFrame({'prop_na': missing_fraction, 'column': train.columns})
    return report.sort_values('prop_na')

In [69]:
# Read csvs
# Each supported YEAR maps to its (properties file, training file) pair.
CSV_FILES_BY_YEAR = {
    2016: ('properties_2016.csv', 'train_2016_v2.csv'),
    2017: ('properties_2017.csv', 'train_2017.csv'),
}
if YEAR not in CSV_FILES_BY_YEAR:
    # Fail loudly instead of falling through: otherwise the merge below would
    # either raise a NameError or silently reuse stale frames left in the kernel.
    raise ValueError(
        "Unsupported YEAR {!r}; expected one of {}".format(YEAR, sorted(CSV_FILES_BY_YEAR))
    )
properties_csv, train_csv = CSV_FILES_BY_YEAR[YEAR]
properties = pd.read_csv(properties_csv, low_memory = False)
train = pd.read_csv(train_csv, low_memory = False)
# train has Y and properties has features.
# Left join on parcelid: every row of train is kept and the matching
# property features (when present) are appended as extra columns.
train = train.merge(properties, on = 'parcelid', how = 'left')

In [70]:
# Keep only the columns whose number of missing values does not exceed
# THRESHOLD expressed as a fraction of the row count.
max_missing_allowed = train.shape[0] * THRESHOLD
missing_per_column = train.isnull().sum(axis=0)
train = train.loc[:, missing_per_column <= max_missing_allowed]

In [71]:
# Convert transactiondate strings into floats.
# (Missing values are not filled here; that happens after normalization.)
# Read the column by name rather than through train.values[:, 2], which would
# materialize the whole mixed-dtype frame as an object array for one column.
date_strings = train['transactiondate'].to_numpy()
if DATE_CONVERSION == 'timestamps':
    # NOTE: time.mktime interprets the parsed date in the machine's local timezone.
    date_converted = [
        time.mktime(datetime.datetime.strptime(date_string, "%Y-%m-%d").timetuple())
        for date_string in date_strings
    ]
else:
    # Previously an unknown mode left date_converted empty and failed later
    # with an unrelated length-mismatch error on assignment.
    raise ValueError("Unsupported DATE_CONVERSION: {!r}".format(DATE_CONVERSION))
train['transactiondate'] = np.asarray(date_converted)
# Drop the string-valued columns. errors='ignore' keeps this working when a
# stricter THRESHOLD has already removed one of them.
train = train.drop(
    columns=['propertycountylandusecode', 'propertyzoningdesc'],
    errors='ignore',
)
# Target vector: the column at position 1, shaped (n_rows, 1).
# presumably this is the regression target ("Y") from the training csv — TODO confirm
y = train.iloc[:, 1].to_numpy()
y = y.reshape(y.shape[0], 1)

In [72]:
# Mean normalization: centre every column on its mean and scale by its range.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# K-Fold split performed later in the notebook.
if PREPROCESSING == "MeanNormalization":
    column_means = train.mean()
    column_ranges = train.max() - train.min()
    train = train.sub(column_means).div(column_ranges)
# Whatever is still missing at this point is replaced with FILL_NA
train = train.fillna(FILL_NA)

# Implementation of K-Fold Cross-Validation 

In [73]:
# Feature matrix: every column of train from position 2 onwards
x = train.values[:, 2:]
# (MinMax scaling of x via MinMaxScaler is currently disabled.)

# Build the K-Fold partitions; shuffling is seeded so the folds are reproducible
kf = KFold(n_splits = KFOLD_SPLITS, shuffle = True, random_state = 1)
folds = list(kf.split(x))
for train_index, test_index in folds:
    print("TRAIN:", train_index, "TEST:", test_index)
train_index_array = [fold[0] for fold in folds]
test_index_array = [fold[1] for fold in folds]

# Select which fold supplies the train/test rows used downstream
MY_INDEX = 1
fold_train_rows = train_index_array[MY_INDEX]
fold_test_rows = test_index_array[MY_INDEX]
x_train, x_test = x[fold_train_rows], x[fold_test_rows]
y_train, y_test = y[fold_train_rows], y[fold_test_rows]

TRAIN: [    0     2     3 ... 77609 77611 77612] TEST: [    1    28    30 ... 77602 77607 77610]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [   10    24    25 ... 77586 77593 77595]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    3    12    13 ... 77582 77603 77608]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    8    23    40 ... 77567 77597 77599]
TRAIN: [    0     1     3 ... 77610 77611 77612] TEST: [    2    11    22 ... 77569 77574 77605]
TRAIN: [    0     1     2 ... 77609 77610 77611] TEST: [   33    37    39 ... 77585 77601 77612]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [   26    35    36 ... 77587 77588 77591]
TRAIN: [    1     2     3 ... 77609 77610 77612] TEST: [    0     4     6 ... 77600 77606 77611]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    5    41    49 ... 77592 77604 77609]
TRAIN: [    0     1     2 ... 77610 77611 77612] TEST: [    9    15    18 ... 77589 77594 77596]


# Implementation of Linear Regression
##### To evaluate a different fold, set the MY_INDEX variable in the cell above to the desired fold index (0 to KFOLD_SPLITS - 1) and re-run the linear regression
##### Only the execution of a single fold is shown

In [78]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Short alias used when reporting the error below
mse = mean_squared_error

# Ordinary least-squares regressor
regr = linear_model.LinearRegression()

# Fit on the selected fold's training rows, then predict its held-out rows
model = regr.fit(x_train, y_train)
prediction = model.predict(x_test)
fold_mse = mse(y_test, prediction)
print("The MSE for the model above is: ", fold_mse)

The MSE for the model above is:  0.037502109792741566
