# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import glob
import math
import pickle

# Import Data

In [2]:
train_set = pd.read_csv("data/trainingData.csv")
test_set = pd.read_csv("data/validationData.csv")

# Evaluate Data

In [3]:
print(pd.isnull(train_set).values.any())
print(pd.isnull(test_set).values.any())

False
False


# Preprocess Data and Feature Engineering

Training/validation Set

In [4]:
train_set.iloc[:, 0:520].min().min()
train_set_P = train_set.copy()
train_set_P.iloc[:, 0:520] = np.where(train_set_P.iloc[:, 0:520] <= 0, train_set_P.iloc[:, 0:520] + 105, train_set_P.iloc[:, 0:520] - 100) 

In [5]:
combined = pd.concat([train_set_P, test_set])
combined = combined.assign(UNIQUELOCATION = (combined['LONGITUDE'].astype(str) + '_' + combined['LATITUDE'].astype(str) + '_' + combined['FLOOR'].astype(str) + '_' + combined['BUILDINGID'].astype(str)).astype('category').cat.codes)
len(combined["UNIQUELOCATION"].unique())

1997

In [6]:
train_set_PU = combined.iloc[0:19937, :]
test_set_U = combined.iloc[19937:21048, :]

In [7]:
train_set_PU["UNIQUELOCATION"] = train_set_PU["UNIQUELOCATION"].astype("category")
train_set_PU.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_PU["UNIQUELOCATION"] = train_set_PU["UNIQUELOCATION"].astype("category")


WAP001                 int64
WAP002                 int64
WAP003                 int64
WAP004                 int64
WAP005                 int64
                      ...   
RELATIVEPOSITION       int64
USERID                 int64
PHONEID                int64
TIMESTAMP              int64
UNIQUELOCATION      category
Length: 530, dtype: object

In [8]:
X_train = train_set_PU.iloc[:, 0:520]
y_train = train_set_PU.iloc[:, 520:530]

Test Set

In [9]:
test_set_PU = test_set_U.copy()
test_set_PU.iloc[:, 0:520] = np.where(test_set_PU.iloc[:, 0:520] <= 0, test_set_PU.iloc[:, 0:520] + 105, test_set_PU.iloc[:, 0:520] - 100) 

In [10]:
test_set_PU["UNIQUELOCATION"] = test_set_PU["UNIQUELOCATION"].astype("category")
test_set_PU.dtypes

WAP001                 int64
WAP002                 int64
WAP003                 int64
WAP004                 int64
WAP005                 int64
                      ...   
RELATIVEPOSITION       int64
USERID                 int64
PHONEID                int64
TIMESTAMP              int64
UNIQUELOCATION      category
Length: 530, dtype: object

In [11]:
X_test = test_set_PU.iloc[:, 0:520]
y_test = test_set_PU.iloc[:, 520:530]

In [12]:
ref_table = pd.concat([y_train.iloc[:, [0,1,2,3,9]], y_test.iloc[:, [0,1,2,3,9]]])
ref_table = ref_table.drop_duplicates()

In [13]:
def save_data(dataframe, filename):
    file_present = glob.glob(filename)
    if not file_present:
        dataframe.to_csv(filename)
    else:
        print('WARNING: This file already exists.')

In [14]:
del train_set, train_set_P, train_set_PU, test_set, test_set_U, test_set_PU, combined

# Train Model(s)

Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0)

from sklearn.model_selection import GridSearchCV
hyperparameters = {'criterion': ['gini'], 
                'max_depth': [None], 
                'max_features': ['sqrt'],
                'n_estimators': [60]}

from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import make_scorer
scoring = {'accuracy': 'accuracy',
            'kappa': make_scorer(cohen_kappa_score)}

grid = GridSearchCV(estimator = classifier,
                    param_grid = hyperparameters,
                    scoring = scoring,
                    cv = 2,
                    refit = 'accuracy',
                    return_train_score = True,
                    n_jobs = -1) 

tic = time.time()
grid_result = grid.fit(X_train, y_train.iloc[:, 9].squeeze())
toc = time.time()
run_time = (toc - tic)/60

In [16]:
cv_results_ = pd.DataFrame.from_dict(grid_result.cv_results_) 
cv_results_.insert(loc = 0, column = 'Model', 
                   value = ['RandomForestClassifier']*cv_results_.shape[0])
cv_results_.insert(loc = 28, column = 'mean train - cross_val accuracy',  # loc = 60 if you use cv=10
                   value = cv_results_['mean_train_accuracy'] - cv_results_['mean_test_accuracy'])
cv_results_.insert(loc = 29, column = 'mean train - cross_val kappa',   # loc = 61 if you use cv=10
                   value = cv_results_['mean_train_kappa'] - cv_results_['mean_test_kappa'])
with open('tuning_rf.csv', 'a') as f:
    cv_results_.to_csv(f, header = True)

In [17]:
grid_result.best_estimator_
grid_result.best_score_
grid_result.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'n_estimators': 60}

In [18]:
y_pred = grid_result.predict(X_test)
np.mean(y_pred == y_test.iloc[:, 9])

0.0054005400540054005

In [19]:
y_test_pos = y_test.iloc[:, 0:2].values 
y_test_floor = y_test.iloc[:, 2].values
y_test_building = y_test.iloc[:, 3].values

dict_loc = {}
m_total = ref_table.shape[0]
for i in range(m_total):
    key = int(ref_table.iloc[i]['UNIQUELOCATION'])
    value = ref_table.iloc[i, 0:4].values
    dict_loc[key] = value

y_pred_pos = np.asarray([dict_loc[i] for i in y_pred])[:, 0:2] 
y_pred_floor = np.asarray([dict_loc[i] for i in y_pred])[:, 2]
y_pred_building = np.asarray([dict_loc[i] for i in y_pred])[:, 3]

In [20]:
def euclidean(y_test_pos, y_pred_pos):
    m_test = y_test_pos.shape[0]
    D_error = np.sum((y_test_pos - y_pred_pos)**2, axis = 1)**0.5
    return D_error

In [21]:
D_error = euclidean(y_test_pos, y_pred_pos)
sorted_D_error = sorted(D_error)

m_test = y_test.shape[0]
mean_error = np.mean(D_error) 
percentile_25th = sorted_D_error[math.ceil(m_test*0.25) - 1] 
percentile_50th = sorted_D_error[math.ceil(m_test*0.50) - 1] 
percentile_75th = sorted_D_error[math.ceil(m_test*0.75) - 1] 
percentile_95th = sorted_D_error[math.ceil(m_test*0.95) - 1] 
percentile_100th = sorted_D_error[math.ceil(m_test*1.00) - 1] 
building_hitrate = np.mean(y_test_building == y_pred_building)
floor_hitrate = np.mean(y_test_floor == y_pred_floor)

In [22]:
mean_error = np.mean(D_error)
print(mean_error)

8.578918235862663


<h1>Decision Tree</h1>

In [23]:
from sklearn.tree import DecisionTreeClassifier
neural = DecisionTreeClassifier(random_state=0)
scoring = {'accuracy': 'accuracy', 'kappa': make_scorer(cohen_kappa_score)}

tic = time.time()
neural.fit(X_train, y_train.iloc[:, 9].squeeze())
toc = time.time()
run_time = (toc - tic)/60
print(run_time)

0.09467534224192302


In [24]:
y_pred = neural.predict(X_test)
np.mean(y_pred == y_test.iloc[:, 9])

0.0027002700270027003

In [25]:
y_test_pos = y_test.iloc[:, 0:2].values
y_test_floor = y_test.iloc[:, 2].values
y_test_building = y_test.iloc[:, 3].values

dict_loc = {}
m_total = ref_table.shape[0]
for i in range(m_total):
    key = int(ref_table.iloc[i]['UNIQUELOCATION'])
    value = ref_table.iloc[i, 0:4].values
    dict_loc[key] = value

y_pred_pos = np.asarray([dict_loc[i] for i in y_pred])[:, 0:2]
y_pred_floor = np.asarray([dict_loc[i] for i in y_pred])[:, 2]
y_pred_building = np.asarray([dict_loc[i] for i in y_pred])[:, 3]

In [26]:
D_error = euclidean(y_test_pos, y_pred_pos)
sorted_D_error = sorted(D_error)

m_test = y_test.shape[0]
mean_error = np.mean(D_error) 
percentile_25th = sorted_D_error[math.ceil(m_test*0.25) - 1] 
percentile_50th = sorted_D_error[math.ceil(m_test*0.50) - 1] 
percentile_75th = sorted_D_error[math.ceil(m_test*0.75) - 1] 
percentile_95th = sorted_D_error[math.ceil(m_test*0.95) - 1] 
percentile_100th = sorted_D_error[math.ceil(m_test*1.00) - 1] 
building_hitrate = np.mean(y_test_building == y_pred_building)
floor_hitrate = np.mean(y_test_floor == y_pred_floor)

In [27]:
mean_error = np.mean(D_error)
print(mean_error)

13.840359196110223


<h1>kNN</h1>

In [28]:
from sklearn.neighbors import KNeighborsClassifier
neural = KNeighborsClassifier(n_neighbors=5, weights="distance")
scoring = {'accuracy': 'accuracy', 'kappa': make_scorer(cohen_kappa_score)}

tic = time.time()
neural.fit(X_train, y_train.iloc[:, 9].squeeze())
toc = time.time()
run_time = (toc - tic)/60
print(run_time)

0.00030085245768229164


In [29]:
y_pred = neural.predict(X_test)
np.mean(y_pred == y_test.iloc[:, 9])

0.0054005400540054005

In [30]:
y_test_pos = y_test.iloc[:, 0:2].values
y_test_floor = y_test.iloc[:, 2].values
y_test_building = y_test.iloc[:, 3].values

dict_loc = {}
m_total = ref_table.shape[0]
for i in range(m_total):
    key = int(ref_table.iloc[i]['UNIQUELOCATION'])
    value = ref_table.iloc[i, 0:4].values
    dict_loc[key] = value

y_pred_pos = np.asarray([dict_loc[i] for i in y_pred])[:, 0:2]
y_pred_floor = np.asarray([dict_loc[i] for i in y_pred])[:, 2]
y_pred_building = np.asarray([dict_loc[i] for i in y_pred])[:, 3]


In [31]:
D_error = euclidean(y_test_pos, y_pred_pos)
sorted_D_error = sorted(D_error)

m_test = y_test.shape[0]
mean_error = np.mean(D_error)
percentile_25th = sorted_D_error[math.ceil(m_test*0.25) - 1]
percentile_50th = sorted_D_error[math.ceil(m_test*0.50) - 1]
percentile_75th = sorted_D_error[math.ceil(m_test*0.75) - 1]
percentile_95th = sorted_D_error[math.ceil(m_test*0.95) - 1]
percentile_100th = sorted_D_error[math.ceil(m_test*1.00) - 1]
building_hitrate = np.mean(y_test_building == y_pred_building)
floor_hitrate = np.mean(y_test_floor == y_pred_floor)

In [32]:
mean_error = np.mean(D_error)
print(mean_error)

9.95949930276953


<h1>Gaussian NB</h1>

In [33]:
from sklearn.naive_bayes import GaussianNB
neural = GaussianNB()
scoring = {'accuracy': 'accuracy', 'kappa': make_scorer(cohen_kappa_score)}

tic = time.time()
neural.fit(X_train, y_train.iloc[:, 9].squeeze())
toc = time.time()
run_time = (toc - tic)/60
print(run_time)

0.017563374837239583


In [34]:
y_pred = neural.predict(X_test)
np.mean(y_pred == y_test.iloc[:, 9])

0.004500450045004501

In [35]:
y_test_pos = y_test.iloc[:, 0:2].values
y_test_floor = y_test.iloc[:, 2].values
y_test_building = y_test.iloc[:, 3].values

dict_loc = {}
m_total = ref_table.shape[0]
for i in range(m_total):
    key = int(ref_table.iloc[i]['UNIQUELOCATION'])
    value = ref_table.iloc[i, 0:4].values
    dict_loc[key] = value

y_pred_pos = np.asarray([dict_loc[i] for i in y_pred])[:, 0:2]
y_pred_floor = np.asarray([dict_loc[i] for i in y_pred])[:, 2]
y_pred_building = np.asarray([dict_loc[i] for i in y_pred])[:, 3]

In [36]:
D_error = euclidean(y_test_pos, y_pred_pos)
sorted_D_error = sorted(D_error)

m_test = y_test.shape[0]
mean_error = np.mean(D_error)
percentile_25th = sorted_D_error[math.ceil(m_test*0.25) - 1]
percentile_50th = sorted_D_error[math.ceil(m_test*0.50) - 1]
percentile_75th = sorted_D_error[math.ceil(m_test*0.75) - 1]
percentile_95th = sorted_D_error[math.ceil(m_test*0.95) - 1]
percentile_100th = sorted_D_error[math.ceil(m_test*1.00) - 1]
building_hitrate = np.mean(y_test_building == y_pred_building)
floor_hitrate = np.mean(y_test_floor == y_pred_floor)

In [37]:
mean_error = np.mean(D_error)
print(mean_error)

13.064457035409692
