# 1_PRREDICT BUILDING

# Import libraries

In [1]:
# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Classification Models
from sklearn.svm import LinearSVC
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# tools
from sklearn.model_selection import train_test_split

# classification model metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Hyper parameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# To save the ML models
import joblib

# Functions

In [2]:
def save_model (model, file_name):
    try:
        path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/MLmodels/'
 
        # Save the model as a pickle in a file
        joblib.dump(model, path + file_name)
        print('Model Saved')
    except:
        print('Model NOT!! Saved')

In [3]:
def load_model (file_name):
    path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/MLmodels/'
 
    # Load the model from the file
    model_from_joblib = joblib.load(path + file_name)
    
    return model_from_joblib

# Import Data

In [4]:
path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/Data/'
 
file = "0_DataPrepro.csv"

Tdata = pd.read_csv(path + file)

Tdata.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,100,100,-97,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095
3,100,100,100,100,100,100,100,100,100,100,...,100,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807
4,100,100,100,100,100,100,100,100,100,100,...,100,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710


# Cascade Model

Also remember the cascade model thing. Predict first one thing, then another, then another etc.. in every iteration we add the features we want to predict. 

Steps
1. remove uselsss features
2. dummify
3. sample to iterate, but just to trial error 
4. divide train/test
5. apply the models
6. performace metrics

# 1. Predict Building

start by predicting the building beacuse it's the easiest one, just 3 possibilities: 0, 1, 2. 

- Predictors: all the WAP Features
- Predicted: BUILDINGid

## 1.1 Keep Usefull features

In [5]:
# make a copy of the dataset, keeping only the predictor/predicted features
Tdata_Building = Tdata.drop(columns = ['LONGITUDE', 'LATITUDE', 'FLOOR', 'SPACEID','RELATIVEPOSITION', 'USERID', 'PHONEID', 'TIMESTAMP'])

Tdata_Building.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,BUILDINGID
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,1
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,1
2,100,100,100,100,100,100,100,-97,100,100,...,100,100,100,100,100,100,100,100,100,1
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,1
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,0


## 1.2 Dummify

In this case it's not needed, because i'm using all numerical variables to predict a categorical variable. 

## 1.3 Sample Data

As there is a lot of data, is a good idea to fail faster with a reduced proportion of the data

In [6]:
sample =  Tdata_Building

In [7]:
Tdata_Building = sample.sample(frac =.45, random_state= 200)

print("Len of the DF :" + str(len(Tdata_Building)))

#Tdata_Building.head()

Len of the DF :8972


## 1.4 X/y Split

In [8]:
X_building = Tdata_Building.iloc[:,0:520]
y_building = Tdata_Building.iloc[:,520:521]

X_building.head() # check predictors

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
998,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2013,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1665,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
6188,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,-83,-85,100,100,100
2671,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


In [9]:
y_building.head() # check predicted

Unnamed: 0,BUILDINGID
998,1
2013,2
1665,2
6188,2
2671,2


## 1.5 Multicolinearity

For reference, the goal of regression is to isolate the relationship between each independent variable and the dependent variable. Multicollinearity weakens the statistical power of your model, thus leaving you unable to trust the p-values identifying which independent variables are statistically significant. In summary, multicollinearity won’t let you know the true effect of each variable.

### Calculate the Variance Inflation Factor (VIF).
VIF measures the collinearity among independent variables within a regression model. Then remove the multicorrelated features. 

https://kaiserm.medium.com/how-to-tackle-multicollinearity-79afe58e9479

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Compute VIF
vif = pd.DataFrame()
vif["variables"] = X_building.columns
vif["VIF"] = [variance_inflation_factor(X_building.values, i) for i in range(X_building.shape[1])]

vif


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,variables,VIF
0,WAP001,1.238410
1,WAP002,1.593780
2,WAP003,0.000000
3,WAP004,0.000000
4,WAP005,1.471728
...,...,...
515,WAP516,3.302083
516,WAP517,2.455898
517,WAP518,2.079820
518,WAP519,1.134106


### create a list with the features that exceeds the VIF treshold value
The [documentation](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html) states that an independent variable is highly collinear with other independent variables when VIF > 5, and the parameter estimates will have large standard errors because of this.

In [11]:
count = 0
VIF_blackList = []

correlated_columns = []
for i in range(len(vif)):
    if vif.loc[i,"VIF"] >= 5:
        VIF_blackList.append(vif.loc[i,'variables'])
#        print(vif.loc[i,'variables'],vif.loc[i,'VIF'] )
        count += 1
print("Wap's with a VIF >= 5  -> " + str(count))

# uncomment to see the list of features to remove
#VIF_blackList

Wap's with a VIF >= 5  -> 85


### remove from the VIF_blackList

In [12]:
X_building = X_building.drop(columns = VIF_blackList)
X_building.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP509,WAP510,WAP511,WAP512,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
998,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2013,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1665,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
6188,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,-83,-85,100,100,100
2671,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


# Save TRAIN PreProcessed Data

In [13]:
path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/Data/'
file_X = "1_DataPrepro_Building_X.csv"
file_y = "1_DataPrepro_Building_y.csv"

X_building.to_csv(path + file_X, index = False)
y_building.to_csv(path + file_y, index = False)

# Preprocess VALIDATION Data

In [14]:
# Load File
path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/Data/UJIndoorLoc/'
file = 'TestData.csv'

Val_data = pd.read_csv(path + file)

# make a copy of the dataset, keeping only the predictor/predicted features
Vdata_Building = Val_data.drop(columns = ['LONGITUDE', 'LATITUDE', 'FLOOR', 'SPACEID','RELATIVEPOSITION', 'USERID', 'PHONEID', 'TIMESTAMP'])

# X/y split   
X_Vd_building = Vdata_Building.iloc[:,0:520] 
y_Vd_building = Vdata_Building.iloc[:,520:521]

# same process than the training set. Must drop the same features from the VIFblacklist
X_Vd_building = X_Vd_building.drop(columns = VIF_blackList)

# no need to dummify

#print(y_Vd_building.head())
#X_Vd_building.head()

In [15]:
# save files
path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/Data/'
file_X = "1_Val_DataPrepro_Building_X.csv"
file_y = "1_Val_DataPrepro_Building_y.csv"

X_Vd_building.to_csv(path + file_X, index = False)
y_Vd_building.to_csv(path + file_y, index = False)

# Load TRAIN PreProcessesed Data

In [16]:
path = '/home/ale/Dropbox/UBIQUM/4.DeepAnalytics&Visualization/T4M3.WiFiLocationing/Data/'
file_X = "1_DataPrepro_Building_X.csv"
file_y = "1_DataPrepro_Building_y.csv"

Saved_X = pd.read_csv(path + file_X)
Saved_y = pd.read_csv(path + file_y)

print(Saved_y.head())
Saved_X.head()



   BUILDINGID
0           1
1           2
2           2
3           2
4           2


Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP509,WAP510,WAP511,WAP512,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,-83,-85,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


## 1.6 Train/Test Split

In [17]:
# define X and y
X = Saved_X
y = Saved_y

# define train/test_ratio
train_ratio = 0.75
test_ratio = 0.25

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1-train_ratio) )

# check dimensionsc
print('DF: ', Tdata_Building.shape)
print('X:', X.shape,'y:', y.shape)
print('X_train:', X_train.shape,'y_train:', y_train.shape)
print('X_test: ', X_test.shape,' y_test: ', y_test.shape)

DF:  (8972, 521)
X: (8972, 435) y: (8972, 1)
X_train: (6729, 435) y_train: (6729, 1)
X_test:  (2243, 435)  y_test:  (2243, 1)


## 1.6 Models

### 1.6.1 k-NN Classifier

also check radiusKnn, says it's better for large datasets https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html#sklearn.neighbors.RadiusNeighborsClassifier

In [18]:
# define an empty dict to store the performance of different values of K
knn_performance = {}

for k in range(1,10):
    knn = KNeighborsClassifier(k, weights='distance')
    # train
    knn.fit(X_train, y_train.values.ravel())
    #get predictions
    knn_pred = knn.predict(X_test)
    # calculate the performance
    knn_accuracy = round(accuracy_score(y_test, knn_pred), 4)
    # store the performance in a dict
    knn_performance[k] = knn_accuracy

    
knn_performance_sorted = sorted(knn_performance.items(), key=lambda item:item[1], reverse=True)

# shows the full list sorted
#for i in knn_performance_sorted: print("k = " + str(i[0]) + " , accuracy: " + str(i[1]))

# shows only the best result
print("k = " + str(knn_performance_sorted[0][0]) + " , accuracy (on test set): " + str(knn_performance_sorted[0][1]))

k = 1 , accuracy (on test set): 0.9947


### save k-NN
In machine learning, while working with scikit learn library, we need to save the trained models in a file and restore them in order to reuse it to compare the model with other models, to test the model on a new data. The saving of data is called **Serialization**, while restoring the data is called **Deserialization**.
Also, we deal with different types and sizes of data. Some datasets are easily trained i.e- they take less time to train but the datasets whose size is large (more than 1GB) can take very large time to train on a local machine even with GPU. When we need the same trained data in some different project or later sometime, to avoid the wastage of the training time, store trained model so that it can be used anytime in the future. 
There are two ways we can save a model in scikit learn:
 
**Pickled** model as a file using joblib: Joblib is the replacement of pickle as it is more efficient on objects that carry large numpy arrays. These functions also accept file-like object instead of filenames.
     

- joblib.dump to serialize an object hierarchy 
- joblib.load to deserialize a data stream

In [19]:
file = "Building_KNN.joblib"
model = knn

save_model(model, file)

Model Saved


### Load k-NN

In [20]:
file = 'Building_KNN.joblib'

KNN_from_joblib = load_model(file)

# Use the loaded model to make predictions
KNN_predictions = KNN_from_joblib.predict(X_test)

print(accuracy_score(y_test, KNN_predictions))

0.990191707534552


### 1.6.2 Random Forest Classifier

In [21]:
RF = RandomForestClassifier()

# print the full list of parametrizable parameters
#RF.get_params(deep=True)

In [22]:
#Setup the parameters and distributions to sample from: param_dist
cv = 10
n_jobs = -1
param_dist = {#'bootstrap': True, 
              #'ccp_alpha': 0.0,
              #'criterion': 'mse',
              'n_estimators': [10,12,20], # numers of trees
              'max_features': ['auto'],# 'sqrt', 'log2'], # max number of features to consider at every split
              'max_depth':[30, 60, 90], # max number of levels in tree
              'min_samples_split':[2,4],  # min number of samples required to split a node
              'min_samples_leaf': [2,4]  # min number of samples required at each leaf node
              #'max_leaf_nodes': None,
              #'max_samples': None,
              #'min_impurity_decrease': 0.0, 
              #'min_impurity_split': None, 
              #'min_weight_fraction_leaf': 0.0,           
              #'n_jobs': None, 
              #'oob_score': False, 
              #'random_state': None,
              #'verbose': 0, 
              #'warm_start': False
             }

# instantiate the RandomizedSearchCV object: model_cv
#RF_RSCV = RandomizedSearchCV(RF, param_dist, cv=cv, n_jobs = n_jobs)
RF_GSCV = GridSearchCV(RF, param_dist, cv=cv, n_jobs = n_jobs)

In [23]:
# Fit it to the data
RF_GSCV.fit(X_train,y_train.values.ravel())

# Print the tuned parameters and score
print(" - Grid Search CV - ")
print("Tuned Decision Tree Parameters: {}".format(RF_GSCV.best_params_))
print("Tuned Decision Tree Best Score: {}".format(RF_GSCV.best_score_))

 - Grid Search CV - 
Tuned Decision Tree Parameters: {'max_depth': 90, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 20}
Tuned Decision Tree Best Score: 0.9974733336871152


In [24]:
# Make Predictions
RF_predictions = RF_GSCV.predict(X_test)

# Evaluate Predictions
print("Accuracy LR: {}".format(accuracy_score(y_test, RF_predictions)))
print(confusion_matrix(y_test, RF_predictions))
print(classification_report(y_test, RF_predictions))

Accuracy LR: 0.9950958537672759
[[ 606    0    0]
 [   1  582    1]
 [   0    9 1044]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       606
           1       0.98      1.00      0.99       584
           2       1.00      0.99      1.00      1053

    accuracy                           1.00      2243
   macro avg       0.99      1.00      1.00      2243
weighted avg       1.00      1.00      1.00      2243



### Save RF

Joblib is an efficient object operator that carry large numpy arrays. These functions also accept file-like object instead of filenames.
     

    joblib.dump to serialize an object hierarchy 
    joblib.load to deserialize a data stream

In [25]:
file = 'Building_RF.joblib'
model = RF_GSCV

save_model(model, file)

Model Saved


### Load k-NN

In [26]:
file = 'Building_RF.joblib'

# Load the model from the file
RF_from_joblib = load_model(file)

# Use the loaded model to make predictions
RF_predictions =  RF_from_joblib.predict(X_test)

print(accuracy_score(y_test, RF_predictions))

0.9950958537672759


### 1.6.3 Log Regression

In [27]:
model = LogisticRegression()

# print the full list of parametrizable parameters
# LogReg.get_params(deep=True)

#Setup the parameters and distributions to sample from: param_dist
cv = 10
n_jobs = -1 # uses all the processors
param_dist_LR = {#'C': 1.0,
              #'class_weight': None,
              #'dual': False,
              #'fit_intercept': True,
              #'intercept_scaling': 1,
              #'l1_ratio': None,
              'max_iter' : [600, 800, 900],
              #'multi_class': 'auto',
              #'n_jobs': n_jobs,
              'penalty': ['none'], #'elasticnet', 'l1', 'l2', 
              #'random_state': random_state,
              'solver': ['saga'], # 'sag', 
              #'tol': 0.0001,
              #'verbose': 0,
              'warm_start': ["False"] # "True", 
}

#LR_RSCV = RandomizedSearchCV(model, param_dist_LR, cv=cv, n_jobs=n_jobs)
LR_GSCV = RandomizedSearchCV(model, param_dist_LR, cv=cv, n_jobs=n_jobs)

In [28]:
# Fit it to the data
LR_GSCV.fit(X_train,y_train.values.ravel())

# Print the tuned parameters and score
print("Tuned Decision LogReg Parameters: {}".format(LR_GSCV.best_params_))



Tuned Decision LogReg Parameters: {'warm_start': 'False', 'solver': 'saga', 'penalty': 'none', 'max_iter': 600}




In [29]:
# Make Predictions
LR_predictions = LR_GSCV.predict(X_test)

# Evaluate Predictions
print("Accuracy LR: {}".format(accuracy_score(y_test, LR_predictions)))
print(confusion_matrix(y_test, LR_predictions))
print(classification_report(y_test, LR_predictions))

Accuracy LR: 0.9959875167186804
[[ 606    0    0]
 [   0  584    0]
 [   0    9 1044]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       606
           1       0.98      1.00      0.99       584
           2       1.00      0.99      1.00      1053

    accuracy                           1.00      2243
   macro avg       0.99      1.00      1.00      2243
weighted avg       1.00      1.00      1.00      2243



### save LR

Joblib is an efficient object operator that carry large numpy arrays. These functions also accept file-like object instead of filenames.
     

    joblib.dump to serialize an object hierarchy 
    joblib.load to deserialize a data stream

In [30]:
file = 'Building_LR.joblib'
model = LR_GSCV

save_model(model, file)

Model Saved


### Load k-NN

In [31]:
file = 'Building_LR.joblib'

#Load the model from the file
LR_from_joblib = load_model(file)

# Use the loaded model to make predictions
LR_predictions = LR_from_joblib.predict(X_test)

print(accuracy_score(y_test, LR_predictions))


0.9959875167186804
