In [1]:
#Load libraries
import pandas as pd
import numpy as np
from numpy import linalg as LA
from scipy import stats # ANOVA
import re
import matplotlib.pyplot as plt
from PIL import Image
from sklearn import *
from sklearn import svm
from sklearn.model_selection import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import time


# 1.1 Load Boston Housing Dataset
Note- The data is numeric; I printed X and there is no non-numeric data in it; it's also free of MEDV

In [2]:
# Load the Boston housing bunch bundled with scikit-learn.
# NOTE(review): load_boston only exists in older scikit-learn releases -- confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

# Feature matrix and regression target, unpacked from the bunch in one step.
X, y = boston.data, boston.target

# 1.2 Methods to predict MEDV 
## Cover method name, str, weakness, and hyperparams
## Note- methods like SVM are primarily meant for classifying data rather than predicting continuous values. Also, LogisticRegression is ONLY for classifying data in sklearn; LinearRegression has no hyperparameters that can be tuned in sklearn

.

### Linear Regression
Models an output based on a linear combination of input data.
#### Advantages
* Works in high dimensional data (many columns)
* Works fast, and has results/params that are easy to understand
#### Disadvantages
* Is sensitive to outliers
* Non-linear relationships require feature-engineering to fix
#### Hyperparameter
* (for logistic) C for regularization strength- lower C = more underfitting
* (for logistic) method- what scalar algorithm to use
* (theoretical; sklearn doesn't allow tuning this for prediction data)- tolerance/ease of it- how much the algorithm will weigh points near/across the regression line

.

### kNN
For a given test point, it looks at which points in the training set are the closest match to it and assigns a target based on the targets of those closest training points.
#### Advantages
* It's fast to set up; no training cost (instance based learning)
* Easy to understand results
* Flexible; can deal with overfitting by changing K
#### Disadvantages
* Does not work in high dimensional spaces
#### Hyperparameter
* k- The number of neighbours to consider. For instance- if k=1- only the nearest point is considered; if k=5- the 5 nearest neighbours are considered and the prediction is the class that appears most often among them (for regression, the average of their targets).

.

### Decision Trees
Splits the data into categories based on a feature value to try and maximize information gain.
#### Advantages
* easy to understand results
* uses decision making; can become arbitrary complexity
* can be made compact through pruning
#### Disadvantages
* sometimes hard to interpret
#### Hyperparameter
* (for classification) criterion- function used to measure the quality of a split
* max_features- number of features to consider when looking for the best split

.

### Regularization (Ridge/Lasso)
Adding/Subtracting features and weighing them on how well they help predict the target feature. Assign weights/drop feature based on how well they help predict.
#### Advantages
* Lasso removes variables that have little to no correlation/impact on our outcome variable; whereas Ridge shrinks their coefficients close to zero- both help the model scale across high dimensional data by favouring features that help predict the target feature.
* Lasso can remove features, which reduces our model complexity and computational cost
#### Disadvantages
* Models sometimes hard to interpret
#### Hyperparameter
* Alpha- the regularization strength; it multiplies the penalty term (L1 for Lasso, L2 for Ridge). A larger alpha shrinks the coefficients more strongly, and alpha=0 means no regularization.

.

## SVM can work (it has a variant for regression and predicting continuous data), but it's better suited to classification of categorical data than to regression on continuous data and will hence not be considered for the sake of this exercise.

### SVM
#### Advantages
* Is more flexible than Logistic/Linear Regression
* Kernels engineer features automatically
* Describes complex decision boundaries well
* Only cases that violate the margin have influence- which can be adjusted by budgeting
* Works well in non-separable cases unlike classifiers that try and maximize the margins
#### Disadvantages
* Doesn't work very well in high dimensional spaces
* It's still inherently a linear classifier
#### Hyperparameter
* Kernels- What SVM algorithm to use and classify the data



# 1.3 Predict MEDV

In [3]:
# Xn: standardised copy of X, produced by a StandardScaler fitted on X itself.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
Xn = scaler.transform(X)

In [4]:
#r2 taken from https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
#df sorting https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

# Parallel result lists: position j in every list describes the same experiment.
model = []       # model name
param = []       # hyper-parameter setting used for that model
normalized = []  # 1 if the standardised features (Xn) were used, 0 for the raw X
r2 = []          # mean r2 over the CV folds
mse = []         # mean neg_mean_squared_error over the CV folds (converted to RMSE below)

xArr = [X, Xn]
xArrPrint = [0, 1]

CV_FOLDS = 5
CV_SCORING = ['r2', 'neg_mean_squared_error']
TREE_SEED = 1  # fixed seed so the randomised decision-tree splits are reproducible

# NOTE(review): the grid starts at alpha=0; the saved output of this cell shows
# sklearn warnings raised during fitting -- presumably from the alpha=0 Lasso
# runs. TODO confirm, and consider starting the grid above zero.
ALPHA_GRID = list(np.round(np.linspace(0, 1, 6), 6))


def _candidate_models():
    """Yield (model name, parameter label, unfitted estimator) in evaluation order.

    A fresh estimator is built on every call, so no fitted state is shared
    between the raw and the standardised feature sets.
    """
    for iAlpha in ALPHA_GRID:
        yield "Lasso", "alpha:" + str(np.round(iAlpha, 5)), Lasso(alpha=iAlpha)
    for iAlpha in ALPHA_GRID:
        yield "Ridge", "alpha:" + str(np.round(iAlpha, 5)), Ridge(alpha=iAlpha)
    for iKnn in range(1, 10):
        yield "kNN", "# of Neighbours: " + str(iKnn), KNeighborsRegressor(n_neighbors=iKnn)
    for iFeature in ['sqrt', 'log2', 'auto']:
        yield ("Decision Tree", "Feature: " + iFeature,
               DecisionTreeRegressor(max_features=iFeature, random_state=TREE_SEED))
    # The estimator is LinearRegression, so it is labelled as such
    # (it was previously recorded under the name "LogisticRegression").
    yield "LinearRegression", "N/A", LinearRegression()


# For raw and standardised features, evaluate every candidate with 5-fold CV
# and record the fold-averaged scores.
for i in range(len(xArr)):
    for name, label, estimator in _candidate_models():
        # A single cross-validation pass scores both metrics on the same fitted
        # models, instead of refitting every fold once per metric.
        cv_result = cross_validate(estimator, X=xArr[i], y=y, scoring=CV_SCORING, cv=CV_FOLDS)
        r2.append(np.mean(cv_result['test_r2']))
        mse.append(np.mean(cv_result['test_neg_mean_squared_error']))
        normalized.append(xArrPrint[i])
        param.append(label)
        model.append(name)

# The scorer returns negated MSE: flip the sign and take the root to obtain RMSE.
mse = [ np.sqrt(-x) for x in mse]
dfBostonModels = pd.DataFrame({'classifier':model, 'parameter':param, 'normalized':normalized, 'r2':r2, 'rmse':mse})

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_

# 1.4 Summary of results

In [5]:
# Rank all experiments by mean cross-validated r2, best first.
# (metrics.SCORERS.keys() was used to look up the available scoring names.)
dfBostonModels.sort_values(by='r2', ascending=False)

Unnamed: 0,classifier,parameter,normalized,r2,rmse
45,kNN,# of Neighbours: 9,1,0.540138,5.323236
44,kNN,# of Neighbours: 8,1,0.527135,5.370247
43,kNN,# of Neighbours: 7,1,0.511286,5.402913
42,kNN,# of Neighbours: 6,1,0.50305,5.425458
41,kNN,# of Neighbours: 5,1,0.493679,5.436011
40,kNN,# of Neighbours: 4,1,0.491846,5.492649
39,kNN,# of Neighbours: 3,1,0.457022,5.604815
4,Lasso,alpha:0.8,0,0.434296,5.932281
5,Lasso,alpha:1.0,0,0.431849,5.960837
3,Lasso,alpha:0.6,0,0.43136,5.910723


# 1.5 Discussion
* r-squared, or r2 measures the proportion of variance that is explained by the independent variables; with higher being better. It can be interpreted as how close the data is to the fitted model.

* rmse is the root mean square error; it's the square root of the mean of the squared differences between predicted and actual values for a model. Lower is better- with 0 being no difference between predicted and actual values (perfect prediction)

The kNN model with normalized data dominated- having the highest r-squared and lowest rmse value among all the models. If I had to take a blind guess as to why it performed the best- it'd probably be because housing generally follows some trends- for instance; housing that's nearer a lake or water body is generally more expensive than those that are landlocked; those with higher crime rates generally have lower prices than those that are not; or those that are newer are more expensive because of newer features like broader roads and updated utilities. Given this, a newly established town that is close to a river is likely to have housing values similar or close to those that are also newly established and close to a river. Given that similar neighbourhoods will probably have similar prices, kNN is likely to perform well here.

kNN performs well because of the nature of this dataset- in that properties that share features are likely to be priced similarly. It's also likely given the dataset had many (13) features, some of the other models like decision trees and regression overfitted on the data which caused them to perform badly.

To try and explain why normalized data performed really well for kNN, I found this explanation on Cross Validated (Stack Exchange) that really hit the mark-
https://stats.stackexchange.com/questions/287425/why-do-you-need-to-scale-data-in-knn

> Suppose you had a dataset (m "examples" by n "features") and all but one feature dimension had values strictly between 0 and 1, while a single feature dimension had values that range from -1000000 to 1000000. When taking the euclidean distance between pairs of "examples", the values of the feature dimensions that range between 0 and 1 may become uninformative and the algorithm would essentially rely on the single dimension whose values are substantially larger.

# [[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[
# [[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[
# [[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[
# [[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[
# [[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[

In [6]:
# Read in the 20 newsgroups corpus
# Reference: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
from sklearn.datasets import fetch_20newsgroups

# subset='train' is the library default, spelled out here so it is visible
# that only the training portion of the corpus is loaded.
data = fetch_20newsgroups(subset='train')

# 2.1 Load dataset; split into train/test/validation
## Note- Imported data from sklearn; so results may be different.

In [7]:
# Documents and their newsgroup labels.
# X and y are rebound here; they previously held the Boston housing arrays.
#https://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_logistic_regression_20newsgroups.html
X, y = data.data, data.target

#https://github.com/javedsha/text-classification/blob/master/Text%2BClassification%2Busing%2Bpython%2C%2Bscikit%2Band%2Bnltk.ipynb
# Bag-of-words representation: per-document token counts.
# NOTE(review): the vectoriser and the TF-IDF transformer are fitted on every
# document before any train/test split is made, so statistics from the
# held-out documents feed into the features -- confirm this is acceptable.
count_vect = CountVectorizer()
X_c = count_vect.fit_transform(X)

# TF-IDF representation derived from those counts.
tfidf_transformer = TfidfTransformer()
X_t = tfidf_transformer.fit_transform(X_c)

# 2.2 Convert text into BOW; find best k for kNN through CV

In [13]:
# Bag-of-words counts: hold out a stratified 20% test split, then compare
# k = 1..9 with 5-fold cross-validation on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X_c, y, test_size=0.2, stratify=y, random_state=1
)

print("")
print("kNN parameters comparison:")
for k in range(1, 10):
    tic = time.time()
    fold_acc = cross_val_score(
        KNeighborsClassifier(n_neighbors=k, metric='cosine'), X_train, y_train, cv=5
    )
    # Wall-clock seconds for the whole 5-fold run at this k.
    elapsed = str(round(time.time() - tic, 2))
    mean_acc = np.round(np.mean(fold_acc), 2)
    print("    k:", k, "Time:", elapsed, "Acc:", mean_acc)


kNN parameters comparison:
    k: 1 Time: 5.24 Acc: 0.64
    k: 2 Time: 5.2 Acc: 0.6
    k: 3 Time: 5.33 Acc: 0.58
    k: 4 Time: 5.57 Acc: 0.56
    k: 5 Time: 5.6 Acc: 0.54
    k: 6 Time: 6.01 Acc: 0.53
    k: 7 Time: 6.85 Acc: 0.53
    k: 8 Time: 6.56 Acc: 0.52
    k: 9 Time: 6.31 Acc: 0.51


In [None]:
#neighbors.VALID_METRICS['brute']

# 2.3 Repeat 2.2 with TF-IDF

In [14]:
# TF-IDF features: same stratified 80/20 split (same seed), then compare
# k = 1..9 with 5-fold cross-validation on the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_t, y, test_size=0.2, stratify=y, random_state=1
)

print("")
print("kNN parameters comparison:")
for k in range(1, 10):
    tic = time.time()
    fold_acc = cross_val_score(
        KNeighborsClassifier(n_neighbors=k, metric='cosine'), X_train, y_train, cv=5
    )
    # Wall-clock seconds for the whole 5-fold run at this k.
    elapsed = str(round(time.time() - tic, 2))
    mean_acc = np.round(np.mean(fold_acc), 2)
    print("    k:", k, "Time:", elapsed, "Acc:", mean_acc)


kNN parameters comparison:
    k: 1 Time: 5.0 Acc: 0.8
    k: 2 Time: 5.04 Acc: 0.77
    k: 3 Time: 5.14 Acc: 0.77
    k: 4 Time: 5.3 Acc: 0.76
    k: 5 Time: 5.44 Acc: 0.76
    k: 6 Time: 6.51 Acc: 0.76
    k: 7 Time: 6.49 Acc: 0.75
    k: 8 Time: 6.28 Acc: 0.75
    k: 9 Time: 6.11 Acc: 0.75


# 2.4 Repeat 2.2 with NB; finding best smoothing param

In [15]:
# Sweep the Naive Bayes smoothing parameter over 0.0, 0.1, ..., 1.0 for both
# document representations, using 5-fold CV on the training portion of a
# stratified 80/20 split.
# NOTE(review): the saved output shows a 'setting alpha = ...' warning from
# sklearn, presumably triggered by the alpha=0.0 entry -- the "alpha: 0.0"
# rows may therefore not reflect a true zero. TODO confirm.
smoothing_grid = list(np.round(np.linspace(0, 1, 11), 6))

representations = (
    ("BOW Count NB smoothing comparison:", X_c),
    ("TF-IDF NB smoothing comparison:", X_t),
)

for heading, features in representations:
    # After the loop the split variables hold the TF-IDF split (last entry).
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, stratify=y, random_state=1
    )

    print("")
    print(heading)
    for alpha in smoothing_grid:
        tic = time.time()
        fold_acc = cross_val_score(MultinomialNB(alpha=alpha), X_train, y_train, cv=5)
        elapsed = str(round(time.time() - tic, 2))
        mean_acc = np.round(np.mean(fold_acc), 2)
        print("    alpha:", alpha, "Time:", elapsed, "Acc:", mean_acc)


BOW Count NB smoothing comparison:


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


    alpha: 0.0 Time: 0.68 Acc: 0.87
    alpha: 0.1 Time: 0.71 Acc: 0.87
    alpha: 0.2 Time: 0.63 Acc: 0.87
    alpha: 0.3 Time: 0.73 Acc: 0.86
    alpha: 0.4 Time: 0.73 Acc: 0.85
    alpha: 0.5 Time: 0.73 Acc: 0.85
    alpha: 0.6 Time: 0.77 Acc: 0.84
    alpha: 0.7 Time: 0.74 Acc: 0.83
    alpha: 0.8 Time: 0.76 Acc: 0.83
    alpha: 0.9 Time: 0.76 Acc: 0.82
    alpha: 1.0 Time: 0.74 Acc: 0.82

TF-IDF NB smoothing comparison:


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


    alpha: 0.0 Time: 0.72 Acc: 0.87
    alpha: 0.1 Time: 0.71 Acc: 0.89
    alpha: 0.2 Time: 0.77 Acc: 0.88
    alpha: 0.3 Time: 0.81 Acc: 0.87
    alpha: 0.4 Time: 0.76 Acc: 0.86
    alpha: 0.5 Time: 0.78 Acc: 0.85
    alpha: 0.6 Time: 0.73 Acc: 0.85
    alpha: 0.7 Time: 0.7 Acc: 0.84
    alpha: 0.8 Time: 0.73 Acc: 0.84
    alpha: 0.9 Time: 0.74 Acc: 0.83
    alpha: 1.0 Time: 0.72 Acc: 0.83


# 2.5 Discuss Findings
Naive Bayes took less than a 5th of the time to train and test than kNN did; and had a higher accuracy all round. Based on our training-validation tests; we can argue Naive Bayes better models our data than kNN does given this higher accuracy at a lower computational time.  kNN doesn't perform as well because there are too many words; aka- too many dimensions. Even mostly similar sentences are still really far away because a few words may not match or be contained in another sentence. 

Within Naive Bayes, we had the highest accuracy after converting our BOW to TF-IDF and choosing 0.1 as our smoothing parameter. For the last step, we will use naive bayes with alpha=0.1.

# 2.6 Analyze model on testing data

In [8]:
# Final check: rebuild the stratified 80/20 TF-IDF split (same seed as the
# tuning cells), fit Naive Bayes with alpha=0.1 on the training part and
# report accuracy on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    X_t, y, test_size=0.2, stratify=y, random_state=1
)

nb = MultinomialNB(alpha=0.1)
nb.fit(X=X_train, y=y_train)
prediction = nb.predict(X_test)

# Accuracy = fraction of test documents whose predicted label matches.
test_acc = np.mean(prediction == y_test)
print("    alpha:", 0.1, "Acc:", np.round(test_acc, 2))

    alpha: 0.1 Acc: 0.9
