# Guessing the Number: Linear Regression

In [1]:
# A lambda expression evaluates to an ordinary function object.
print(type(lambda value: value + 1))

<class 'function'>


## » Defining the family of linear models

## » Using more variables

### • importing the boston dataset

In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases.
boston = load_boston()
# Standardize every predictor (scale() defaults to zero mean and unit variance
# per column) so the fitted coefficients can be compared with each other.
x, y = scale(boston.data), boston.target

### • importing LinearRegression Class

In [2]:
from sklearn.linear_model import LinearRegression

# normalize=True asks the estimator to rescale the regressors itself before
# fitting.
# NOTE(review): x was already standardized with scale() when it was built, so
# this flag looks redundant (and it was removed in scikit-learn 1.2) - confirm.
regression = LinearRegression(normalize=True)
# fit() stays the last expression so the notebook echoes the fitted estimator.
regression.fit(X=x, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

### • calculating the model score (R^2)

In [3]:
# score() reports R^2; it is measured here on the same data used for fitting,
# so it describes goodness of fit rather than out-of-sample performance.
r_squared = regression.score(x, y)
print(r_squared)

0.7406426641094095


### • showing the coefficients

In [4]:
# Pair every feature name with its fitted coefficient, rounded to two decimals.
coefficient_labels = [
    name + ':' + str(round(weight, 2))
    for name, weight in zip(boston.feature_names, regression.coef_)
]
print(coefficient_labels)

['CRIM:-0.93', 'ZN:1.08', 'INDUS:0.14', 'CHAS:0.68', 'NOX:-2.06', 'RM:2.67', 'AGE:0.02', 'DIS:-3.1', 'RAD:2.66', 'TAX:-2.08', 'PTRATIO:-2.06', 'B:0.85', 'LSTAT:-3.74']


## » Understanding limitations and problems

# Moving to Logistic Regression

## » Applying logistic regression

### • importing the Iris dataset

In [43]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
# Hold out the final sample so the fitted model can be tried on an entry it
# has never seen.
x, y = iris.data[:-1], iris.target[:-1]

### • Using the LogisticRegression algorithm

In [48]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(x, y)
# The held-out sample is a 1-D vector; reshape it into a single-row 2-D array,
# which is the input shape the estimator's predict methods are given here.
held_out_row = iris.data[-1, :].reshape(1, -1)
# Predicted class label for the held-out sample.
single_row_pred = logistic.predict(held_out_row)
# Per-class probabilities for the same sample.
single_row_pred_prob = logistic.predict_proba(held_out_row)
print(f'Predicted class {single_row_pred}')
print(f'Real class {iris.target[-1]}')
print('Probabilities for each class from 0 to 2:')
print(single_row_pred_prob)

Predicted class [2]
Real class 2
Probabilities for each class from 0 to 2:
[[4.89083792e-04 2.45507030e-01 7.54003886e-01]]


## » Considering when there are more classes

### • Loading data and placing it into variables

In [49]:
from sklearn.datasets import load_digits

digits = load_digits()
# The first 1700 samples form the training set; the remainder is the test set.
train, test = range(1700), range(1700, len(digits.data))
# Features and labels for each partition.
x, y = digits.data[train], digits.target[train]
tx, ty = digits.data[test], digits.target[test]

### • applying two multiclass prediction strategies to the logistic regression algorithm

In [52]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

# Wrap logistic regression in each multiclass strategy. max_iter is raised to
# 1000 because the default iteration limit is not enough for this data.
OVR = OneVsRestClassifier(LogisticRegression(max_iter=1000))
OVO = OneVsOneClassifier(LogisticRegression(max_iter=1000))
# Train both strategies on the training split.
OVR.fit(x, y)
OVO.fit(x, y)
# Compare accuracy on the held-out test split.
print(f'One Vs Rest accuracy: {OVR.score(tx, ty):.3f}')
print(f'One Vs One accuracy: {OVO.score(tx, ty):.3f}')

One Vs Rest accuracy: 0.959
One Vs One accuracy: 0.979
logre accuracy: 0.979


# Making Things as Simple as Naïve Bayes 

## » Finding out that Naïve Bayes isn't so naïve

## » Predicting text classifications

### • Importing 20newsgroups dataset

In [9]:
from sklearn.datasets import fetch_20newsgroups

# Headers, footers and quoted replies are stripped from every post in both
# the training and the test subset.
stripped_parts = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=stripped_parts)
newsgroups_test = fetch_20newsgroups(subset='test', remove=stripped_parts)

### • Importing the two Naïve Bayes models and instantiating them

In [10]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# alpha is the additive smoothing strength: a small positive value ensures
# that rare features never receive a zero probability.
smoothing = 0.01
Bernoulli = BernoulliNB(alpha=smoothing)
Multinomial = MultinomialNB(alpha=smoothing)

### • using hashing trick to prevent encountering new words

In [11]:
import sklearn.feature_extraction.text as txt

# The hashing trick turns raw text into features the Naive Bayes models can
# consume. Both vectorizers drop English stop words and skip normalization;
# they differ only in the `binary` flag.
shared_options = dict(stop_words='english', norm=None)
# binary=False: features for the multinomial model.
multinomial = txt.HashingVectorizer(binary=False, **shared_options)
# binary=True: features for the Bernoulli model.
binary = txt.HashingVectorizer(binary=True, **shared_options)

### • training the two classifiers and testing them

In [12]:
import numpy as np
from sklearn.metrics import accuracy_score

# Separate the training and test posts and their labels.
data = newsgroups_train.data
data_test = newsgroups_test.data
target = newsgroups_train.target
target_test = newsgroups_test.target

# Apply the hashing trick to both subsets.
# np.abs is used on the multinomial features because some hashed values are
# negative, which the multinomial model cannot work with.
multi_x = np.abs(multinomial.transform(data))
multi_xt = np.abs(multinomial.transform(data_test))
bin_x = binary.transform(data)
bin_xt = binary.transform(data_test)

# Fit each model on its own representation of the training posts.
Multinomial.fit(multi_x, target)
Bernoulli.fit(bin_x, target)

# Report the test accuracy of each Naive Bayes model.
# The loop variable is deliberately NOT called `data`: reusing that name would
# overwrite the training posts bound above with a sparse test matrix and leave
# misleading state behind for any later cell.
for name, model, features_test in [('BernoulliNB', Bernoulli, bin_xt),
                                   ('MultinomialNB', Multinomial, multi_xt)]:
    accuracy = accuracy_score(y_true=target_test,
                              y_pred=model.predict(features_test))
    print(f'Accuracy for {name}: {accuracy:.3f}')

Accuracy for BernoulliNB: 0.570
Accuracy for MultinomialNB: 0.651


### • revealing useful text statistics

In [13]:
print(f'Number of posts in training: {len(newsgroups_train.data)}')
# Build a dictionary whose keys are the distinct tokens found in the training
# posts (every value is simply True). Posts are split on single spaces only,
# so tokens separated by other whitespace stay joined.
D = dict.fromkeys(
    (word for post in newsgroups_train.data for word in post.split(' ')),
    True,
)
print(f'Number of distinct words in training {len(D)}')
print(f'Number of post in the test: {len(newsgroups_test.data)}')

Number of posts in training: 11314
Number of distinct words in training 300972
Number of post in the test: 7532


# Learning Lazily with Nearest Neighbors

## » Predicting after observing neighbors

### • using KNN on digit dataset

In [54]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

digits = load_digits()
# The first 1700 samples form the training set; the remainder is the test set.
train = range(0, 1700)
test = range(1700, len(digits.data))

# Use PCA to reduce the dimensionality to 25 components.
# random_state is pinned because, depending on the scikit-learn version, the
# default 'auto' solver policy may pick the randomized SVD for data of this
# shape; without a seed the components (and everything downstream) would not
# be exactly reproducible between runs.
pca = PCA(n_components=25, random_state=0)
# Fit on the training rows only, so the test rows do not influence the projection.
pca.fit(digits.data[train])
# Share of the total variance retained by the 25 components.
var_ratio = pca.explained_variance_ratio_
print(sum(var_ratio))

# Project both partitions with the PCA fitted on the training rows.
x = pca.transform(digits.data[train])
y = digits.target[train]
tx = pca.transform(digits.data[test])
ty = digits.target[test]

0.933306273534615


### • performing KNN algorithm 

In [55]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors is the k of k-nearest neighbors (how many neighbors vote);
# p selects the distance measure (p=2 is the Euclidean distance).
# fit() returns the estimator, so assigning its result keeps the cell silent.
kNN = KNeighborsClassifier(n_neighbors=5, p=2).fit(x, y)

### • evaluating the result

In [56]:
# Overall test accuracy, then a side-by-side look at the last 15 test samples.
last_inputs, last_labels = tx[-15:], ty[-15:]
print(f'Accuracy: {kNN.score(tx, ty):.3f}')
print(f'Prediction : {kNN.predict(last_inputs)}')
print(f'Actual     : {last_labels}')

Accuracy: 0.990
Prediction : [2 2 5 7 9 5 4 8 1 4 9 0 8 9 8]
Actual     : [2 2 5 7 9 5 4 8 8 4 9 0 8 9 8]


## Choosing your k parameter wisely 

### • Experimenting with k value

In [57]:
# Try several neighborhood sizes and report the test accuracy for each.
# Note: kNN is rebound on every pass, so after the loop it refers to the
# k = 200 model.
for k in (1, 5, 10, 50, 100, 200):
    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(x, y)
    print(f'For k = {k} accuracy is {kNN.score(tx, ty):.3f}')

For k = 1 accuracy is 0.979
For k = 5 accuracy is 0.990
For k = 10 accuracy is 0.969
For k = 50 accuracy is 0.959
For k = 100 accuracy is 0.959
For k = 200 accuracy is 0.907
