# Other Popular Machine Learning Methods

### Segment 1 - Association Rule Mining Using Apriori Algorithm

In [22]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

In [23]:
grocery = pd.read_csv('Ex_Files_Python_Data_Science_EssT_Pt2/Exercise Files/Data/groceries.csv')

In [24]:
grocery.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,
1,tropical fruit,yogurt,coffee,,,,,,
2,whole milk,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,


Transaction data needs to be in sparse format for association rules, so we have to convert in.

## Data Conversion

In [25]:
basket_sets = pd.get_dummies(grocery)

In [26]:
basket_sets.head()

Unnamed: 0,1_Instant food products,1_UHT-milk,1_artif. sweetener,1_baby cosmetics,1_bags,1_baking powder,1_bathroom cleaner,1_beef,1_berries,1_beverages,...,9_sweet spreads,9_tea,9_vinegar,9_waffles,9_whipped/sour cream,9_white bread,9_white wine,9_whole milk,9_yogurt,9_zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Support Calculation

In [27]:
apriori(basket_sets, min_support=0.02) # get the item indexes that has the frequency of 0.2 or above.

Unnamed: 0,support,itemsets
0,0.030421,(7)
1,0.034951,(17)
2,0.029126,(23)
3,0.049191,(26)
4,0.064401,(47)
5,0.04466,(83)
6,0.024272,(90)
7,0.040453,(92)
8,0.038835,(99)
9,0.033981,(100)


In [28]:
apriori(basket_sets, min_support=0.02, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.030421,(1_beef)
1,0.034951,(1_canned beer)
2,0.029126,(1_chicken)
3,0.049191,(1_citrus fruit)
4,0.064401,(1_frankfurter)
5,0.04466,(1_other vegetables)
6,0.024272,(1_pip fruit)
7,0.040453,(1_pork)
8,0.038835,(1_rolls/buns)
9,0.033981,(1_root vegetables)


The list is full of items but not combinations of items, so we lower the min_support to get more return data. And add in another column to see the number of items in each combinations. 

In [33]:
df = basket_sets

frequent_itemsets = apriori(df, min_support=0.002, use_colnames=True)

frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.006472,(1_UHT-milk),1
1,0.030421,(1_beef),1
2,0.011974,(1_berries),1
3,0.008414,(1_beverages),1
4,0.014887,(1_bottled beer),1
...,...,...,...
844,0.002265,"(5_other vegetables, 3_pip fruit, 6_whole milk)",3
845,0.002589,"(3_root vegetables, 5_whole milk, 4_other vege...",3
846,0.002913,"(4_curd, 5_yogurt, 3_whole milk)",3
847,0.003236,"(5_other vegetables, 6_whole milk, 4_root vege...",3


However, we are still seeing those one-item purchases. We try to filter them out here.

In [35]:
frequent_itemsets[frequent_itemsets['length'] >= 3]

Unnamed: 0,support,itemsets,length
820,0.002589,"(2_root vegetables, 3_other vegetables, 1_beef)",3
821,0.002589,"(1_chicken, 3_whole milk, 2_other vegetables)",3
822,0.002589,"(1_citrus fruit, 3_whole milk, 2_other vegetab...",3
823,0.003236,"(3_pip fruit, 1_citrus fruit, 2_tropical fruit)",3
824,0.002589,"(3_other vegetables, 1_citrus fruit, 4_whole m...",3
825,0.002265,"(5_other vegetables, 1_frankfurter, 6_whole milk)",3
826,0.002265,"(3_other vegetables, 1_pork, 4_whole milk)",3
827,0.00356,"(1_root vegetables, 3_whole milk, 2_other vege...",3
828,0.002589,"(3_soda, 1_sausage, 2_rolls/buns)",3
829,0.002265,"(3_other vegetables, 4_whole milk, 1_sausage)",3


## Association Rules

#### Confidence

The percentage of the item C to be purchased if the item A is purchased.

In [36]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(2_sausage),(1_frankfurter),0.011327,0.064401,0.011327,1.0,15.527638,0.010597,inf
1,(7_pastry),(1_frankfurter),0.005178,0.064401,0.002589,0.5,7.763819,0.002256,1.871197
2,(2_ham),(1_sausage),0.00712,0.076052,0.004531,0.636364,8.367505,0.003989,2.540858
3,(2_meat),(1_sausage),0.006796,0.076052,0.004854,0.714286,9.392097,0.004338,3.233819
4,(3_beef),(1_sausage),0.004854,0.076052,0.002589,0.533333,7.012766,0.00222,1.979889


#### Lift

How often does item A and item C occurs in the same transaction.

In [37]:
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(2_citrus fruit),(1_beef),0.028803,0.030421,0.005502,0.191011,6.278986,0.004625,1.198508
1,(1_beef),(2_citrus fruit),0.030421,0.028803,0.005502,0.180851,6.278986,0.004625,1.185618
2,(1_beef),(2_other vegetables),0.030421,0.0589,0.003236,0.106383,1.806173,0.001444,1.053136
3,(2_other vegetables),(1_beef),0.0589,0.030421,0.003236,0.054945,1.806173,0.001444,1.02595
4,(2_root vegetables),(1_beef),0.036893,0.030421,0.005502,0.149123,4.902016,0.004379,1.139506


#### Lift and Confidence (Optimal)

In [39]:
rules[(rules['lift'] >= 5) & (rules['confidence'] >= 0.5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
93,(2_sausage),(1_frankfurter),0.011327,0.064401,0.011327,1.000000,15.527638,0.010597,inf
137,(7_pastry),(1_frankfurter),0.005178,0.064401,0.002589,0.500000,7.763819,0.002256,1.871197
238,(2_ham),(1_sausage),0.007120,0.076052,0.004531,0.636364,8.367505,0.003989,2.540858
243,(2_meat),(1_sausage),0.006796,0.076052,0.004854,0.714286,9.392097,0.004338,3.233819
258,(3_beef),(1_sausage),0.004854,0.076052,0.002589,0.533333,7.012766,0.002220,1.979889
...,...,...,...,...,...,...,...,...,...
959,"(5_other vegetables, 4_root vegetables)",(6_whole milk),0.005178,0.009385,0.003236,0.625000,66.594828,0.003188,2.641640
960,"(6_whole milk, 4_root vegetables)",(5_other vegetables),0.003883,0.012621,0.003236,0.833333,66.025641,0.003187,5.924272
964,"(5_other vegetables, 7_butter)",(6_whole milk),0.002589,0.009385,0.002265,0.875000,93.232759,0.002241,7.924919
966,"(7_butter, 6_whole milk)",(5_other vegetables),0.002913,0.012621,0.002265,0.777778,61.623932,0.002229,4.443204


### Segment 2 - Neural Network with a perceptron

In [75]:
import numpy as np
import pandas as pd
import sklearn

from pandas import Series, DataFrame
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [76]:
from sklearn.linear_model import Perceptron

In [77]:
bc = datasets.load_breast_cancer()
bc

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [78]:
X = bc.data
y = bc.target

X[0:10,]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

In [80]:
#Normalize the data
standardize = StandardScaler()

standardized_X_test = standardize.fit_transform(X_test)
standardized_X_train = standardize.fit_transform(X_train)

In [81]:
standardized_X_test[0:10]

array([[-8.70033890e-02, -1.47192915e+00, -1.05373911e-01,
        -2.14796744e-01,  2.05627941e+00, -1.87598213e-01,
         4.34596852e-02,  3.43147297e-01,  4.86932208e-01,
         2.09714918e-01,  4.14837253e-01,  2.38110688e+00,
         5.38167210e-01,  1.89599284e-02,  9.51284468e-01,
         3.16783689e-01,  2.81890431e-01,  2.21465008e+00,
        -3.92766050e-01,  4.44859158e-01, -3.86348896e-01,
        -1.69650664e+00, -4.21900040e-01, -4.45574812e-01,
         2.30418212e-01, -7.55219020e-01, -6.01923711e-01,
        -2.66291739e-01, -1.09776353e+00, -6.55974588e-01],
       [ 1.99890922e-01,  3.57734227e-02,  1.70617903e-01,
         5.53159069e-02, -5.87425771e-01, -1.75255240e-01,
        -4.29667545e-01, -1.13156079e-01,  1.63461323e-01,
        -5.72971116e-01, -5.57505328e-01, -4.83058988e-01,
        -4.70073209e-01, -4.73389614e-01, -6.67093209e-01,
        -2.78187639e-01, -4.79507175e-01, -8.56640651e-02,
        -6.03244595e-01, -3.45331271e-01, -6.99717544e-

In [82]:
perceptron = Perceptron(max_iter=50, eta0=0.15, tol=1e-3, random_state=15)

In [83]:
perceptron.fit(standardized_X_train, y_train.ravel())

In [84]:
y_pred = perceptron.predict(standardized_X_test)

In [85]:
print(y_test)

[1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1
 0 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0
 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0
 0 0 0]


In [86]:
print(y_pred)

[1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1
 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0
 1 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0
 0 0 0]


In [89]:
print(classification_report(y_test, y_pred))
# we can get the prediction of the value of 0 is 0.93 and 0.94 for the value of 1.

              precision    recall  f1-score   support

           0       0.93      0.91      0.92        45
           1       0.94      0.96      0.95        69

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.94       114
weighted avg       0.94      0.94      0.94       114



### Segment 3 - KNN

In [90]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import urllib

import matplotlib.pyplot as plt
from pylab import rcParams

from sklearn.model_selection import train_test_split
from sklearn import neighbors, preprocessing, metrics

In [91]:
from sklearn.neighbors import KNeighborsClassifier

In [92]:
np.set_printoptions(precision=4, suppress=True)
%matplotlib inline
rcParams['figure.figsize'] = 7, 4
plt.style.use('seaborn-whitegrid')

In [93]:
car = pd.read_csv('Ex_Files_Python_Data_Science_EssT_Pt2/Exercise Files/Data/mtcars.csv')

In [95]:
car.columns = ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am','gear',' carb']

X_prime = car[['mpg', 'disp', 'hp', 'wt']].values
y = car.iloc[:, 9].values

In [96]:
X = preprocessing.scale(X_prime)

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

In [99]:
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
print(clf)

KNeighborsClassifier()


In [100]:
y_pred = clf.predict(X_test)
y_expect = y_test

print(classification_report(y_expect, y_pred))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.67      0.80         3

    accuracy                           0.86         7
   macro avg       0.90      0.83      0.84         7
weighted avg       0.89      0.86      0.85         7



Recall: a measure of the model's completeness
1. Of all the points that were labeled 1, only 67% of the results that were returned were truly relevant.
2. Of the entire dataset, 83% of the results that were returned were truly relevant.

High precision + low recall = few results returned, but many of the label predictions that are returned are correct.

### Segment 5 - Naive Bayes Classifiers

1. Multinomial: it's good when features are continuous or categorical and discribe discete frequency count. 

2. Bernoulli: it's good making predictions from binary features.

3. Gaussian: good for making predictions from normally distributed features.

Assumtions: predictors are independent of each other. It also holds a priori assumption just as **all regression models** : the past conditions still hold true; when we make predictions from historical values we will get incorrect results if present circumstances have changed.


In [101]:
import numpy as np
import pandas as pd
import urllib
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [102]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [105]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'

import urllib.request

raw_data = urllib.request.urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=',')
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [106]:
X = dataset[:, 0:48]
y = dataset[:, -1]

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=17)

In [109]:
BernNB = BernoulliNB(binarize=True)
BernNB.fit(X_train, y_train)
print(BernNB)

y_expect = y_test
y_pred = BernNB.predict(X_test)

print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=True)
0.8577633007600435


In [111]:
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)
print(MultiNB)

y_pred = MultiNB.predict(X_test)

print(accuracy_score(y_expect, y_pred))

MultinomialNB()
0.8816503800217155


In [112]:
GauNB = GaussianNB()
GauNB.fit(X_train, y_train)
print(GauNB)

y_pred = GauNB.predict(X_test)

print(accuracy_score(y_expect, y_pred))

GaussianNB()
0.8197611292073833


Try to improve model performance by tuning the parameters (trail and error)

In [113]:
BernNB = BernoulliNB(binarize=0.1)
BernNB.fit(X_train, y_train)
print(BernNB)

y_expect = y_test
y_pred = BernNB.predict(X_test)

print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=0.1)
0.9109663409337676


### Segment 6 - Ensemble methods with random forest

In [114]:
import numpy as np
import pandas as pd

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [124]:
diabetes = datasets.load_wine()

df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target)

y.columns = ['labels']

In [125]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [126]:
y[0:5]

Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0


In [127]:
df.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [128]:
print(y.labels.value_counts())

1    71
0    59
2    48
Name: labels, dtype: int64


In [130]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=17)

In [131]:
rf_clf = RandomForestClassifier(n_estimators=200, random_state=0)

y_train_array = np.ravel(y_train)

rf_clf.fit(X_train, y_train_array)

y_pred = rf_clf.predict(X_test)

In [132]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [133]:
y_test_array = np.ravel(y_test)
print(y_test_array)

[1 1 1 0 2 2 0 1 2 0 2 0 1 1 2 2 2 1 1 1 1 1 0 2 1 1 0 1 2 1 2 2 0 0 2 1]


In [134]:
print(y_pred)

[1 1 1 0 2 2 0 1 2 0 2 0 1 1 2 2 2 1 1 1 1 1 0 2 1 1 0 1 2 1 2 2 0 0 2 1]
