# NAIVE BAYES CLASSIFIER

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

### Dataset 1 -> 20 NewsGroups

##### Loading...

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

df_train = pd.DataFrame({'text': newsgroups_train.data, 'target': newsgroups_train.target})
df_test = pd.DataFrame({'text': newsgroups_test.data, 'target': newsgroups_test.target})

In [3]:
df_train.head()

Unnamed: 0,text,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


##### Preprocessing...

In [4]:
# Preprocess the data
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target

X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target

##### Splitting and Evaluating...

In [5]:
# Train and evaluate the classifier using 4 test train splits
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=test_sizes[i], random_state=7)
    
    # Train the classifier
    clf = MultinomialNB()
    clf.fit(X_train_split, y_train_split)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test_split)
    accuracy = accuracy_score(y_test_split, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.750
Accuracy for split 2 with test size 0.2: 0.725
Accuracy for split 3 with test size 0.3: 0.727
Accuracy for split 4 with test size 0.4: 0.707


### Dataset 2 -> Breast Cancer

##### Loading...

In [6]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
cancer = load_breast_cancer()

In [7]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = pd.Series(cancer.target)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


##### Preprocessing...

In [8]:
scaler = StandardScaler()
X = scaler.fit_transform(cancer.data)
y = cancer.target

##### Splitting and Evaluating...

In [9]:
# Train and evaluate the classifier using 4 test train splits
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=21)
    
    # Train the classifier
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.912
Accuracy for split 2 with test size 0.2: 0.939
Accuracy for split 3 with test size 0.3: 0.947
Accuracy for split 4 with test size 0.4: 0.943


### Dataset 3 -> Iris

##### Loading...

In [10]:
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = pd.Series(iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


##### Preprocessing...

In [11]:
X = iris.data
y = iris.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

##### Splitting and Evaluating...

In [12]:
# Train and evaluate the classifier using 4 test train splits
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=32)
    
    # Train the classifier
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.933
Accuracy for split 2 with test size 0.2: 0.967
Accuracy for split 3 with test size 0.3: 0.978
Accuracy for split 4 with test size 0.4: 0.983


### Dataset 4 -> Wine

##### Loading...

In [13]:
from sklearn.datasets import load_wine

# Load the dataset
wine = load_wine()

df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = pd.Series(wine.target)
df.tail()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740.0,2
174,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0,2
177,14.13,4.1,2.74,24.5,96.0,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560.0,2


##### Preprocessing...

In [14]:
# Preprocess the data
scaler = StandardScaler()
X = scaler.fit_transform(wine.data)
y = wine.target

##### Splitting and Evaluating...

In [15]:
# Train and evaluate the classifier using 4 test train splits
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=8521)
    
    # Train the classifier
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.944
Accuracy for split 2 with test size 0.2: 0.972
Accuracy for split 3 with test size 0.3: 0.981
Accuracy for split 4 with test size 0.4: 0.986


# SUPPORT VECTOR MACHINES

### Dataset 1 -> Iris

##### Loading...

In [16]:
from sklearn.datasets import load_iris

iris = load_iris()

# Convert to a dataframe
import pandas as pd
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


##### Preprocessing...

In [17]:
# Preprocess the data
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('target', axis=1))
y = df['target']

##### Splitting and Evaluating...

In [18]:
# Train and evaluate the classifier using 4 test train splits
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=7)
    
    # Train the classifier
    clf = SVC()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.867
Accuracy for split 2 with test size 0.2: 0.867
Accuracy for split 3 with test size 0.3: 0.911
Accuracy for split 4 with test size 0.4: 0.950


### Dataset 2 -> Digit dataset (MNIST)

##### Loading...

In [23]:
from sklearn.datasets import load_digits

digits = load_digits()

df = pd.DataFrame(data=digits.data)
df['target'] = digits.target
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


##### Preprocessing...

In [25]:
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('target', axis=1))
y = df['target']

##### Splitting and Evaluating...

In [26]:
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=7)
    
    # Train the classifier
    clf = SVC()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.989
Accuracy for split 2 with test size 0.2: 0.994
Accuracy for split 3 with test size 0.3: 0.993
Accuracy for split 4 with test size 0.4: 0.986


### Dataset 3 -> Breast Cancer

##### Loading...

In [28]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

import pandas as pd
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


##### Preprocessing...

In [30]:
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('target', axis=1))
y = df['target']

##### Splitting and Evaluating...

In [32]:
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=72)
    
    # Train the classifier
    clf = SVC()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.947
Accuracy for split 2 with test size 0.2: 0.956
Accuracy for split 3 with test size 0.3: 0.965
Accuracy for split 4 with test size 0.4: 0.965


### Dataset 4 -> Wine

##### Loading...

In [33]:
from sklearn.datasets import load_wine

wine = load_wine()

import pandas as pd
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


##### Preprocessing...

In [34]:
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('target', axis=1))
y = df['target']

##### Splitting and Evaluating...

In [45]:
test_sizes = [0.1, 0.2, 0.3, 0.4]

for i in range(4):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizes[i], random_state=90)
    
    # Train the classifier
    clf = SVC()
    clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for split {i+1} with test size {test_sizes[i]}: {accuracy:.3f}")

Accuracy for split 1 with test size 0.1: 0.944
Accuracy for split 2 with test size 0.2: 0.917
Accuracy for split 3 with test size 0.3: 0.963
Accuracy for split 4 with test size 0.4: 0.972
