# Supervised Learning | Classification (Naive Bayes)

Today's Outline:
- Intuition
- Full Case-study

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# sns.set()

==========

## Naive Bayes (NB) | Intuition (Iris)

Scikit-Learn Naive Bayes Module: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes

NLP Demo: https://natural-language-understanding-demo.ng.bluemix.net/

### Importing Dataset

In [None]:
from sklearn.datasets import load_iris

In [None]:
# With return_X_y=True, load_iris returns a (data, target) pair instead of a
# Bunch object, so it unpacks directly into the feature matrix and the labels.
X, y = load_iris(return_X_y=True)

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the class labels.
y

### Splitting & Preprocessing Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the samples for testing; the fixed seed makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

### Model Training & Predicting 

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()

In [None]:
y_pred = gnb.fit(X_train, y_train).predict(X_test)

### Model Evaluation

In [None]:
from sklearn.metrics import classification_report,plot_confusion_matrix, accuracy_score

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
print(accuracy_score(y_test,y_pred))

In [None]:
print(plot_confusion_matrix(gnb,X_test,y_test))

==========

## Naive Bayes | Full Case-study (Email Filtering)

### Importing Dataset & Extracting Features

In [None]:
# Read the e-mail dataset from a path relative to the working directory.
emails = pd.read_csv('data/emails.csv')
# Display the loaded frame (the 'text' and 'spam' columns are used below).
emails

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
emails.info()

In [None]:
ham = emails[emails['spam']==0]
ham

In [None]:
spam = emails[emails['spam']==1]
spam

### Exploring Data

In [None]:
print('Spam percentage =', (len(spam) / len(emails) )*100,"%")

In [None]:
print('Ham percentage =', (len(ham) / len(emails) )*100,"%")

In [None]:
# Bar chart with the number of rows for each value of the 'spam' column.
sns.countplot(data=emails, x='spam')

### Data Splitting & Preprocessing

In [None]:
# Text encoding example (kept commented out; uncomment to try it).
# CountVectorizer builds a vocabulary from the sentences and counts how often
# each vocabulary word occurs in each sentence.

# from sklearn.feature_extraction.text import CountVectorizer
# sample_data = ['I am good','I feel good','You are very good','So far so good']

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(sample_data)

# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
# print(vectorizer.get_feature_names_out())
# print()
# print(X.toarray())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words encoding: `fit_transform` learns the vocabulary from the 'text'
# column and returns the per-message token counts as a document-term matrix.
cv = CountVectorizer()
emails_cv = cv.fit_transform(emails['text'])

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` returns the learned vocabulary as an array of terms.
print(cv.get_feature_names_out())

In [None]:
# `fit_transform` returns a sparse matrix. Calling `.toarray()` on all of it
# allocates one cell per (message, vocabulary term) pair only to print a
# truncated view, so show the shape and densify just the first few rows.
print(emails_cv.shape)
print(emails_cv[:5].toarray())

In [None]:
# Keep the document-term matrix sparse: `train_test_split` and `MultinomialNB`
# both accept SciPy sparse input, and a dense copy would need memory for every
# (message, vocabulary term) cell even though almost all of them are zero.
X = emails_cv
X

In [None]:
# Target vector: the values of the 'spam' column as a NumPy array.
y = emails['spam'].values
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the messages for testing.
# `random_state` fixes the shuffle so the reported metrics are reproducible
# across runs; `stratify=y` keeps the spam/ham proportion the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

### Model Training & Predicting

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier with default settings; the features fed to
# it are the token counts produced by CountVectorizer.
clf = MultinomialNB()

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

### Model Evaluation

In [None]:
# Predict a label for every message in the test split and display the result.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Display the true labels of the test split for comparison with y_pred.
y_test

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Confusion matrix as an annotated heatmap.
# seaborn's default annotation format ('.2g') renders large integer counts in
# scientific notation (e.g. 8.6e+02); fmt='d' prints them as plain integers.
# In scikit-learn's confusion_matrix, rows are true labels and columns are
# predicted labels, so the axes are labelled accordingly.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

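*Test the trained model on these sample messages (encode them with the already-fitted vectorizer, `cv.transform([...])` — not `fit_transform` — and pass the result to `clf.predict`):*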

- 'Free money!!!'
- "Hi Ahmed, Please let me know if you need any further information. Thanks"
- 'Hello, I am Mustafa, I would like to book a hotel in Cairo by January 24th'
- 'money viagara!!!!!'

==========

# THANK YOU!