## Naive Bayes Classifier Algorithm
### Titanic Dataset

In [1]:
import pandas as pd
df = pd.read_csv("C:/Users/yugal/Downloads/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [2]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis = 'columns', inplace = True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [3]:
target = df['Survived']
inputs = df.drop('Survived', axis = 'columns')

In [4]:
# making dummies for sex column
dummies = pd.get_dummies(inputs['Sex'])
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [5]:
inputs = pd.concat([inputs, dummies], axis = 'columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [6]:
# checking if input have any missing values
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [7]:
# lets explore the missing values of Age column
inputs['Age'][:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [8]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [9]:
inputs.drop(['Sex'], axis = 'columns', inplace = True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size = 0.2)

In [11]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [12]:
model.fit(X_train, y_train)

In [13]:
model.score(X_train, y_train)

0.7935393258426966

In [14]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
644,3,0.75,19.2583,1,0
206,3,32.0,15.85,0,1
744,3,31.0,7.925,0,1
304,3,29.699118,8.05,0,1
346,2,40.0,13.0,1,0
262,1,52.0,79.65,0,1
579,3,32.0,7.925,0,1
155,1,51.0,61.3792,0,1
605,3,36.0,15.55,0,1
114,3,17.0,14.4583,1,0


In [15]:
y_test[:10]

644    1
206    0
744    1
304    0
346    1
262    0
579    1
155    0
605    0
114    0
Name: Survived, dtype: int64

In [16]:
model.predict(X_test[:10])

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [17]:
model.predict_proba(X_test[:10])

array([[0.019896  , 0.980104  ],
       [0.99244059, 0.00755941],
       [0.99227557, 0.00772443],
       [0.992173  , 0.007827  ],
       [0.02204566, 0.97795434],
       [0.75891292, 0.24108708],
       [0.99234606, 0.00765394],
       [0.87270378, 0.12729622],
       [0.99262628, 0.00737372],
       [0.03605458, 0.96394542]])

### Email spam Detection

In [18]:
import pandas as pd
df = pd.read_csv("C:/Users/yugal/Downloads/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [20]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'] , df['spam'], test_size=0.25)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [24]:
emails = [
    'Hey mohan, can we get together to watch football game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [25]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9856424982053122

In [26]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [27]:
clf.fit(X_train, y_train)

In [28]:
clf.score(X_test, y_test)

0.9856424982053122

In [29]:
clf.predict(emails)

array([0, 1], dtype=int64)