In [87]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline


In [88]:
# Load the Titanic passenger table and preview the first two rows.
df = pd.read_csv("titanic.csv")
df.head(n=2)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [89]:
# Discard the columns that are not used by the model, leaving Pclass, Sex,
# Age, Fare and the Survived label.
# Reassigning (instead of inplace=True) keeps the step explicit, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# without raising KeyError.
df = df.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [90]:
# Separate the label column from the feature columns.
target = df['Survived']
inputs = df.drop(columns='Survived')

In [91]:
# One-hot encode Sex as integer indicator columns and prepend them to the
# feature frame (the original Sex column is removed in the next cell).
dummies = pd.get_dummies(inputs['Sex'], dtype=int)
inputs = pd.concat([dummies, inputs], axis='columns')

In [92]:
# Remove the raw Sex column and the 'male' indicator; the remaining 'female'
# indicator already carries the same information.
inputs = inputs.drop(columns=['Sex', 'male'])

In [93]:
# Preview the encoded feature frame.
inputs.head(5)

Unnamed: 0,female,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [94]:
# List the feature columns that contain at least one missing value.
inputs.columns[inputs.isnull().any(axis=0)]

Index(['Age'], dtype='object')

In [95]:
# Fill missing ages with the mean age.
# NOTE: the mean is taken over every row, i.e. before the train/test split
# performed further down, so test rows contribute to the imputed value.
inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())

In [96]:
# Check the first ten rows after imputation (row 5 shows a filled-in Age).
inputs.head(n=10)

Unnamed: 0,female,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05
5,0,3,29.699118,8.4583
6,0,1,54.0,51.8625
7,0,3,2.0,21.075
8,1,3,27.0,11.1333
9,1,2,14.0,30.0708


In [97]:
# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the split, and therefore every score and
# prediction derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

In [98]:
# Gaussian Naive Bayes classifier with default settings; it is fitted in the
# following cell.
model = GaussianNB()

In [99]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [100]:
# Accuracy of the fitted classifier on the held-out split.
model.score(X=X_test, y=y_test)

0.7541899441340782

In [101]:
# Predicted labels for the first ten held-out passengers.
model.predict(X_test.iloc[:10])

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

In [102]:
# Per-class probabilities for the same ten passengers; one row per passenger,
# one column per class in the order given by model.classes_.
model.predict_proba(X_test.iloc[:10])

array([[0.96627541, 0.03372459],
       [0.29401838, 0.70598162],
       [0.71220569, 0.28779431],
       [0.42683491, 0.57316509],
       [0.91548939, 0.08451061],
       [0.26116375, 0.73883625],
       [0.96400663, 0.03599337],
       [0.96652991, 0.03347009],
       [0.55776744, 0.44223256],
       [0.96354658, 0.03645342]])

# Exercise: spam detection with Multinomial Naive Bayes

In [103]:
# Load the labelled messages (columns: Category, Message) and preview them.
data = pd.read_csv("spam.csv")
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [104]:
# Per-category summary of the messages (count, unique, top, freq) to see the
# class balance.
data.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [105]:
# Binary label: 1 for spam, 0 for everything else.
# A vectorized comparison replaces the row-by-row lambda; int64 matches the
# dtype the original apply() produced.
data['spam'] = (data['Category'] == 'spam').astype('int64')

# Display-only preview without the text label. drop() returns a new frame and
# the result is not assigned, so `data` itself still contains 'Category'.
data.drop(columns='Category')

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [106]:
# Hold out 25% of the messages for evaluation.
# A fixed random_state makes the split, and the scores computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data.Message, data.spam, test_size=0.25, random_state=42
)

In [107]:
# Learn the vocabulary from the training messages only and encode them as a
# sparse matrix of token counts. (CountVectorizer is imported in the notebook's
# top import cell.)
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

# Preview two rows. Slice the sparse matrix first and densify only the slice:
# converting the whole matrix to a dense array just to look at two rows
# allocates rows x vocabulary integers for nothing.
X_train_count[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [108]:
# Multinomial Naive Bayes over the token-count features. (MultinomialNB is
# imported in the notebook's top import cell.)
# NOTE: this rebinds `model`, replacing the GaussianNB classifier fitted on
# the Titanic data earlier in the notebook.
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [109]:
# Two hand-written messages as a sanity check: the first reads like ordinary
# mail, the second like a promotion.
emails = [
    "Hey mohan, can we get together to watch footbal game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!",
]

# Encode with the vocabulary already fitted on the training set (transform,
# not fit_transform), then classify.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [110]:
# Encode the held-out messages with the training vocabulary and report the
# classifier's accuracy on them.
X_test_count = v.transform(X_test)
model.score(X=X_test_count, y=y_test)

0.9863603732950467

In [111]:
# Same vectorize-then-classify flow as above, bundled into one estimator so
# raw text can be passed straight to fit/score/predict. (Pipeline is imported
# in the notebook's top import cell.)
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [112]:
# Fit the vectorizer and the classifier together on the raw training messages.
clf.fit(X=X_train, y=y_train)

In [113]:
# Accuracy of the pipeline on the raw held-out messages.
clf.score(X=X_test, y=y_test)

0.9863603732950467

In [114]:
# Classify the two sample messages directly from text; the pipeline handles
# the vectorization step.
clf.predict(X=emails)

array([0, 1], dtype=int64)