In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer


In [2]:
# Load the tab-separated SMS corpus; column names are supplied explicitly.
df = pd.read_csv(
    'spamham',
    sep='\t',
    names=['status', 'message'],
)
df.head()

Unnamed: 0,status,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance of the raw text labels (before they are recoded to numbers).
df['status'].value_counts()

ham     4825
spam     747
Name: status, dtype: int64

In [4]:
# Recode the labels in place: 'ham' -> 1, 'spam' -> 0.
df.loc[df['status'].eq('ham'), 'status'] = 1
df.loc[df['status'].eq('spam'), 'status'] = 0

In [5]:
# Sanity check after recoding: the counts should now be keyed by 1 (ham) and 0 (spam).
df['status'].value_counts()

1    4825
0     747
Name: status, dtype: int64

In [6]:
# Features are the message texts; the target is the numeric status label.
df_x, df_y = df['message'], df['status']

In [7]:
# Bag-of-words vectorizer with default settings; used for the toy example further down.
cv = CountVectorizer()

In [8]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. The fallback keeps the
# notebook runnable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split



In [9]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

In [11]:
# Toy example: fit the count vectorizer on two tiny documents.
x_traincv = cv.fit_transform([
    'first is google',
    'second is facebook',
])


In [12]:
# Dense view of the counts: one row per example document, one column per vocabulary term.
x_traincv.toarray()

array([[0, 1, 1, 1, 0],
       [1, 0, 0, 1, 1]], dtype=int64)

In [13]:
# Vocabulary terms in column order. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` when it exists and fall
# back to the old method on earlier releases. `.tolist()` keeps the result a
# plain list of strings, matching what the old method returned.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, 'get_feature_names_out')
 else cv.get_feature_names())

['facebook', 'first', 'google', 'is', 'second']

In [15]:
# Map the second example document's row back to the terms it contains.
cv.inverse_transform(x_traincv[1])

[array(['facebook', 'second', 'is'], 
       dtype='<U8')]

In [17]:
# Actual spam/ham example: a fresh count vectorizer, separate from the toy one.
cv1 = CountVectorizer()

In [18]:

# Learn the vocabulary from the training messages and build their count matrix.
# Note: this rebinds x_traincv, replacing the toy-example matrix.
x_traincv = cv1.fit_transform(x_train)
# Dense preview of the (mostly zero) count matrix.
x_traincv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Keep the dense count matrix around for row-level inspection.
a = x_traincv.toarray()

In [21]:
# Count vector of the first training message.
a[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# Terms present in the first training message. The vectorizer returns them in
# vocabulary order, so the original word order is not preserved.
# Pass a 2-D slice (one row) rather than the 1-D a[0]: current scikit-learn
# validates the input as 2-D and rejects a bare 1-D array.
cv1.inverse_transform(a[:1])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'me', 'now',
        'online', 'or', 'replying', 'sleeping', 'spys', 'take', 'to', 'wat',
        'you'], 
       dtype='<U27')]

In [24]:
x_train.iloc[0]  # original text of the first training message, for comparison

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

# TF-IDF


In [25]:
# Rebind `cv` to a TF-IDF vectorizer that drops English stop words.
# This replaces the CountVectorizer that was used for the toy example.
cv = TfidfVectorizer(stop_words='english')

In [26]:
# Fit TF-IDF on the training messages only; x_traincv now holds TF-IDF weights
# instead of the raw counts computed earlier.
x_traincv = cv.fit_transform(x_train)

In [27]:
# Dense TF-IDF matrix for inspection (rebinds `a`, previously the count matrix).
a = x_traincv.toarray()
a

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.19618715,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [28]:
# TF-IDF row of the first training message.
a[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [30]:
# Terms kept for the first training message once English stop words are removed.
# Pass a 2-D slice (one row) rather than the 1-D a[0]: current scikit-learn
# validates the input as 2-D and rejects a bare 1-D array.
cv.inverse_transform(a[:1])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'online',
        'replying', 'sleeping', 'spys', 'wat'], 
       dtype='<U27')]

In [31]:
# Original text of the first training message, to compare with the terms above.
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [32]:
# Cast the training labels to an integer dtype before handing them to the classifier.
y_train = y_train.astype(int)

In [33]:
# Multinomial naive Bayes classifier, constructed with its default parameters.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [36]:
# Train on the TF-IDF features of the training messages.
mnb.fit(X=x_traincv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
# Vectorize the test messages with the already-fitted TF-IDF vectorizer
# (transform only, so nothing is re-learned from the test set).
x_testcv = cv.transform(x_test)

In [39]:
# Predicted labels for the test messages (1 = ham, 0 = spam, per the earlier recoding).
pred = mnb.predict(x_testcv)
pred

array([1, 1, 1, ..., 1, 1, 0])

In [41]:
# True test labels as a NumPy array. Unlike y_train, y_test was not cast to int,
# so the resulting dtype may be object.
actual = np.array(y_test)
actual

array([1, 1, 1, ..., 1, 1, 0], dtype=object)

In [43]:
# Number of test messages whose predicted label equals the true label.
# A single element-wise comparison replaces the manual index loop;
# np.count_nonzero returns the number of True entries.
count = np.count_nonzero(pred == actual)
        
        

In [44]:
# Number of correct predictions on the test set.
count

1068

In [45]:
# Total number of test messages.
len(actual)

1115

In [47]:
# Test-set accuracy = correct predictions / total test messages.
# Computed from `count` and `actual` instead of hand-copied numbers, so the value
# stays correct if the data, split or model changes. float() guards against
# integer division on Python 2.
float(count) / len(actual)

0.957847533632287