In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv('C:/Users/micronet computers/Desktop/Projects/Machine Learning/spam.csv', names=['Status','Message'], sep='\t')

In [3]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
len(df)

5572

In [5]:
df.isnull().sum()

Status     0
Message    0
dtype: int64

In [6]:
len(df[df.Status=='spam'])

747

In [7]:
len(df.Status)

5572

In [8]:
df[df.Status=='spam']

Unnamed: 0,Status,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [9]:
df.Status=='spam'

0       False
1       False
2        True
3       False
4       False
        ...  
5567     True
5568    False
5569    False
5570    False
5571    False
Name: Status, Length: 5572, dtype: bool

In [10]:
len(df.Status=='spam')

5572

In [11]:
df.loc[df['Status']=='ham', 'Status'] = 1
df

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [12]:
df.loc[df['Status']=='spam', 'Status'] = 0
df

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


###  Now define the feature columns and the target columns

In [13]:
X = df['Message']
X.shape

(5572,)

In [14]:
y = df['Status']
y.shape

(5572,)

In [15]:
cv = TfidfVectorizer(min_df = 1, stop_words='english')

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=100)

In [17]:
X_traincv = cv.fit_transform(X_train)
cv.get_feature_names()

['00',
 '000',
 '0089',
 '0121',
 '01223585236',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07008009200',
 '07090201529',
 '07090298926',
 '07123456789',
 '0721072',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '078',
 '07808247860',
 '07808726822',
 '07821230901',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081560665',
 '08448350055',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',
 '08701417012150p',
 '08701752560',
 '08702490080',
 '08702840625',
 '08704439680',
 '08704439680ts',
 '08706091795',
 '0870737910216yrs',
 '08707500020',
 '0

In [84]:
a = X_traincv.toarray()
a

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
len(a[0])

6549

In [85]:
cv.inverse_transform(a[0])

[array(['ga', 'goodmorning', 'sleeping'], dtype='<U34')]

In [86]:
X_train.iloc[0]

'Goodmorning sleeping ga.'

In [18]:
mnb = MultinomialNB()

In [19]:
y_train = y_train.astype('int')
type(y_train)

pandas.core.series.Series

In [20]:
mnb.fit(X_traincv, y_train)

MultinomialNB()

In [24]:
X_testcv = cv.fit_transform(X_test)
X_testcv.shape

(2229, 5067)

In [23]:
mnb.predict(X_testcv)

ValueError: Expected 2D array, got 1D array instead:
array=[<2229x5067 sparse matrix of type '<class 'numpy.float64'>'
	with 17588 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.