In [2]:
# https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/
import pandas as pd

df = pd.read_table('dataset/SMSSpamCollection',  
                   sep='\t', 
                   header=None,
                   names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# preprocessing

In [3]:
df['label'] = df.label.map({'ham': 0, 'spam': 1})  

모두 소문자로 변환

In [4]:
df['message'] = df.message.map(lambda x: x.lower())  

마침표 제거

In [5]:
df['message'] = df.message.str.replace('[^\w\s]', '')  

 tokenization

In [6]:
import nltk  
# nltk.download('punkt')
df['message'] = df['message'].apply(nltk.word_tokenize)  
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


형태소분석

In [15]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])  
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# This converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))
print(df.message.head())
count_vect = CountVectorizer()  
counts = count_vect.fit_transform(df['message'])  

0    go until jurong point crazi avail onli in bugi...
1                                ok lar joke wif u oni
2    free entri in 2 a wkli comp to win fa cup fina...
3          u dun say so earli hor u c alreadi then say
4    nah i dont think he goe to usf he live around ...
Name: message, dtype: object


In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer().fit(counts)

counts = transformer.transform(counts)  

In [18]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69) 

In [36]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)  
# model = GaussianNB().fit(X_train, y_train)  

In [56]:
import numpy as np
test_data = pd.read_csv( 'dataset/SMSSpamCollection_test.csv' )

test_data = test_data.rename(columns = {'id': 'label', 'review':'message'})

test_count = CountVectorizer()  
test_counts = test_count.fit_transform(test_data['message'])  
print(type(test_count))

transformer = TfidfTransformer().fit(test_count)

test_count = transformer.transform(test_count) 


predicted = model.predict(test_data)

<class 'sklearn.feature_extraction.text.CountVectorizer'>


ValueError: Expected 2D array, got scalar array instead:
array=CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [33]:
import numpy as np

predicted = model.predict(X_test)
# print(X_test)
# print(np.mean(predicted == y_test))  

  (0, 895)	0.28999797162758306
  (0, 1124)	0.17503953462676497
  (0, 1260)	0.1806775784485796
  (0, 1605)	0.28999797162758306
  (0, 1773)	0.11734341495843983
  (0, 2055)	0.26025028442328957
  (0, 2109)	0.12945014502730604
  (0, 3028)	0.1882030959165165
  (0, 3930)	0.28999797162758306
  (0, 3976)	0.2927349662641272
  (0, 3985)	0.24116759238885288
  (0, 5227)	0.10770668790321371
  (0, 5380)	0.28999797162758306
  (0, 5491)	0.27683441033783157
  (0, 6049)	0.28999797162758306
  (0, 6579)	0.267494714968356
  (0, 6780)	0.14918636698714263
  (0, 7109)	0.08721601188417048
  (0, 7846)	0.12158774440143195
  (0, 7919)	0.15447508303473523
  (0, 8130)	0.10192583382482392
  (1, 1160)	0.1073266226344332
  (1, 1417)	0.2846605090910865
  (1, 1597)	0.26992717420861423
  (1, 3336)	0.13037008878027842
  :	:
  (554, 7681)	0.139088862729437
  (554, 7871)	0.25318299772379493
  (554, 8113)	0.06505727058380065
  (554, 8130)	0.2669593308715244
  (555, 1304)	0.4081000495497748
  (555, 1805)	0.3583092712410307
  (

In [21]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predicted))  

[[482   0]
 [ 29  47]]
