In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv('youtube-comments.csv')
df.shape

(892, 7)

In [2]:
df.head()

Unnamed: 0,id,video_id,comment_id,text_display,text_original,is_spam,checked
0,2,1z4ughaHFj4,UgyOqRur7XVuSZsC-uN4AaABAg,"<a href=""https://youtu.be/9XHO4W7_4UI"">https:/...",https://youtu.be/9XHO4W7_4UI,1,1
1,5,1z4ughaHFj4,UgwKDIi_qJwGP2U0rOx4AaABAg,ලස්සන්‍යි බන්...වීඩියො එක බලන්නෙ නෙ සින්දුව අහ...,ලස්සන්‍යි බන්...වීඩියො එක බලන්නෙ නෙ සින්දුව අහ...,0,1
2,6,1z4ughaHFj4,Ugyeu3pV4VglihhGiOh4AaABAg,????පව් කරලා වැඩක් නැ සිල් අරන්.....මම් යන්නම ...,????පව් කරලා වැඩක් නැ සිල් අරන්.....මම් යන්නම ...,0,1
3,14,1z4ughaHFj4,UgzhtHRdH0xJUVZ8pvp4AaABAg,රැල්ලට ගහගෙන යන්න එපා අයියේ. වෙනස් දෙයක් කරන්න...,රැල්ලට ගහගෙන යන්න එපා අයියේ. වෙනස් දෙයක් කරන්න...,0,1
4,20,1z4ughaHFj4,UgwMWsprBrAjYKlyF1x4AaABAg,"වොයිස් එකට හදපන්කෝ හොද සිංදුවක්,, අඩන සිංදු නැ...","වොයිස් එකට හදපන්කෝ හොද සිංදුවක්,, අඩන සිංදු නැ...",0,1


In [4]:
len(df[df.is_spam==1])

97

In [8]:
df['text_original'] = df['text_original'].astype(str)
X = df['text_original']
y = df['is_spam']

In [9]:
cv = CountVectorizer()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
x_train_vectorized = cv.fit_transform(x_train)
a = x_train_vectorized.toarray()
a

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
len(a[0])

1053

In [11]:
a[3]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
cv.inverse_transform(a[3])

[array(['අපට', 'ඇඩ', 'කර', 'ගහපල', 'තන', 'බලන', 'මග', 'මන'], dtype='<U24')]

In [13]:
x_train.iloc[3]

'මේව බලන අපට ගහපල්ලා මගුල ඇඩ්මනෙ තනිකර'

In [14]:
mnb = MultinomialNB()
mnb.fit(x_train_vectorized, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
x_test_vectorized = cv.transform(x_test)
x_test_vectorized

<179x1053 sparse matrix of type '<class 'numpy.int64'>'
	with 1198 stored elements in Compressed Sparse Row format>

In [16]:
pred = mnb.predict(x_test_vectorized)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0], dtype=int64)

In [17]:
len(pred)

179

In [18]:
actual = np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0], dtype=int64)

In [19]:
correct_guesses = 0;
for i in range(0, len(pred)):
    if actual[i] == pred[i]:
        correct_guesses += 1

correct_guesses

172

In [20]:
accuracy = 172/179.0
accuracy

0.9608938547486033