In [114]:
import pandas as pd
import nltk
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

file_names = ('ranjan-youtube-comments.csv', 'music-youtube-comments.csv', 
              'sangeethe-youtube-comments.csv', 'ratta-youtube-comments.csv', 'vlog-youtube-comments.csv')
df = pd.DataFrame()

# post-comment similarity and inter-comment similarity
for file in file_names:
    df_temp = pd.read_csv(file)
    df_temp['p_jc_similarity'] = 0.0
    df_temp['jc_similarity'] = 0.0
    des_tkns = set(nltk.tokenize.word_tokenize(str(df_temp['description'][0])))

    for index, row in df_temp.iterrows():
        tkns = set(nltk.tokenize.word_tokenize(str(row['text_original'])))
        intersection = len(des_tkns.intersection(tkns))
        union = len(des_tkns) + len(tkns) - intersection
        df_temp.at[index, 'p_jc_similarity'] = intersection / union
        total = 0
        for i, snt in df_temp.iterrows():
            if i == index:
                continue
            tkns2 = set(nltk.tokenize.word_tokenize(str(snt['text_original'])))
            intersection = len(tkns.intersection(tkns2))
            union = len(tkns) + len(tkns2) - intersection
            total = total + (intersection / union)
        df_temp.at[index, 'jc_similarity'] = total / (len(df_temp) - 1)
    df = df.append(df_temp)

In [115]:
#the interval between post and comment
t1 = pd.to_datetime(df['publishedAt'])
t2 = pd.to_datetime(df['videoUploadedAt'])
df['interval'] = (t1 - t2).dt.total_seconds() # seconds

In [116]:
#number of words in the comment
df['no_words'] = df['text_original'].str.split().apply(len)

In [117]:
#comment length
df['comment_length'] = df['text_original'].str.len()

In [118]:
#stop words ratio
stopwords = pd.read_csv('stopwords.txt', header=None)[0].to_numpy()

df['stop_w_ratio'] = 0.0

for index, row in df.iterrows():
    count = 0
    tkns = nltk.tokenize.word_tokenize(str(row['text_original']))
    for tk in tkns:
        if tk in stopwords:
            count = count + 1
    df.at[index, 'stop_w_ratio'] = count / len(tkns)

In [119]:
#link, email, phone and black words
df['has_link'] = 0
df['has_subscribe'] = 0
df['has_email'] = 0
df['has_phone'] = 0

word_list = ['subscribe', 'subcribe', 'subcrib', 'සබ්ස්']

for index, row in df.iterrows():
    if any(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(row['text_original']))):
        df.at[index, 'has_link'] = 1
    
    if any(re.findall(r'|'.join(word_list), str(row['text_original']), re.IGNORECASE)):
        df.at[index, 'has_subscribe'] = 1
        
    if any(re.findall(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', str(row['text_original']))):
        df.at[index, 'has_email'] = 1
        
    if any(re.findall(r'[0-9]{7,}', str(row['text_original']))):
        df.at[index, 'has_phone'] = 1

In [120]:
#no of sentences
df['no_sentences'] = df['text_original'].str.replace('?', '.').str.split('.').apply(len)

In [121]:
#word duplication ratio
df['word_dup_ratio'] = 0.0

for index, row in df.iterrows():
    count = dict()
    words = row['text_original'].split()
    for word in words:
        if word not in count:
            count[word] = 0
        count[word] += 1
    
    duplicates = 0
    for item in count:
        if count[item] > 1:
            duplicates += 1
    df.at[index, 'word_dup_ratio'] = duplicates / len(words)

In [122]:
# alpha numeric and non alpha numeric char ratio
alphanum = pd.read_csv('sin_letters.txt', header=None)[0].to_numpy()

df['al_num_ratio'] = 0.0
df['non_al_num_ratio'] = 0.0

for index, row in df.iterrows():
    count = 0
    comment = str(row['text_original'])
    com_len = len(comment)
    for ch in comment:
        if ch in alphanum:
            count = count + 1
    df.at[index, 'al_num_ratio'] = count / com_len
    df.at[index, 'non_al_num_ratio'] = (com_len - count) / com_len

In [123]:
# negative word count and ratio
negativewords = pd.read_csv('negative-words_si.txt', header=None)[0].to_numpy()

df['neg_w_count'] = 0
df['neg_w_ratio'] = 0.0

for index, row in df.iterrows():
    count = 0
    tkns = nltk.tokenize.word_tokenize(str(row['text_original']))
    for tk in tkns:
        if tk in negativewords:
            count = count + 1
    df.at[index, 'neg_w_count'] = count
    df.at[index, 'neg_w_ratio'] = count / len(tkns)

In [124]:
# positive word count and ratio
positivewords = pd.read_csv('positive-words_si.txt', header=None)[0].to_numpy()

df['pos_w_count'] = 0
df['pos_w_ratio'] = 0.0

for index, row in df.iterrows():
    count = 0
    tkns = nltk.tokenize.word_tokenize(str(row['text_original']))
    for tk in tkns:
        if tk in positivewords:
            count = count + 1
    df.at[index, 'pos_w_count'] = count
    df.at[index, 'pos_w_ratio'] = count / len(tkns)

In [125]:
user_details = df[df.is_spam == 1][['authorChannelId', 'subscriberCount', 'videoCount', 'channelViewCount']]
user_details

Unnamed: 0,authorChannelId,subscriberCount,videoCount,channelViewCount
1,UCJh8tRprawVFIk7oHbIUNgw,19,4,141
4,UCVFCjr3prM5nWNZRixyPIqg,0,11,117133
6,UClxuGJD4y3fQ6ew21mj0ApA,16,13,331
7,UCMVZFl51gTtuUmO6iav33ew,0,0,0
8,UCX1MlmfPHjcZahA_2IbMH0Q,11,2,419
13,UCdOyi1XOX1iFwvOgJzDAnAA,113,11,833
14,UCjU7UzFjtppLWQNl6s9F8lw,1,8,95
15,UCF7825i1T5FQmCYDPQoKqiw,226,89,20236
16,UC3n9NwEr7MGksjbzEX2-Gww,2390,30,512750
17,UCuZE2E3l8LDEEJhMQJOw0xA,1,4,119


In [130]:
X = df[['has_link', 'has_subscribe', 'has_email', 'has_phone', 'p_jc_similarity', 'jc_similarity', 'likeCount', 'interval', 'no_words', 'comment_length', 'stop_w_ratio', 'no_sentences', 'word_dup_ratio', 'al_num_ratio', 'non_al_num_ratio', 'neg_w_count', 'neg_w_ratio', 'pos_w_count', 'pos_w_ratio']]
y = df['is_spam']
X.head()

Unnamed: 0,has_link,has_subscribe,has_email,has_phone,p_jc_similarity,jc_similarity,likeCount,interval,no_words,comment_length,stop_w_ratio,no_sentences,word_dup_ratio,al_num_ratio,non_al_num_ratio,neg_w_count,neg_w_ratio,pos_w_count,pos_w_ratio
0,1,0,0,0,0.04,0.008491,189,19459.0,10,53,0.21875,1,0.04,0.708333,0.291667,0,0.0,1,0.03125
1,1,0,0,0,0.0,0.080861,0,339711.0,3,64,0.15,3,0.0,0.612903,0.387097,0,0.0,0,0.0
2,1,0,0,0,0.033333,0.021526,0,338050.0,14,61,0.2,5,0.0,0.765957,0.234043,0,0.0,0,0.0
3,1,1,0,0,0.0,0.003782,0,334046.0,10,49,0.173913,1,0.102564,0.785714,0.214286,3,0.065217,0,0.0
4,1,1,0,0,0.026316,0.03359,0,333175.0,18,160,0.042553,4,0.0,0.384615,0.615385,0,0.0,0,0.0


In [189]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
gnb = MultinomialNB()
gnb.fit(x_train, y_train)
pred = gnb.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0], dtype=int64)

In [190]:
np.array(y_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [191]:
gnb.score(x_test, y_test)

0.6666666666666666