In [82]:
# import all needed modules
import warnings
warnings.filterwarnings("ignore")

import googleapiclient.discovery
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from keras_preprocessing.sequence import pad_sequences
import regex
import pickle
import pandas as pd
import csv

# notebook-wide pandas display settings: remove the row limit when a DataFrame
# is displayed and allow up to 100 characters per column before truncation
pd.set_option('display.max_rows', None)
pd.options.display.max_colwidth = 100

# import functions to produce measurements of results
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

### GET COMMENTS

In [2]:
# restore API_KEY from IPython's %store persistence (it has to have been saved
# with `%store API_KEY` in an earlier session) so the key is not hardcoded here,
# then set the API service name and version that get_comments passes to
# googleapiclient.discovery.build
%store -r API_KEY
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

In [3]:
# load the previously saved Keras model from the "youtube_spam_model" path;
# the same path is overwritten when the retrained model is saved later on
spam_model = tf.keras.models.load_model("youtube_spam_model")

2023-03-01 23:57:51.042869: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# load the tokenizer that is stored next to the model
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load a tokenizer.pkl that was produced by this project
with open('youtube_spam_model/tokenizer.pkl', 'rb') as tk:
    tokenizer = pickle.load(tk)

In [38]:
# ask the user for the link of the YouTube video whose comments should be analyzed
# (get_comments reads the video id from the "v=" part of this link)
link = input('Input the link to the video to analyze comments from: ')

Input the link to the video to analyze comments from: https://www.youtube.com/watch?v=oe4S-q_gQaU&ab_channel=SpaceXa


In [66]:
# gets the comment for an inputted youtube video link
def get_comments(link):
    """Collect the comments (and the replies returned with them) of a YouTube video.

    Args:
        link: URL of the video; the video id is read from its ``v=`` query
            parameter, so the link has to contain ``v=``.

    Returns:
        A list of comment strings with all non-ASCII characters removed,
        capped at the comment count reported by get_total_comments.

    Raises:
        IndexError: if ``link`` does not contain ``v=``.
    """
    # despite its name, this pattern matches runs of non-ASCII characters,
    # which the helper functions strip from every comment
    alphanum_regex = r'[^\x00-\x7F]+'

    youtube = googleapiclient.discovery.build(API_SERVICE_NAME, API_VERSION, developerKey = API_KEY)

    # keep only the value of the "v" parameter; any parameter that follows it
    # (ab_channel, t, list, ...) is dropped
    video_id = link.split('v=')[1].split('&')[0]

    count = 0
    total = get_total_comments(youtube, video_id)
    keep_going = True

    response = youtube.commentThreads().list(
        part = 'snippet,replies',
        videoId = video_id).execute()

    comments = []
    while response and keep_going:
        for item in response['items']:
            comments.append(get_single_comment(item, alphanum_regex))
            keep_going, count = progress_tracker(count, total)

            has_replies = item['snippet']['totalReplyCount'] > 0 and 'replies' in item
            if keep_going and has_replies:
                keep_going, count, replies = get_all_replies(item, alphanum_regex, count, total)
                comments.extend(replies)

            # stop working through the page once enough comments were collected
            if not keep_going:
                break

        # the next page has to be requested with the token of the current one,
        # otherwise the API returns the first page again and comments repeat
        next_page_token = response.get('nextPageToken')
        if keep_going and next_page_token:
            response = youtube.commentThreads().list(
                part = 'snippet,replies',
                videoId = video_id,
                pageToken = next_page_token).execute()
        else:
            keep_going = False

    # the trackers allow one item more than total, so cut the list to size
    return comments[:total]

In [57]:
# gets the total number of comments for a video_id
def get_total_comments(youtube, video_id):
    """Return the comment count that the API reports for a video.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_id: id of the video to look up.

    Returns:
        The ``commentCount`` statistic of the first returned item, as an int.
    """
    request = youtube.videos().list(part = 'statistics', id = video_id)
    statistics = request.execute()['items'][0]['statistics']

    # the count is converted because the caller does integer arithmetic with it
    return int(statistics['commentCount'])

In [67]:
# gets a single comment from the response
def get_single_comment(item, alphanum_regex):
    """Return the text of a thread's top-level comment with pattern matches removed.

    Args:
        item: one entry of a commentThreads response's ``items`` list.
        alphanum_regex: pattern whose matches are deleted from the text.

    Returns:
        The comment's ``textDisplay`` value after the substitution.
    """
    top_level_snippet = item['snippet']['topLevelComment']['snippet']
    return regex.sub(alphanum_regex, '', top_level_snippet['textDisplay'])

In [68]:
# gets all the replies of a comment
def get_all_replies(item, alphanum_regex, count, total):
    """Collect the replies that are embedded in a comment thread item.

    Only the replies present under ``item['replies']['comments']`` are read.
    NOTE(review): the API documentation says this embedded list can be a subset
    of a thread's replies, so this may not return every reply - confirm.

    Args:
        item: one entry of a commentThreads response's ``items`` list.
        alphanum_regex: pattern whose matches are deleted from each reply.
        count: number of comments collected so far.
        total: number of comments to collect overall.

    Returns:
        A tuple ``(keep_going, count, replies)``: whether collection should
        continue, the updated count, and the list of filtered reply texts.
    """
    replies = []
    # same condition progress_tracker applies; defined up front so the return
    # value exists even when there is no reply to loop over
    keep_going = count < total + 1

    # a thread may arrive without a 'replies' entry or with an empty list
    for reply in item.get('replies', {}).get('comments', []):
        reply_text = reply['snippet']['textDisplay']
        filtered_reply_text = regex.sub(alphanum_regex, '', reply_text)
        replies.append(filtered_reply_text)
        keep_going, count = progress_tracker(count, total)

        if not keep_going:
            break

    return keep_going, count, replies

In [69]:
# tracks the status of getting the comments
def progress_tracker(count, total):
    """Count one more collected comment and report the progress.

    Args:
        count: number of comments collected before this one.
        total: number of comments to collect overall.

    Returns:
        A tuple ``(keep_going, count)`` with the incremented count;
        ``keep_going`` is False once the count has gone past ``total``.
    """
    count += 1

    # past the target: nothing is printed and the caller is told to stop
    if count >= total + 1:
        return False, count

    ratio = count / total
    pct = round(ratio * 100, 2)
    print(f'Progress: {count}/{total}, {pct}% Completed')
    return True, count

In [70]:
# call get_comments with the inputted link and keep the result as a list of
# strings (one per comment or reply); progress is printed while fetching
comments = get_comments(link)

Progress: 1/471, 0.21% Completed
Progress: 2/471, 0.42% Completed
Progress: 3/471, 0.64% Completed
Progress: 4/471, 0.85% Completed
Progress: 5/471, 1.06% Completed
Progress: 6/471, 1.27% Completed
Progress: 7/471, 1.49% Completed
Progress: 8/471, 1.7% Completed
Progress: 9/471, 1.91% Completed
Progress: 10/471, 2.12% Completed
Progress: 11/471, 2.34% Completed
Progress: 12/471, 2.55% Completed
Progress: 13/471, 2.76% Completed
Progress: 14/471, 2.97% Completed
Progress: 15/471, 3.18% Completed
Progress: 16/471, 3.4% Completed
Progress: 17/471, 3.61% Completed
Progress: 18/471, 3.82% Completed
Progress: 19/471, 4.03% Completed
Progress: 20/471, 4.25% Completed
Progress: 21/471, 4.46% Completed
Progress: 22/471, 4.67% Completed
Progress: 23/471, 4.88% Completed
Progress: 24/471, 5.1% Completed
Progress: 25/471, 5.31% Completed
Progress: 26/471, 5.52% Completed
Progress: 27/471, 5.73% Completed
Progress: 28/471, 5.94% Completed
Progress: 29/471, 6.16% Completed
Progress: 30/471, 6.37% Co

Progress: 267/471, 56.69% Completed
Progress: 268/471, 56.9% Completed
Progress: 269/471, 57.11% Completed
Progress: 270/471, 57.32% Completed
Progress: 271/471, 57.54% Completed
Progress: 272/471, 57.75% Completed
Progress: 273/471, 57.96% Completed
Progress: 274/471, 58.17% Completed
Progress: 275/471, 58.39% Completed
Progress: 276/471, 58.6% Completed
Progress: 277/471, 58.81% Completed
Progress: 278/471, 59.02% Completed
Progress: 279/471, 59.24% Completed
Progress: 280/471, 59.45% Completed
Progress: 281/471, 59.66% Completed
Progress: 282/471, 59.87% Completed
Progress: 283/471, 60.08% Completed
Progress: 284/471, 60.3% Completed
Progress: 285/471, 60.51% Completed
Progress: 286/471, 60.72% Completed
Progress: 287/471, 60.93% Completed
Progress: 288/471, 61.15% Completed
Progress: 289/471, 61.36% Completed
Progress: 290/471, 61.57% Completed
Progress: 291/471, 61.78% Completed
Progress: 292/471, 62.0% Completed
Progress: 293/471, 62.21% Completed
Progress: 294/471, 62.42% Comple

### CLEAN AND PREDICT

In [71]:
# drop comments that are empty (e.g. nothing was left after the non-ASCII filter)
comments = list(filter(len, comments))

In [72]:
# encode each comment as a sequence of integers using the tokenizer that was
# loaded from youtube_spam_model/tokenizer.pkl
# presumably words the tokenizer has not seen are dropped or mapped to an
# out-of-vocabulary index, depending on how it was fitted - TODO confirm
encoded_comments = tokenizer.texts_to_sequences(comments)

In [73]:
# bring every sequence to the same length of 10: shorter ones are padded at the
# end (padding = 'post'), longer ones are cut down to 10 entries
# NOTE(review): no truncating argument is passed, so pad_sequences' default
# applies (presumably 'pre', i.e. the start of long comments is dropped) -
# confirm this matches how the model was trained
text_size = 10
padded_comments = pad_sequences(encoded_comments, maxlen = text_size, padding = 'post')

In [74]:
# use the model to predict the comments: outputs above 0.5 become 1, all
# others 0 (presumably 1 means spam, going by the SPAM column they fill later)
spam_preds = (spam_model.predict(padded_comments) > 0.5).astype("int32")



In [76]:
# flatten the nested predictions into one flat list of values
# (spam_preds presumably holds one single-value row per comment - TODO confirm)
preds = [pred_value for each_pred in spam_preds for pred_value in each_pred]

In [77]:
# create dataframe with one row per comment: its text (CONTENT) and the
# model's prediction for it (SPAM); both lists must have the same length
comment_pred_df = pd.DataFrame({'CONTENT' : comments, 'SPAM' : preds}, columns = ['CONTENT', 'SPAM'])

In [83]:
# display the comments together with their predicted SPAM values
comment_pred_df

Unnamed: 0,CONTENT,SPAM
0,Awesome,0
1,so funny Starlink Mission,0
2,I cant wait to finish joining the space force it will be life changing,0
3,I saw the landing booster as it came into orbit. It looked like an odd ufo throwing a plume of s...,0
4,We need starlink in Pennsylvania,0
5,I wonder why russian bootlicker Space Karen(aka Elon Musk) turned off/slowed down Starlinks in U...,0
6,How in the world did that thing land in the perfect spot?! I hear that it&#39;s called &quot;Sta...,0
7,Congratulations SpaceX,0
8,Im tired of lagging on fifa so Im here,0
9,The best footage in this Verse!,0


In [79]:
# append the comments and the model's predictions to spam_preds.csv
# (rows only: no index column and no header row are written)
# NOTE(review): the summary-stats cell reads this file with pd.read_csv and
# selects the CONTENT and SPAM columns, so the file presumably has to exist
# already with a header row - confirm. Re-running this cell appends the rows again.
comment_pred_df.to_csv('spam_preds.csv', mode = 'a', index = False, header = False)

### USER CORRECTION

In [80]:
# display the dataframe so the user can look up the row indexes of wrong
# predictions; those indexes are what fix_spam expects
comment_pred_df

Unnamed: 0,CONTENT,SPAM
0,Awesome,0
1,so funny Starlink Mission,0
2,I cant wait to finish joining the space force ...,0
3,I saw the landing booster as it came into orbi...,0
4,We need starlink in Pennsylvania,0
5,I wonder why russian bootlicker Space Karen(ak...,0
6,How in the world did that thing land in the pe...,0
7,Congratulations SpaceX,0
8,Im tired of lagging on fifa so Im here,0
9,The best footage in this Verse!,0


In [81]:
# fix the spam value for all indexes inputted
def fix_spam(indexes, df = None):
    """Flip the SPAM value of the given rows.

    A value of 1 becomes 0 and any other value becomes 1. The frame is
    modified in place.

    Args:
        indexes: iterable of row index labels whose prediction was wrong.
        df: DataFrame with a 'SPAM' column to correct; defaults to the
            notebook's comment_pred_df when omitted.

    Returns:
        None.
    """
    if df is None:
        df = comment_pred_df

    for index in indexes:
        # a single .loc call addresses the cell directly; chained indexing
        # (df['SPAM'][index] = ...) may assign to a temporary copy instead
        df.loc[index, 'SPAM'] = 0 if df.loc[index, 'SPAM'] == 1 else 1

In [None]:
# put the row indexes of every wrong prediction into the list, for example
# fix_spam([2, 4, 8, 9]) if rows 2, 4, 8, and 9 were predicted wrong;
# an empty list changes nothing
fix_spam([])

In [None]:
# display the dataframe again so the user can verify the corrected SPAM values
comment_pred_df

In [None]:
# append the corrected comments, which the model is retrained on below, to
# spam_comments.csv
# a PREDICTED column set to True is added first; the summary-stats cell uses
# it to select these rows again
# NOTE(review): rows are written without a header as CONTENT, SPAM, PREDICTED,
# while the summary-stats cell reads a CLASS column from this file - presumably
# the existing header names the second column CLASS; confirm the column order.
# Re-running this cell appends the same rows again.
comment_pred_df['PREDICTED'] = True
comment_pred_df.to_csv('spam_comments.csv', mode = 'a', index = False, header = False)

### TRAIN MODEL WITH CORRECTED DATA

In [None]:
# set early stopping when the model is not improving anymore: training ends
# once the training loss has not decreased for 10 consecutive epochs
# NOTE: fit() below receives no validation data, so no 'val_loss' metric is
# produced; monitoring it would leave early stopping without effect
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=10)

# train model with the corrected data: the padded sequences are the inputs and
# the (user-corrected) SPAM column holds the labels
spam_model.fit(x = padded_comments, y = comment_pred_df['SPAM'], epochs = 50,
               verbose = 1, callbacks=[early_stop])

In [None]:
# save the retrained model to the same path it was loaded from, replacing the
# previously stored version
spam_model.save("youtube_spam_model")

In [None]:
# write the tokenizer back next to the model, using the highest pickle protocol
with open('youtube_spam_model/tokenizer.pkl', 'wb') as output:
    pickle.dump(tokenizer, output, protocol = pickle.HIGHEST_PROTOCOL)

### CALCULATE SUMMARY STATS

In [None]:
# previous model predictions: keep only the comment text and the predicted value
pred_df = pd.read_csv('spam_preds.csv')
pred_content_df = pred_df.loc[:, ['CONTENT', 'SPAM']]

# previous true spam values: rows flagged as PREDICTED, reduced to the comment
# text and its class
comments_df = pd.read_csv('spam_comments.csv')
spam_content_df = comments_df.loc[comments_df['PREDICTED'] == True, ['CONTENT', 'CLASS']]

In [None]:
# order both dataframes by comment text so that the row at each position of one
# dataframe lines up with the row at the same position of the other
pred_content_df = pred_content_df.sort_values(by = 'CONTENT', ascending = True)
spam_content_df = spam_content_df.sort_values(by = 'CONTENT', ascending = True)

In [None]:
# report how often the stored predictions agree with the true classes
# (the two columns are compared position by position)
model_accuracy = accuracy_score(spam_content_df['CLASS'], pred_content_df['SPAM'])
print(f"Accuracy: {model_accuracy}")