In [1]:
# import necessary libraries
import json
import re

from collections import Counter
from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the data from the JSON file
with open('redditGamingData.json', 'r') as jsonFile:
    reddit_data = json.load(jsonFile)
    
with open('youtubeGamingData.json', 'r') as jsonFile:
    youtube_data = json.load(jsonFile)

In [3]:
# Print the number of posts and comments in Reddit data
print("Reddit:")
print(f"Total number of posts: {len(reddit_data)}")
print(f"Total number of comments: {sum(len(submission['comments']) for submission in reddit_data)}")

# Print the number of posts and comments in YouTube data
print("\nYouTube:")
print(f"Total number of posts: {len(youtube_data)}")
print(f"Total number of comments: {sum(len(videos['comments']) for videos in youtube_data)}")

Reddit:
Total number of posts: 833
Total number of comments: 25333

YouTube:
Total number of posts: 70
Total number of comments: 3064


In [4]:
# Function to count the number of words
def count_words(data):
    # initialise the counts with 0
    subWordCount = 0
    comWordCount = 0

    # Iterate through all submission to count the words in each submission
    for submission in data:
        
        # Count words in the title
        subWordCount += len(submission['title'].split())
    
        # count words in the comments
        for comment in submission['comments']:
            comWordCount += len(comment.split())

    print(f"Total number of words in posts: {subWordCount}")
    print(f"Total number of words in comments: {comWordCount}")

In [5]:
# Print the number of words in posts and comments in Reddit data before preprocessing
print("Reddit:")
count_words(reddit_data)

# Print the number of words in posts and comments in YouTube data before preprocessing
print("\nYouTube:")
count_words(youtube_data)

Reddit:
Total number of words in posts: 11800
Total number of words in comments: 1290522

YouTube:
Total number of words in posts: 600
Total number of words in comments: 73878


In [6]:
# Funtion to count the number of urls in the data
def count_urls(data):
    # Regular expression to match URLs
    url_pattern = r'http\S+|www\S+|https\S+'
    
    total_urls = 0

    for submission in data:
        # Count URLs in the title
        total_urls += len(re.findall(url_pattern, submission['title'], flags=re.MULTILINE))
    
        # Count URLs in the comments
        for comment in submission['comments']:
            total_urls += len(re.findall(url_pattern, comment, flags=re.MULTILINE))

    print(f"Total number of URLs in the data: {total_urls}")

In [7]:
# Print the number of urls in posts and comments in Reddit data before preprocessing
print("Reddit:")
count_urls(reddit_data)

# Print the number of urls in posts and comments in YouTube data before preprocessing
print("\nYouTube:")
count_urls(youtube_data)

Reddit:
Total number of URLs in the data: 1468

YouTube:
Total number of URLs in the data: 21


In [8]:
# Function to preprocess text
def preprocess_text(text):
    
    # remove non ascii characters
    if not text.isascii():
        return '' 
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Keep only alphabetic characters and spaces
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(lemmatized_words)

In [9]:
def clean_data(data):
    # Create an empty list to store clean data
    cleanData = []

    # iterate through submission to preprocess the data
    for submission in data:
        # Process title
        title = preprocess_text(submission['title'])
        
        # Process comments
        comments = []
        for comment in submission['comments']:
            comment = preprocess_text(comment)
        
            # Add only if comment is present
            if comment.strip():
                comments.append(comment)
     
        # Add only if title is present
        if title.strip():
            if 'score' in submission:
                submission = {
                    'title': title,
                    'date': submission['date'],
                    'ID' : submission['ID'],
                    'keyword': submission['keyword'],
                    'score': submission['score'],
                    'comments': comments
                }
            else:
                submission = {
                    'title': title,
                    'date': submission['date'],
                    'ID' : submission['ID'],
                    'keyword': submission['keyword'],
                    'comments': comments
                }
        cleanData.append(submission)
    return cleanData

In [10]:
# Preprocess Reddit data
cleanRedditData = clean_data(reddit_data)

# Preprocess YouTube data
cleanYTData = clean_data(youtube_data)

In [11]:
# Print the number of posts and comments in Reddit data after preprocessing
print("Reddit:")
print(f"Total number of posts: {len(cleanRedditData)}")
print(f"Total number of comments: {sum(len(submission['comments']) for submission in cleanRedditData)}")

# Print the number of posts and comments in YouTube data after preprocessing
print("\nYouTube:")
print(f"Total number of posts: {len(cleanYTData)}")
print(f"Total number of comments: {sum(len(videos['comments']) for videos in cleanYTData)}")

Reddit:
Total number of posts: 833
Total number of comments: 19619

YouTube:
Total number of posts: 70
Total number of comments: 2481


In [12]:
# Print the number of words in posts and comments in Reddit data after preprocessing
print("Reddit:")
count_words(cleanRedditData)

# Print the number of words in posts and comments in YouTube data after preprocessing
print("\nYouTube:")
count_words(cleanYTData)

Reddit:
Total number of words in posts: 7536
Total number of words in comments: 568778

YouTube:
Total number of words in posts: 470
Total number of words in comments: 32444


In [13]:
# Print the number of urls in posts and comments in Reddit data before preprocessing
print("Reddit:")
count_urls(cleanRedditData)

# Print the number of urls in posts and comments in YouTube data before preprocessing
print("\nYouTube:")
count_urls(cleanYTData)

Reddit:
Total number of URLs in the data: 159

YouTube:
Total number of URLs in the data: 2
