In [1]:
# Importing necessary libraries. 
import pandas as pd
import numpy as np
from datetime import datetime
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import random
import scipy.sparse
from datasets import load_dataset, Dataset
import gensim
from collections import Counter
from textblob import TextBlob

import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer



In [2]:
# Download the NLTK data packages this notebook relies on.
# Each call only refreshes the local nltk_data directory; when a package is
# already present NLTK just reports it as up to date.
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec22283/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec22283/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec22283/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /home/ec22283/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec22283/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the raw comments table from its CSV export.
comments = pd.read_csv(filepath_or_buffer="comments.csv")

In [4]:
numbers = list("0123456789")
def num(word):
    """Return True when at least one character of ``word`` is a digit, else False.

    An empty ``word`` contains no digits, so it yields False.
    """
    return any(character.isdigit() for character in word)

In [5]:
# Define process_text function.
# The tokenizer treats runs of whitespace as gaps, i.e. it splits on whitespace only.
# The pattern is a raw string so that "\s" is not parsed as an (invalid) string escape.
tokenizer = RegexpTokenizer(r'\s+', gaps = True)
lem = lemmatizer = WordNetLemmatizer()
def process_text(document):
    """Normalise one comment into a list of lemmatized tokens.

    Steps: lowercase, split on whitespace, drop tokens containing a digit,
    drop single-character tokens, lemmatize what remains.

    NOTE(review): punctuation is NOT stripped here -- the tokenizer only splits
    on whitespace, so a token such as "hello," keeps its comma. Confirm whether
    that is intended.

    Parameters
    ----------
    document : str
        Raw comment text. Any non-string value (e.g. the float NaN that
        pandas.read_csv produces for an empty cell) is treated as an empty
        document.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing bodies would otherwise crash on .lower(); returning an empty list keeps
    # the output aligned one-to-one with the input comments.
    if not isinstance(document, str):
        return []

    # Convert to lowercase and split on whitespace.
    tokens = tokenizer.tokenize(document.lower())
    # Keep tokens that are longer than one character and contain no digits,
    # then lemmatize them.
    return [
        lem.lemmatize(token)
        for token in tokens
        if len(token) > 1 and not num(token)
    ]

In [6]:
# Extract the "body" column of the comments table as a plain Python list.
working_text = comments["body"].tolist()

In [7]:
# Run every comment through process_text, with a progress bar.
processed_text = list(map(process_text, tqdm(working_text)))

  0%|          | 0/4600698 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Re-assemble each token list into a single space-separated string.
processed_join = list(map(" ".join, tqdm(processed_text)))

In [None]:
# Count, for each word, the number of comments it appears in.
# Every comment is reduced to a set first, so a word repeated inside one comment is
# counted once: the result is document frequency, not total corpus frequency.
counter = Counter(
    word
    for tokens in tqdm(processed_text)
    for word in set(tokens)
)


In [None]:
# List every (word, count) pair, most frequent first.
ordered_counter = counter.most_common(n=None)

In [None]:
# Collect the words that are too rare (found in fewer than 100 comments) and the
# words that are too common (found in more than 40% of all comments).
rare_cutoff = 100
common_cutoff = len(processed_text)*0.4
under_list = [term for (term, doc_freq) in ordered_counter if doc_freq < rare_cutoff]
over_list = [term for (term, doc_freq) in ordered_counter if doc_freq > common_cutoff]

In [None]:
# Importing gensim processing functions. 
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
# Generating custom stopword list.
# "___" used to be written with a trailing space ("___ "). remove_stopwords compares
# whitespace-split tokens against the stopword set, and such a token can never contain
# a space, so that entry never matched anything.
# NOTE(review): "httml" looks like a typo for "html" -- confirm before changing it.
custom_stop = ["www", "https", "com", "http", "people", "like", "gt", "___", "amp", "org", "ve", "en", "httml", "np", "pdf"]
# Combine Gensim's default stopwords with the rare words, the over-frequent words and
# the custom list into one stopword set.
all_stopwords_gensim = STOPWORDS.union(under_list, over_list, custom_stop)

In [None]:
# Strip every stopword from each joined comment string.
process = list(
    remove_stopwords(joined_comment, stopwords = all_stopwords_gensim)
    for joined_comment in tqdm(processed_join)
)

In [None]:
# Break each cleaned comment string back into a list of words.
split_process = list(map(str.split, tqdm(process)))

In [None]:
# Cut the corpus into four consecutive batches: three of one million comments each,
# and a fourth holding whatever remains.
noun_one, noun_two, noun_three, noun_four = (
    split_process[:1_000_000],
    split_process[1_000_000:2_000_000],
    split_process[2_000_000:3_000_000],
    split_process[3_000_000:],
)

In [None]:
# Display the total number of comments across the four batches.
sum(map(len, (noun_one, noun_two, noun_three, noun_four)))

In [None]:
# Sanity checks: the last element of each batch must equal the corresponding element
# of the original list, and the batches together must account for every comment.
assert noun_one[-1] == split_process[1_000_000 - 1]
assert noun_two[-1] == split_process[2_000_000 - 1]
assert noun_three[-1] == split_process[3_000_000 - 1]
assert noun_four[-1] == split_process[-1]
assert sum(map(len, (noun_one, noun_two, noun_three, noun_four))) == len(split_process)

In [None]:
# Noun filter: keeps only the words that TextBlob tags as "NN".
def nouns_only(text):
    """Return the words of ``text`` whose TextBlob part-of-speech tag is exactly "NN".

    Words carrying any other tag are dropped; the order of the kept words follows
    the order of ``TextBlob(text).tags``.
    """
    tagged_words = TextBlob(text).tags
    return [token for token, pos_tag in tagged_words if pos_tag == "NN"]


In [None]:
# Apply noun removal function and upload batch to huggingface.
# This whole cell is disabled (commented out). As written it would: run nouns_only over
# every comment in noun_one, push the result to the Hub as "bartoszmaj/nouns_one", then
# load that dataset back and store its "nouns" column in nouns_one.
#nouns_one_processed = [nouns_only(" ".join(doc)) for doc in tqdm(noun_one)]
#processed_dataset = Dataset.from_pandas(pd.DataFrame({"nouns":nouns_one_processed}))
#processed_dataset.push_to_hub("bartoszmaj/nouns_one")

#import_data = load_dataset("bartoszmaj/nouns_one")
#import_df = pd.DataFrame({"body":import_data["train"]["nouns"]})
#nouns_one = import_df["body"].to_list()

In [None]:
# Disabled (commented out): noun-filter batch noun_two, push it to the Hub as
# "bartoszmaj/nouns_two", then load it back into nouns_two.
#nouns_two_processed = [nouns_only(" ".join(doc)) for doc in tqdm(noun_two)]
#processed_dataset = Dataset.from_pandas(pd.DataFrame({"nouns":nouns_two_processed}))
#processed_dataset.push_to_hub("bartoszmaj/nouns_two")

#import_data = load_dataset("bartoszmaj/nouns_two")
#import_df = pd.DataFrame({"body":import_data["train"]["nouns"]})
#nouns_two = import_df["body"].to_list()

In [None]:
# Disabled (commented out): noun-filter batch noun_three, push it to the Hub as
# "bartoszmaj/nouns_three", then load it back into nouns_three.
#nouns_three_processed = [nouns_only(" ".join(doc)) for doc in tqdm(noun_three)]
#processed_dataset = Dataset.from_pandas(pd.DataFrame({"nouns":nouns_three_processed}))
#processed_dataset.push_to_hub("bartoszmaj/nouns_three")

#import_data = load_dataset("bartoszmaj/nouns_three")
#import_df = pd.DataFrame({"body":import_data["train"]["nouns"]})
#nouns_three = import_df["body"].to_list()

In [None]:
# Disabled (commented out): noun-filter batch noun_four, push it to the Hub as
# "bartoszmaj/nouns_four", then load it back into nouns_four.
#nouns_four_processed = [nouns_only(" ".join(doc)) for doc in tqdm(noun_four)]
#processed_dataset = Dataset.from_pandas(pd.DataFrame({"nouns":nouns_four_processed}))
#processed_dataset.push_to_hub("bartoszmaj/nouns_four")

#import_data = load_dataset("bartoszmaj/nouns_four")
#import_df = pd.DataFrame({"body":import_data["train"]["nouns"]})
#nouns_four = import_df["body"].to_list()

In [None]:
# Combine all processed nouns
# No active cell in this notebook assigns nouns_one .. nouns_four (the only assignments
# sit in commented-out code), so on a fresh kernel concatenating them raises NameError.
# Load the four batches here from the Hub datasets that the commented-out code
# references, then concatenate them in batch order.
# NOTE(review): this assumes those four datasets exist on the Hub and are reachable.
def _load_noun_batch(repo_id):
    """Return the "nouns" column of the "train" split of Hub dataset ``repo_id`` as a list."""
    return list(load_dataset(repo_id)["train"]["nouns"])

nouns_one = _load_noun_batch("bartoszmaj/nouns_one")
nouns_two = _load_noun_batch("bartoszmaj/nouns_two")
nouns_three = _load_noun_batch("bartoszmaj/nouns_three")
nouns_four = _load_noun_batch("bartoszmaj/nouns_four")

nouns_imported = nouns_one + nouns_two + nouns_three + nouns_four

In [None]:
# Upload processed comments to huggingface.
# Disabled (commented out): would wrap nouns_imported in a Dataset with a single "nouns"
# column and push it to the Hub as "bartoszmaj/nouns_full".
#nouns_full = Dataset.from_pandas(pd.DataFrame({"nouns":nouns_imported}))
#nouns_full.push_to_hub("bartoszmaj/nouns_full")