In [278]:
import csv 
import os
   
import pandas as pd 
import spacy 
# spacy.cli.download("pt_core_news_sm")
# spacy.cli.download("es_core_news_sm")

import requests 
from bs4 import BeautifulSoup
from dataclasses import dataclass

#Sentence Tokenization using sent_tokenize
import nltk
# nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import word_tokenize


#Detect language using detect_langs
import langdetect
from langdetect import detect_langs

#Detect language using Lingua
from lingua import Language, LanguageDetectorBuilder

import re

In [279]:

"""
First, I'm gonna get the entire corpus from the "Reddit Post Parsed" folder.
"""
#replace as needed
folder_name = "Election Day 2020"
file_path = os.path.join(folder_name, "log.csv")

all_post_titles = []
expected_no_comments = 0
corpus = ""
comment_urls = []
all_links = []
og_posts = []
comments_only = []

with open(file_path, mode = 'r') as file:
    link_column = []
    title_column = []
    comments_column = []
    all_no_comments = []
    csvFile = csv.reader(file)
    for line in csvFile:
        title_column.append(line[2])
        all_post_titles = title_column[1:]

        comments_column.append(line[9])
        all_no_comments = comments_column[1:]

        link_column.append(line[3])
        all_links = link_column[1:]
        
    for number in all_no_comments:
        expected_no_comments += int(number)

@dataclass 
class Post:
    post: str
    likes: int
    is_og: bool
    id: int
    length: int
    sentiment: str
    adj: str
    adj_count: int
    profanity_score: float

post_data = []

#loop to open all post titles in create one big corpus of all comments
def create_corpus(titles: list) -> str:
    """
    This function takes in a list of posts titles in the 
    folder "Reddit Post Parsed" and loops through each 
    csv file to filter for proper comments, that are not urls
    and deleted to return the corpus.

    Comments that are just links will be 
    appended to the list "comment_urls"!
    """
    global corpus
    global comment_urls
    global folder_name

    count_proper_comments = 0
    no_deleted_comments = 0
    empty = ""
    list_of_comments = []
    id = 1
    
    base_folder = folder_name
    for title in titles:
        count = 1
        title_csv = os.path.join(base_folder, title + "'s post.csv")
        if not os.path.isfile(title_csv):
            print(f"File '{title_csv}' not found.")
            continue
        with open(title_csv, mode='r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            for row in csv_reader:
                likes = row[7]
                post = empty.join(row[9:]).strip()
                list_of_comments.append(post)
                if count == 2:
                    og_posts.append(post)
                    post_data.append(Post(post, likes, True, id, len(post), "", "", 0, 0))
                if count > 2:
                    comments_only.append(post)
                    post_data.append(Post(post, likes, False, id, len(post), "", "", 0, 0))
                count +=1
        id += 1

    # print(list_of_comments)
    # print(f'OG Posts: {og_posts}')

    for comment in list_of_comments:
        if comment.strip() != "Body":
            if comment.strip() == '"deleted"' or comment.strip() == '"removed"':
                no_deleted_comments +=1
                comment = ""
            if comment.strip().startswith('"https:'):
                comment_urls.append(comment.replace('"', "").strip())
            else:
                count_proper_comments += 1 
                corpus = corpus + " " + comment.replace("**", "").replace("#", "").strip()[1:-1] 
    print(f'Number of comments yielded for the corpus (that are not urls or deleted): {count_proper_comments}.') 
    print(f'Number of removed/deleted comments (has been filetered from corpus): {no_deleted_comments}.\n')                  
                
create_corpus(all_post_titles)
print(post_data)

Number of comments yielded for the corpus (that are not urls or deleted): 161.
Number of removed/deleted comments (has been filetered from corpus): 8.



In [280]:
@dataclass
class entities:
    name: str
    label: str

#strip out the pronouns, conjunctions, etc.!
# f = open('stop words.txt', 'r')
# stopwords = f.read()
# stopwords = stopwords.split('\n')

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')

# Load the spaCy English & Portuguese models
en_nlp = spacy.load("en_core_web_sm")
pt_nlp = spacy.load('pt_core_news_sm')
pd.set_option("display.max_rows", 200)

#separate into tokenized sentences
tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')
sentences_token = tokenizer.tokenize(corpus)
sentences = []
for sentence in sentences_token:
    if sentence.strip() not in stopwords:
        sentences.append(sentence)

#separate corpus in words
words_token = word_tokenize(corpus)
words = []
#remove any conjunctions, articles, particles, etc.
for word in words_token:
    if word.lower().strip() not in stopwords:
        words.append(word)

def checkW(x: int):
    return (x/len(words))*100

def checkS(x: int):
    return (x/len(sentences))*100

f.close()

print(f'Amount of total sentence tokens: {len(sentences)}.')
print(f'Amount of total word token: {len(words)}.\n')

  tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')


Amount of total sentence tokens: 378.
Amount of total word token: 3256.



In [281]:
# METHOD 3: Lingua sentence by sentence

#import English, Portuguese, Spanish detector
languages = [Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentences = []

discarded_l = 0
for sentence in sentences:
    try:
        en_l = detector.compute_language_confidence(sentence, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(sentence, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(sentence, Language.SPANISH)
        if en_l > 0.8:
            english_sentences.append(sentence)
        elif pt_l > 0.8:
            portuguese_sentences.append(sentence)
        elif es_l > 0.8:
            spanish_sentences.append(sentence)
        else:
            mixed_sentences.append(sentence)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discarded_l +=1
        continue

# print("3. Lingua sentence by sentence")
# print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
# print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
# print(f'Spanish sentences: {len(spanish_sentences)} - {checkS(len(spanish_sentences)):.2f}%.')
# print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
# print(f'Discarded: {discarded_l} - {checkS(discarded_l):.2f}%.')
# print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentences)+ len(mixed_sentences)):.2f}%.\n')

# METHOD 4: Lingua word by word

en_w = []
pt_w = []
es_w = []
mixed_w = []

discard_w = 0
for word in words:
    try:
        en_l = detector.compute_language_confidence(word, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(word, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(word, Language.SPANISH)
        if en_l > 0.5:
            en_w.append(word)
        elif pt_l > 0.5:
            pt_w.append(word)
        elif es_l > 0.5:
            es_w.append(word)
        else:
            mixed_w.append(word)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discard_w +=1
        continue

# print("Lingua word by word")
# print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
# print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
# print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
# print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
# print(f'Discarded: {discard_w} - {checkW(discard_w):.2f}%.')
# print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')

# print(f'English sentences: {english_sentences}')
# print(f'Portuguese sentences: {portuguese_sentences}')
# print(f'Spanish sentences: {spanish_sentences}')
# print(f'Mixed sentences: {mixed_sentences}')

# print(f'English words: {en_w}')
# print(f'Portuguese words: {pt_w}')
# print(f'Spanish words: {es_w}')
# print(f'Mixed words: {mixed_w}')

In [282]:
total_sentences = english_sentences + mixed_sentences

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')


def filter(words: list):
    result = []
    for word in words:
        if word.lower().strip() not in stopwords and len(word) != 1 and word.lower() not in result:
            result.append(word)
    return result

english_words = filter(en_w)
portuguese_words = filter(pt_w)
spanish_words = filter(es_w)
mixed_words = filter(mixed_w)

f.close()

In [283]:
@dataclass
class entities:
    name: str
    label: str

# #strip out the pronouns, conjunctions, etc.!
# f = open('stop words.txt', 'r')
# stopwords = f.read()
# stopwords = stopwords.split('\n')

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')

# Load the spaCy English & Portuguese models
en_nlp = spacy.load("en_core_web_sm")
pt_nlp = spacy.load('pt_core_news_sm')

pd.set_option("display.max_rows", 200)

PeopleS = []
PeopleW = []

en_doc = en_nlp(" ".join(total_sentences))
pt_doc = pt_nlp(" ".join(spanish_sentences) + " ".join(portuguese_sentences))

for ent in en_doc.ents:
    # The output displayed the names of the entities and their predicted labels.
    if ent.text not in PeopleS and ent.label_ == 'PERSON':
        # PeopleS.append(entities(ent.text, ent.label_))
        PeopleS.append(ent.text)

print(PeopleS)

['covid19', 'Lindsay Graham', 'Biden', 'Trumps', 'https://www.nytimes.com/interactive/2020/11/03/us/elections/results-minnesota-house-district-5.html  Lmao', 'Emmet Sullivan', 'John Kruzel', 'Bill Pascrell Jr.', 'Haul Louis DeJoy', 'Louis DeJoy', 'Jo Jorgensen', 'Facebook', 'https', 'Trump', 'Lyndon B. Johnson', 'Mark Twain', 'STEM degrees', 'Vaccine', 'https://www.newsweek.com/coronavirus-donald-trump-covid-vaccine-approval-election-day-promise-1544418', 'Joe Biden', 'Beaus', 'Joe', 'Beau', 'Lindsey Graham', 'Parks', 'Rec', 'Fox', 'Covid', 'Don Jr', 'Hillary', 'Carter', 'Jimmy Carter', 'Bezos', 'Buffett', 'Susan Collins', 'https://www.salon.com/2020/11/02/president-trump-tells-advisers-that-he-fears-prosecution-if-he-loses-the-election-report/', 'LeBron James', 'Michael Bloomberg', 'Michael Bloomberg LeBron James', 'Lawrence Mower', 'Langston Taylor', 'Donald Trump', 'LeBron', 'Dixville Notch', 'Ilhan Omar', 'Fuck Facebook', '\u200b.', '\u200b. Change']


In [284]:
pattern = '|'.join(re.escape(people) for people in PeopleS)

# Use re.sub to remove the phrases from the corpus
cleaned_corpus = re.sub(pattern, '', corpus)

corpus = en_nlp(cleaned_corpus)
noun_chunks = []
for chunk in corpus.noun_chunks:
    noun_chunks.append(chunk.text)


unwanted_patterns = r'\b(my|them|me|everyone|our|even|him|her|us|itself|people|a|an|the|he|she|it|i|you|we|they|his|her|hers|its|their|theirs|this|that|these|those|there|where|who|whom|which|what|when|why|how|am|is|are|was|were|be|been|being|have|has|had|do|does|did|will|would|shall|should|can|could|may|might|must|ought|and|but|or|nor|for|yet|so|because|as|if|once|since|unless|until|while|although|though|after|before|until|by|on|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|not|only|own|same|so|than|too|very|s|t|can|will|just|don|should|now)\b|[.,!?;:[]()]'

filtered_words = []
for noun in noun_chunks:
    filtered_phrase = ' '.join(word for word in re.split(r'\s+', noun) if not re.fullmatch(unwanted_patterns, word, re.I))
    if filtered_phrase:  # Ensure it's not empty
        filtered_words.append(filtered_phrase)

leftover = []
for word in filtered_words:
    en_word = en_nlp(word)
    for ent in en_word.ents:
        #Should I add in 'ORG' tags? 'Trump' is flagged as ORG, but so does 'Congress', 'Rulers', 'FAQ'
        if ent.text not in leftover and ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            # print(ent.text + " " + ent.label_)
            leftover.append(ent.text)


In [285]:
print(f'List of influencers: {PeopleS + leftover}')

filtered_words = [word for word in filtered_words if word not in leftover]

# Print the updated list
# print(f'Final leftover words: {filtered_words}')

List of influencers: ['covid19', 'Lindsay Graham', 'Biden', 'Trumps', 'https://www.nytimes.com/interactive/2020/11/03/us/elections/results-minnesota-house-district-5.html  Lmao', 'Emmet Sullivan', 'John Kruzel', 'Bill Pascrell Jr.', 'Haul Louis DeJoy', 'Louis DeJoy', 'Jo Jorgensen', 'Facebook', 'https', 'Trump', 'Lyndon B. Johnson', 'Mark Twain', 'STEM degrees', 'Vaccine', 'https://www.newsweek.com/coronavirus-donald-trump-covid-vaccine-approval-election-day-promise-1544418', 'Joe Biden', 'Beaus', 'Joe', 'Beau', 'Lindsey Graham', 'Parks', 'Rec', 'Fox', 'Covid', 'Don Jr', 'Hillary', 'Carter', 'Jimmy Carter', 'Bezos', 'Buffett', 'Susan Collins', 'https://www.salon.com/2020/11/02/president-trump-tells-advisers-that-he-fears-prosecution-if-he-loses-the-election-report/', 'LeBron James', 'Michael Bloomberg', 'Michael Bloomberg LeBron James', 'Lawrence Mower', 'Langston Taylor', 'Donald Trump', 'LeBron', 'Dixville Notch', 'Ilhan Omar', 'Fuck Facebook', '\u200b.', '\u200b. Change', 'I´m', 'WI

In [286]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Define a function to perform sentiment analysis
def analyze_sentiment(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Perform inference
    outputs = model(**inputs)
    logits = outputs.logits

    # Apply softmax to get probabilities
    probs = torch.softmax(logits, dim=1).detach().numpy()[0]

    # Determine sentiment label
    label_mapping = {
        0: "Very negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Very positive"
    }
    sentiment_label = label_mapping[int(probs.argmax())]

    return sentiment_label, probs

negative_sentences = []
positive_sentences = []

for post in post_data:
    # Perform sentiment analysis
    text = post.post
    sentiment, probabilities = analyze_sentiment(text)
    if text.startswith('"http') or text == '"[deleted]"' or text == '"deleted"':
        post.sentiment = 'Undefined'
    else:
        post.sentiment = sentiment

# non_url_posts = [post for post in post_data if not post.post.startswith('"http')]


In [287]:
from spacy.matcher import Matcher

matcher = Matcher(en_nlp.vocab)

patterns = [
    [{'POS':'ADJ'}, {'POS':'NOUN'}],
    ]
matcher.add("demo", patterns)

for post in post_data:
    doc = en_nlp(post.post)
    x = ""
    count = 0
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = en_nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        x = x + span.text + ", "
        count += 1
        # print(x + " " + str(count))
    # Remove the last comma and space
    post.adj = x[:-2]  # Remove the last comma and space
    post.adj_count = count

In [288]:
from profanity_check import predict, predict_prob

for post in post_data:
    score = predict_prob([post.post])
    for s in score:
        post.profanity_score = s
# print(post_data)

In [292]:
# Define the path to save the CSV file
csv_file_path = folder_name + " Post-NLP.csv"

# Define the fieldnames for the CSV file
fieldnames = ["Post", "Is OG", "ID", "Likes", "Length", "Sentiment", "Profanity Score", "ADJ Count", "ADJ"]

# Write the data to the CSV file
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    # Write the header row
    writer.writeheader()
    
    # Write each post as a row in the CSV file
    for post in post_data:
        writer.writerow({
            "Post": post.post,
            "Is OG": post.is_og,
            "ID": post.id,
            "Likes": post.likes,
            "Length": post.length,
            "Sentiment": post.sentiment,
            "Profanity Score": post.profanity_score,
            "ADJ Count": post.adj_count,
            "ADJ": post.adj
        })

print("CSV file has been successfully created.")

CSV file has been successfully created.


In [290]:
"""
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
negative_adj = []
for token in doc:
    if token.pos_ == 'ADJ':
        print(token.text, token.pos_, token.dep_, token.head.text)
        negative_adj.append(token.text)
"""

"\nspan = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]\nwith doc.retokenize() as retokenizer:\n    retokenizer.merge(span)\nnegative_adj = []\nfor token in doc:\n    if token.pos_ == 'ADJ':\n        print(token.text, token.pos_, token.dep_, token.head.text)\n        negative_adj.append(token.text)\n"