In [22]:
import csv 
import os
   
import pandas as pd 
import spacy 
# spacy.cli.download("pt_core_news_sm")
# spacy.cli.download("es_core_news_sm")

import requests 
from bs4 import BeautifulSoup
from dataclasses import dataclass

#Sentence Tokenization using sent_tokenize
import nltk
# nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import word_tokenize


#Detect language using detect_langs
import langdetect
from langdetect import detect_langs

#Detect language using Lingua
from lingua import Language, LanguageDetectorBuilder

import re

In [23]:

"""
First, I'm gonna get the entire corpus from the "Reddit Post Parsed" folder.
"""
all_post_titles = []
expected_no_comments = 0
corpus = ""
comment_urls = []
all_links = []

with open('log.csv', mode = 'r') as file:
    link_column = []
    title_column = []
    comments_column = []
    all_no_comments = []
    csvFile = csv.reader(file)
    for line in csvFile:
        title_column.append(line[2])
        all_post_titles = title_column[1:]
        comments_column.append(line[9])
        all_no_comments = comments_column[1:]
        link_column.append(line[3])
        all_links = link_column[1:]
    for number in all_no_comments:
        expected_no_comments += int(number)

#loop to open all post titles in create one big corpus of all comments
def create_corpus(titles: list) -> str:
    """
    This function takes in a list of posts titles in the 
    folder "Reddit Post Parsed" and loops through each 
    csv file to filter for proper comments, that are not urls
    and deleted to return the corpus.

    Comments that are just links will be 
    appended to the list "comment_urls"!
    """
    global corpus
    global comment_urls

    count_proper_comments = 0
    no_deleted_comments = 0
    empty = ""
    list_of_comments = []
    
    base_folder = "Update posts files"
    for title in titles:
        title_csv = os.path.join(base_folder, title + "'s post.csv")
        if not os.path.isfile(title_csv):
            print(f"File '{title_csv}' not found.")
            continue
    
        with open(title_csv, mode='r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            for row in csv_reader:
                list_of_comments.append(empty.join(row[9:]))

    for comment in list_of_comments:
        if comment.strip() != "Body":
            if comment.strip() == '"deleted"' or comment.strip() == '"removed"':
                no_deleted_comments +=1
                comment = ""
            if comment.strip().startswith('"https:'):
                comment_urls.append(comment.replace('"', "").strip())
            else:
                count_proper_comments += 1 
                corpus = corpus + " " + comment.replace("**", "").replace("#", "").strip()[1:-1] 
    print(f'Number of comments yielded for the corpus (that are not urls or deleted): {count_proper_comments}.') 
    print(f'Number of removed/deleted comments (has been filetered from corpus): {no_deleted_comments}.\n')                  
                
create_corpus(all_post_titles)

Number of comments yielded for the corpus (that are not urls or deleted): 91.
Number of removed/deleted comments (has been filetered from corpus): 5.



In [24]:
@dataclass
class entities:
    name: str
    label: str

#strip out the pronouns, conjunctions, etc.!
# f = open('stop words.txt', 'r')
# stopwords = f.read()
# stopwords = stopwords.split('\n')

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')

# Load the spaCy English & Portuguese models
en_nlp = spacy.load("en_core_web_sm")
pt_nlp = spacy.load('pt_core_news_sm')
pd.set_option("display.max_rows", 200)

#separate into tokenized sentences
tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')
sentences_token = tokenizer.tokenize(corpus)
sentences = []
for sentence in sentences_token:
    if sentence.strip() not in stopwords:
        sentences.append(sentence)

#separate corpus in words
words_token = word_tokenize(corpus)
words = []
#remove any conjunctions, articles, particles, etc.
for word in words_token:
    if word.lower().strip() not in stopwords:
        words.append(word)

def checkW(x: int):
    return (x/len(words))*100

def checkS(x: int):
    return (x/len(sentences))*100

f.close()

print(f'Amount of total sentence tokens: {len(sentences)}.')
print(f'Amount of total word token: {len(words)}.\n')

  tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')


Amount of total sentence tokens: 171.
Amount of total word token: 1533.



In [25]:
# METHOD 3: Lingua sentence by sentence

#import English, Portuguese, Spanish detector
languages = [Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentence = []

discarded_l = 0
for sentence in sentences:
    try:
        en_l = detector.compute_language_confidence(sentence, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(sentence, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(sentence, Language.SPANISH)
        if en_l > 0.8:
            english_sentences.append(sentence)
        elif pt_l > 0.8:
            portuguese_sentences.append(sentence)
        elif es_l > 0.8:
            spanish_sentence.append(sentence)
        else:
            mixed_sentences.append(sentence)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discarded_l +=1
        continue

print("3. Lingua sentence by sentence")
print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
print(f'Spanish sentences: {len(spanish_sentence)} - {checkS(len(spanish_sentence)):.2f}%.')
print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
print(f'Discarded: {discarded_l} - {checkS(discarded_l):.2f}%.')
print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentence)+ len(mixed_sentences)):.2f}%.\n')



# METHOD 4: Lingua word by word

en_w = []
pt_w = []
es_w = []
mixed_w = []

discard_w = 0
for word in words:
    try:
        en_l = detector.compute_language_confidence(word, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(word, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(word, Language.SPANISH)
        if en_l > 0.5:
            en_w.append(word)
        elif pt_l > 0.5:
            pt_w.append(word)
        elif es_l > 0.5:
            es_w.append(word)
        else:
            mixed_w.append(word)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discard_w +=1
        continue

print("4. Lingua word by word")
print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
print(f'Discarded: {discard_w} - {checkW(discard_w):.2f}%.')
print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')

print(f'English sentences: {english_sentences}')
print(f'Portuguese sentences: {portuguese_sentences}')
print(f'Spanish sentences: {spanish_sentence}')
print(f'Mixed sentences: {mixed_sentences}')

print(f'English words: {en_w}')
print(f'Portuguese words: {pt_w}')
print(f'Spanish words: {es_w}')
print(f'Mixed words: {mixed_w}')

3. Lingua sentence by sentence
English sentences: 156  - 91.23%.
Portuguese sentences: 0 - 0.00%.
Spanish sentences: 0 - 0.00%.
Mixed sentences: 15 - 8.77%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.

4. Lingua word by word
English words: 1158 - 75.54%.
Portuguese words: 46 - 3.00%.
Spanish words: 28 - 1.83%.
Mixed words: 301 - 19.63%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.

English sentences: [' “In my house there was a pile of documents to be discarded where most likely the material described in the article was found” he added.', '“Everything would be taken to be shredded at the Ministry of Justice in due course.”.so a coup attempt and a coverup?', 'Bolsonaro imitated Trump to the point of plotting coup.', 'He should be sent back to Brazil to face trial for trying to overturn election result.', 'Hes talking about the election or the game?', '800 thousand dead brazilians worst economic situation since the Real currency government frozen due to lack 

In [26]:
total_sentences = english_sentences + mixed_sentences

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')


def filter(words: list):
    result = []
    for word in words:
        if word.lower().strip() not in stopwords and len(word) != 1 and word.lower() not in result:
            result.append(word)
    return result

english_words = filter(en_w)
portuguese_words = filter(pt_w)
spanish_words = filter(es_w)
mixed_words = filter(mixed_w)

f.close()

In [39]:
import csv 
import os
   
import pandas as pd 
import spacy 
# spacy.cli.download("pt_core_news_sm")
# spacy.cli.download("es_core_news_sm")

import requests 
from bs4 import BeautifulSoup
from dataclasses import dataclass

#Sentence Tokenization using sent_tokenize
import nltk
# nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import word_tokenize

#Detect language using detect_langs
import langdetect
from langdetect import detect_langs

#Detect language using Lingua
from lingua import Language, LanguageDetectorBuilder

@dataclass
class entities:
    name: str
    label: str

# #strip out the pronouns, conjunctions, etc.!
# f = open('stop words.txt', 'r')
# stopwords = f.read()
# stopwords = stopwords.split('\n')

with open('stop words.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read()
stopwords = stopwords.split('\n')

# Load the spaCy English & Portuguese models
en_nlp = spacy.load("en_core_web_sm")
pt_nlp = spacy.load('pt_core_news_sm')

pd.set_option("display.max_rows", 200)

PeopleS = []
PeopleW = []

en_doc = en_nlp(" ".join(total_sentences))
# pt_doc = pt_nlp()

for ent in en_doc.ents:
    # The output displayed the names of the entities and their predicted labels.
    if ent.text not in PeopleS and ent.label_ == 'PERSON':
        # PeopleS.append(entities(ent.text, ent.label_))
        PeopleS.append(ent.text)

print(PeopleS)

for word in english_words:
    en_word = en_nlp(word)
    for ent in en_word.ents:
        if ent.text not in PeopleW and ent.label_ == 'PERSON':
            PeopleW.append(ent.text)

#NER with words - not preferable - just putting here as a test
for word in portuguese_words:
    pt_word = pt_nlp(word)
    for ent in pt_word.ents:
        if ent.text not in PeopleW and ent.label_ == 'PER' or ent.label_ == 'LOC':
            PeopleW.append(ent.text)

for word in spanish_words:
    es_word = pt_nlp(word)
    for ent in es_word.ents:
        if ent.text not in PeopleW and ent.label_ == 'PER' or ent.label_ == 'LOC':
            PeopleW.append(ent.text)

# print(PeopleW)

['Bolsonaro', 'Lula', 'Erysipelas', 'Biden', 'Putin', 'Santos-Cruz', 'Hamilton Mourao', 'Jair Bolsonaro', 'Luiz Inacio Lula da Silva', 'Lulas', 'Silvinei Vasques', 'Lula Bolsonaro', 'Cerberusz', 'Storming', 'Bannon', 'Lyon', 'Netflix Neymar', 'Jr', 'Neymar Sr', 'Mbappé', 'Neymar', 'Neymar Pulling', 'Death Arrest Win', 'da Silva', 'GTFO']


In [56]:
pattern = '|'.join(re.escape(people) for people in PeopleS)

# Use re.sub to remove the phrases from the corpus
cleaned_corpus = re.sub(pattern, '', corpus.text)

corpus = en_nlp(cleaned_corpus)
noun_chunks = []
for chunk in corpus.noun_chunks:
    noun_chunks.append(chunk.text)



unwanted_patterns = r'\b(my|them|me|everyone|our|even|him|her|us|itself|people|a|an|the|he|she|it|i|you|we|they|his|her|hers|its|their|theirs|this|that|these|those|there|where|who|whom|which|what|when|why|how|am|is|are|was|were|be|been|being|have|has|had|do|does|did|will|would|shall|should|can|could|may|might|must|ought|and|but|or|nor|for|yet|so|because|as|if|once|since|unless|until|while|although|though|after|before|until|by|on|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|not|only|own|same|so|than|too|very|s|t|can|will|just|don|should|now)\b|[.,!?;:[]()]'

filtered_words = []
for noun in noun_chunks:
    filtered_phrase = ' '.join(word for word in re.split(r'\s+', noun) if not re.fullmatch(unwanted_patterns, word, re.I))
    if filtered_phrase:  # Ensure it's not empty
        filtered_words.append(filtered_phrase)

# Print the filtered list
# print(filtered_words)
leftover = []
for word in filtered_words:
    en_word = en_nlp(word)
    for ent in en_word.ents:
        if ent.text not in PeopleW and ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            print(ent.text + " " + ent.label_)
            leftover.append(ent.text)
print(leftover)


Chad Juninho ORG
['Chad Juninho']
