In [1]:
import csv 
import os
   
import pandas as pd 
import spacy 
# spacy.cli.download("pt_core_news_sm")
# spacy.cli.download("es_core_news_sm")

import requests 
from bs4 import BeautifulSoup
from dataclasses import dataclass

#Sentence Tokenization using sent_tokenize
import nltk
# nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import word_tokenize


#Detect language using detect_langs
import langdetect
from langdetect import detect_langs

#Detect language using Lingua
from lingua import Language, LanguageDetectorBuilder

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

"""
First, I'm gonna get the entire corpus from the "Reddit Post Parsed" folder.
"""
all_post_titles = []
expected_no_comments = 0
corpus = ""
comment_urls = []
all_links = []

with open('log.csv', mode = 'r') as file:
    link_column = []
    title_column = []
    comments_column = []
    all_no_comments = []
    csvFile = csv.reader(file)
    for line in csvFile:
        title_column.append(line[2])
        all_post_titles = title_column[1:]
        comments_column.append(line[9])
        all_no_comments = comments_column[1:]
        link_column.append(line[3])
        all_links = link_column[1:]
    for number in all_no_comments:
        expected_no_comments += int(number)

#loop to open all post titles in create one big corpus of all comments
def create_corpus(titles: list) -> str:
    """
    This function takes in a list of posts titles in the 
    folder "Reddit Post Parsed" and loops through each 
    csv file to filter for proper comments, that are not urls
    and deleted to return the corpus.

    Comments that are just links will be 
    appended to the list "comment_urls"!
    """
    global corpus
    global comment_urls

    count_proper_comments = 0
    no_deleted_comments = 0
    empty = ""
    list_of_comments = []
    
    base_folder = "Update posts files"
    for title in titles:
        title_csv = os.path.join(base_folder, title + "'s post.csv")
        if not os.path.isfile(title_csv):
            print(f"File '{title_csv}' not found.")
            continue
    
        with open(title_csv, mode='r', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            for row in csv_reader:
                list_of_comments.append(empty.join(row[9:]))

    for comment in list_of_comments:
        if comment.strip() != "Body":
            if comment.strip() == '"deleted"' or comment.strip() == '"removed"':
                no_deleted_comments +=1
                comment = ""
            if comment.strip().startswith('"https:'):
                comment_urls.append(comment.replace('"', "").strip())
            else:
                count_proper_comments += 1 
                corpus = corpus + " " + comment.replace("**", "").replace("#", "").strip()[1:-1] 
    print(f'Number of comments yielded for the corpus (that are not urls or deleted): {count_proper_comments}.') 
    print(f'Number of removed/deleted comments (has been filetered from corpus): {no_deleted_comments}.\n')                  
                
create_corpus(all_post_titles)

Number of comments yielded for the corpus (that are not urls or deleted): 91.
Number of removed/deleted comments (has been filetered from corpus): 5.



In [3]:
@dataclass
class entities:
    name: str
    label: str

#strip out the pronouns, conjunctions, etc.!
f = open('stop words.txt', 'r')
stopwords = f.read()
stopwords = stopwords.split('\n')

# Load the spaCy English & Portuguese models
en_nlp = spacy.load("en_core_web_sm")
pt_nlp = spacy.load('pt_core_news_sm')
pd.set_option("display.max_rows", 200)

#separate into tokenized sentences
tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')
sentences_token = tokenizer.tokenize(corpus)
sentences = []
for sentence in sentences_token:
    if sentence.strip() not in stopwords:
        sentences.append(sentence)

#separate corpus in words
words_token = word_tokenize(corpus)
words = []
#remove any conjunctions, articles, particles, etc.
for word in words_token:
    if word.lower().strip() not in stopwords:
        words.append(word)

def checkW(x: int):
    return (x/len(words))*100

def checkS(x: int):
    return (x/len(sentences))*100

f.close()

print(f'Amount of total sentence tokens: {len(sentences)}.')
print(f'Amount of total word token: {len(words)}.\n')

  tokenizer = nltk.data.load('tokenizers\punkt\english.pickle')


Amount of total sentence tokens: 171.
Amount of total word token: 1550.



In [4]:
# METHOD 1: Langdetect each sentence in the corpus

#Sentences with more than one languages detected
mixed_s = []
#Sentences with only one language detected
en_s = []
es_s = []
pt_s = []
discarded = 0
for sentence in sentences:
    #benchmark is with a language probablity over 0.5
    try:
        score = detect_langs(sentence)
        if len(score) > 1:
            mixed_s.append(sentence)
        elif len(score) == 1:
            for s in score:
                if s.lang == 'en' and s.prob >= 0.8:
                    en_s.append(sentence)
                elif s.lang == 'es' and s.prob >= 0.8:
                    es_s.append(sentence)
                elif s.lang == 'pt' and s.prob >= 0.8:
                    pt_s.append(sentence)
                else: 
                    mixed_s.append(sentence)
    except:
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discarded +=1
        continue

print("1. Langdetect sentence by sentence")
print(f'English sentences: {len(en_s)} - {checkS(len(en_s)):.2f}%.')
print(f'Portuguese sentences: {len(pt_s)} - {checkS(len(pt_s)):.2f}%.')
print(f'Spanish sentences: {len(es_s)} - {checkS(len(es_s)):.2f}%.')
print(f'Mixed sentences: {len(mixed_s)} - {checkS(len(mixed_s)):.2f}%.')
print(f'Discarded: {discarded} - {checkS(discarded):.2f}%.')
print(f'Amount detected from total: {checkS(len(en_s) + len(pt_s) + len(es_s)+ len(mixed_s)):.2f}%.\n')


# METHOD 2: Langdetect word by word
language_detected = []
for word in words:
    #benchmark is with a language probablity over 0.5
    try:
        w_score = detect_langs(word)
        for s in w_score:
            if s.lang not in language_detected and s.prob > 0.8:
                language_detected.append(s.lang)
    except:
        #discard punctuations or numbers
        # print("This throws an error: " + word)
        continue 
print("2. Langdetect word by word - just bad - all have benchmark over 0.8")
print(f'Amount of language detected: {len(language_detected)}.')
print(f'Language detected: {language_detected}.\n')

1. Langdetect sentence by sentence
English sentences: 156 - 91.23%.
Portuguese sentences: 1 - 0.58%.
Spanish sentences: 0 - 0.00%.
Mixed sentences: 14 - 8.19%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.

2. Langdetect word by word - just bad - all have benchmark over 0.8
Amount of language detected: 31.
Language detected: ['en', 'it', 'hu', 'id', 'es', 'ca', 'cy', 'fi', 'fr', 'sv', 'ro', 'vi', 'no', 'hr', 'nl', 'af', 'da', 'de', 'so', 'pt', 'tl', 'sw', 'sk', 'sq', 'pl', 'lv', 'et', 'sl', 'tr', 'lt', 'cs'].



In [5]:
# METHOD 3: Lingua sentence by sentence

#import English, Portuguese, Spanish detector
languages = [Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentence = []

discarded_l = 0
for sentence in sentences:
    try:
        en_l = detector.compute_language_confidence(sentence, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(sentence, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(sentence, Language.SPANISH)
        if en_l > 0.8:
            english_sentences.append(sentence)
        elif pt_l > 0.8:
            portuguese_sentences.append(sentence)
        elif es_l > 0.8:
            spanish_sentence.append(sentence)
        else:
            mixed_sentences.append(sentence)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discarded_l +=1
        continue

print("3. Lingua sentence by sentence")
print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
print(f'Spanish sentences: {len(spanish_sentence)} - {checkS(len(spanish_sentence)):.2f}%.')
print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
print(f'Discarded: {discarded_l} - {checkS(discarded_l):.2f}%.')
print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentence)+ len(mixed_sentences)):.2f}%.\n')



# METHOD 4: Lingua word by word

en_w = []
pt_w = []
es_w = []
mixed_w = []

discard_w = 0
for word in words:
    try:
        en_l = detector.compute_language_confidence(word, Language.ENGLISH)
        pt_l = detector.compute_language_confidence(word, Language.PORTUGUESE)
        es_l = detector.compute_language_confidence(word, Language.SPANISH)
        if en_l > 0.5:
            en_w.append(word)
        elif pt_l > 0.5:
            pt_w.append(word)
        elif es_l > 0.5:
            es_w.append(word)
        else:
            mixed_w.append(word)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discard_w +=1
        continue
            
print("4. Lingua word by word")
print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
print(f'Discarded: {discard_w} - {checkW(discard_w):.2f}%.')
print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')


3. Lingua sentence by sentence
English sentences: 156  - 91.23%.
Portuguese sentences: 0 - 0.00%.
Spanish sentences: 0 - 0.00%.
Mixed sentences: 15 - 8.77%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.

4. Lingua word by word
English words: 1162 - 74.97%.
Portuguese words: 46 - 2.97%.
Spanish words: 28 - 1.81%.
Mixed words: 314 - 20.26%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.



In [6]:
# METHOD 5: LangID sentence by sentence

import langid
from langid.langid import LanguageIdentifier, model
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentence = []

discarded_l = 0
for sentence in sentences:
    try:
        #constrain the language set
        langid.set_languages(['en','pt','es'])
        lang, conf = identifier.classify(sentence)
        if conf > 0.5 and lang == 'en':
            english_sentences.append(sentence)
        elif conf > 0.5 and lang == 'pt':
            portuguese_sentences.append(sentence)
        elif conf > 0.5 and lang == 'es':
            spanish_sentence.append(sentence)
        else:
            mixed_sentences.append(sentence)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discarded_l +=1
        continue
print("5. LangID sentence by sentence")
print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
print(f'Spanish sentences: {len(spanish_sentence)} - {checkS(len(spanish_sentence)):.2f}%.')
print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
print(f'Discarded: {discarded_l} - {checkS(discarded_l):.2f}%.')
print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentence)+ len(mixed_sentences)):.2f}%.\n')


# METHOD 6: LangID word by word

en_w = []
pt_w = []
es_w = []
mixed_w = []
discard_w = 0
for word in words:
    try:
        #constrain the language set
        langid.set_languages(['en','pt','es'])
        lang, conf = identifier.classify(word)
        if conf > 0.5 and lang == 'en':
            en_w.append(word)
        elif conf > 0.8 and lang == 'pt':
            pt_w.append(word)
        elif conf > 0.8 and lang == 'es':
            es_w.append(word)
        else:
            mixed_w.append(word)
    except: 
        #discard "." or numbers
        # print("This throws an error: " + sentence)
        discard_w +=1
        continue
            
print("6. LangID word by word")
print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
print(f'Discarded: {discard_w} - {checkW(discard_w):.2f}%.')
print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')

5. LangID sentence by sentence
English sentences: 164  - 95.91%.
Portuguese sentences: 0 - 0.00%.
Spanish sentences: 0 - 0.00%.
Mixed sentences: 7 - 4.09%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.

6. LangID word by word
English words: 143 - 9.23%.
Portuguese words: 0 - 0.00%.
Spanish words: 1 - 0.06%.
Mixed words: 1406 - 90.71%.
Discarded: 0 - 0.00%.
Amount detected from total: 100.00%.



In [7]:
# METHOD 7: Stanza sentence by sentence

import stanza
# stanza.download(lang="multilingual")
# stanza.download(lang="en")
# stanza.download(lang="es")
# stanza.download(lang="pt")
from stanza.models.common.doc import Document
from stanza.pipeline.core import Pipeline

mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentence = []

nlp = Pipeline(lang="multilingual", processors="langid",langid_lang_subset=["en","es","pt"], langid_clean_text=True)
docs = sentences
docs = [Document([], text=text) for text in docs]
nlp(docs)
for doc in docs:
    if doc.lang == 'en':
        english_sentences.append(doc)
    elif doc.lang == 'es':
        spanish_sentence.append(doc)
    elif doc.lang == 'pt':
        portuguese_sentences.append(doc)
    else:
        mixed_sentences.append(doc)

print("7. Stanza sentence by sentence")
print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
print(f'Spanish sentences: {len(spanish_sentence)} - {checkS(len(spanish_sentence)):.2f}%.')
print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentence)+ len(mixed_sentences)):.2f}%.\n') 

# METHOD 8: Stanza word by word
en_w = []
pt_w = []
es_w = []
mixed_w = []
discard_w = 0

nlp = Pipeline(lang="multilingual", processors="langid",langid_lang_subset=["en","es","pt"], langid_clean_text=True)
docs = words
docs = [Document([], text=text) for text in docs]
nlp(docs)

for doc in docs:
    if doc.lang == 'en':
        en_w.append(doc)
    elif doc.lang == 'es':
        es_w.append(doc)
    elif doc.lang == 'pt':
        pt_w.append(doc)
    else:
        mixed_w.append(doc)
            
print("8. Stanza word by word")
print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')

  from .autonotebook import tqdm as notebook_tqdm
2024-05-08 23:08:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 44.1MB/s]                    
2024-05-08 23:08:45 INFO: Downloaded file to /Users/blakey/stanza_resources/resources.json
2024-05-08 23:08:45 INFO: Loading these models for language: multilingual ():
| Processor | Package |
-----------------------
| langid    | ud      |

2024-05-08 23:08:45 INFO: Using device: cpu
2024-05-08 23:08:45 INFO: Loading: langid
2024-05-08 23:08:45 INFO: Done loading processors!
2024-05-08 23:08:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURC

7. Stanza sentence by sentence
English sentences: 170  - 99.42%.
Portuguese sentences: 0 - 0.00%.
Spanish sentences: 1 - 0.58%.
Mixed sentences: 0 - 0.00%.
Amount detected from total: 100.00%.



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 19.0MB/s]                    
2024-05-08 23:08:46 INFO: Downloaded file to /Users/blakey/stanza_resources/resources.json
2024-05-08 23:08:46 INFO: Loading these models for language: multilingual ():
| Processor | Package |
-----------------------
| langid    | ud      |

2024-05-08 23:08:46 INFO: Using device: cpu
2024-05-08 23:08:46 INFO: Loading: langid
2024-05-08 23:08:46 INFO: Done loading processors!


8. Stanza word by word
English words: 1336 - 86.19%.
Portuguese words: 146 - 9.42%.
Spanish words: 68 - 4.39%.
Mixed words: 0 - 0.00%.
Amount detected from total: 100.00%.



In [9]:
# METHOD 9: xlm-roberta-base-language-detection sentence by sentence

# METHOD 10: xlm-roberta-base-language-detection word by word

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
    model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
    return tokenizer, model

def predict_language(text, tokenizer, model):
    # Encode the text using the tokenizer
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    
    # Predict the language using the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted language ID (the highest probability)
    logits = outputs.logits
    predicted_id = torch.argmax(logits, dim=1).item()
    
    # Convert the language ID to language code
    labels = model.config.id2label
    predicted_language = labels[predicted_id]
    
    return predicted_language
#sentences
mixed_sentences = []
english_sentences = []
portuguese_sentences = []
spanish_sentence = []

#words
en_w = []
pt_w = []
es_w = []
mixed_w = []

def main():
    # Load the tokenizer and model
    tokenizer, model = load_model_and_tokenizer()
    
    # Example text to detect the language

    for sentence in sentences:
        # Predict the language
        language = predict_language(sentence, tokenizer, model)
        if language == 'en':
            english_sentences.append(sentence)
        elif language == 'es':
            spanish_sentence.append(sentence)
        elif language == 'pt':
            portuguese_sentences.append(sentence)
        else:
            mixed_sentences.append(sentence)

    for word in words:
        # Predict the language
        language = predict_language(word, tokenizer, model)
        if language == 'en':
            en_w.append(word)
        elif language == 'es':
            es_w.append(word)
        elif language == 'pt':
            pt_w.append(word)
        else:
            mixed_w.append(word)
if __name__ == "__main__":
    main()

print("9. xlm-roberta-base-language-detection sentence by sentence")
print(f'English sentences: {len(english_sentences)}  - {checkS(len(english_sentences)):.2f}%.')
print(f'Portuguese sentences: {len(portuguese_sentences)} - {checkS(len(portuguese_sentences)):.2f}%.')
print(f'Spanish sentences: {len(spanish_sentence)} - {checkS(len(spanish_sentence)):.2f}%.')
print(f'Mixed sentences: {len(mixed_sentences)} - {checkS(len(mixed_sentences)):.2f}%.')
print(f'Amount detected from total: {checkS(len(english_sentences) + len(portuguese_sentences) + len(spanish_sentence)+ len(mixed_sentences)):.2f}%.\n') 

print("10. xlm-roberta-base-language-detection word by word")
print(f'English words: {len(en_w)} - {checkW(len(en_w)):.2f}%.')
print(f'Portuguese words: {len(pt_w)} - {checkW(len(pt_w)):.2f}%.')
print(f'Spanish words: {len(es_w)} - {checkW(len(es_w)):.2f}%.')
print(f'Mixed words: {len(mixed_w)} - {checkW(len(mixed_w)):.2f}%.')
print(f'Amount detected from total: {checkW(len(en_w) + len(pt_w) + len(es_w)+ len(mixed_w)):.2f}%.\n')