In [1]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# Pin langdetect's seed before the first detect() call so repeated runs assign the
# same language code to the same text (per the langdetect README, detection is
# otherwise non-deterministic).
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        Post text to classify.

    Returns
    -------
    str
        The code returned by ``detect``, or ``'unknown'`` when no language can
        be determined: ``text`` is not a string (e.g. a NaN from a missing
        cell), is empty or whitespace-only, or ``detect`` raises
        ``LangDetectException``.
    """
    # Missing cells arrive as float NaN when this is used with Series.apply;
    # detect() only handles strings, so short-circuit instead of letting a
    # single bad row abort the whole column.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

In [2]:
# Load the cleaned training split that the rest of the notebook works on
train_split_csv = "cleaned_train_split_2_Ambra.csv"
df = pd.read_csv(train_split_csv)

In [3]:
# Tag every post with the language code detected from its cleaned text
df['language'] = df['cleaned_text'].map(detect_language)

In [4]:
# Tally how many posts were assigned to each detected language code
language_counts = df['language'].value_counts()
print(language_counts)

language
en    13311
de      427
nl      195
fr      158
hr      156
it      124
fi      110
ro      105
es       90
el       89
sv       89
da       88
pt       79
no       76
bg       46
pl       43
hu       38
tr       36
cs       26
sl       19
lt       13
et       11
ru        8
sk        7
sq        5
ca        2
Name: count, dtype: int64


In [10]:
# Binary flag: 1 when the detected language is English ('en'), 0 otherwise
is_english = df['language'].eq('en')
df['english'] = is_english.astype('int64')

# Persist the labelled frame without the pandas index
labelled_csv = "cleaned_train_split_2_Ambra_with_language_labels.csv"
df.to_csv(labelled_csv, index=False)

We will now proceed with the following replacements:
- Replace foreign words in English posts with the code: `RPLCMNT_FRGNWRD`
- Replace non-English posts with the code: `RPLCMNT_NNGLSH`

In [6]:
# Creating the replacement column
# English posts get the placeholder string 'null'; every other post is replaced
# wholesale by the non-English code.
# NOTE: pandas.read_csv treats the literal 'null' as a missing value by default,
# so these placeholders come back as NaN once the CSV is reloaded.
df['replacement'] = df['english'].apply(lambda x: 'null' if x == 1 else 'RPLCMNT_NNGLSH')

# The language-labels CSV is written before this column exists, so on a
# top-to-bottom run the saved file would lack 'replacement'. Write it again so
# the file on disk matches the frame, whatever order the cells were run in.
df.to_csv("cleaned_train_split_2_Ambra_with_language_labels.csv", index=False)

In [1]:
import language_tool_python
import pandas as pd
# Reload the language-labelled posts written earlier in the notebook
labelled_csv = "cleaned_train_split_2_Ambra_with_language_labels.csv"
df = pd.read_csv(labelled_csv)

# LanguageTool checker configured for 'en-US'; the first construction downloads
# LanguageTool if it is not cached yet (see the cell output)
tool = language_tool_python.LanguageTool('en-US')


# Correcting the grammar in a post
def correct_grammar(text):
    """Run ``text`` through the module-level LanguageTool instance and return its corrected version."""
    corrected = tool.correct(text)
    return corrected


def grammar_correction(row):
    """Return the grammar-corrected text for one row of the posts frame.

    Rows whose ``'english'`` value is not 1 are skipped and yield the
    placeholder string ``'null'``; rows flagged 1 have their ``'cleaned_text'``
    passed to ``correct_grammar``.
    """
    # Guard clause: only rows flagged as English are sent to the checker
    if row['english'] != 1:
        return 'null'
    return correct_grammar(row['cleaned_text'])
# Build the grammar-corrected column one row at a time, then attach it to the frame
corrected_column = df.apply(grammar_correction, axis=1)
df['grammar_corrected_text'] = corrected_column

# Persist the result without the pandas index
grammar_csv = "cleaned_train_split_2_Ambra_with_grammar_correction.csv"
df.to_csv(grammar_csv, index=False)

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:09<00:00, 26.5MB/s] 
Unzipping C:\Users\gvesc\AppData\Local\Temp\tmpk07ei391.zip to C:\Users\gvesc\.cache\language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to C:\Users\gvesc\.cache\language_tool_python.


In [1]:
import pandas as pd

# The same file is read and then overwritten, so keep its path in one place
grammar_csv = "cleaned_train_split_2_Ambra_with_grammar_correction.csv"
data = pd.read_csv(grammar_csv)

# Wherever 'grammar_corrected_text' is missing, fall back to the row's 'cleaned_text'
fallback_text = data['cleaned_text']
data['grammar_corrected_text'] = data['grammar_corrected_text'].fillna(fallback_text)

# Write the patched frame back over the original CSV
data.to_csv(grammar_csv, index=False)


In [2]:
import pandas as pd

# Reload the grammar-corrected dataset and list the columns it carries
grammar_csv = "cleaned_train_split_2_Ambra_with_grammar_correction.csv"
df = pd.read_csv(grammar_csv)
print(df.columns)

Index(['Unnamed: 0', 'auhtor_ID', 'post', 'nationality', 'cleaned_text',
       'original_word_count', 'cleaned_word_count', 'word_count_difference',
       'language', 'english', 'replacement', 'grammar_corrected_text'],
      dtype='object')


In [3]:
# import pandas as pd
# import nltk
# from nltk.corpus import wordnet
# import spacy
# import re
#
# # Loading SpaCy English language model
# nlp = spacy.load("en_core_web_sm")
# # Ensure to run: python -m spacy download en_core_web_sm
# # Downloading WordNet data
# nltk.download("wordnet")
#
# # Adding common internet acronyms
# internet_acronyms = {
#     "lol", "brb", "omg", "idk", "btw", "smh", "imo", "fyi", "tbh", "irl", "np",
#     "thx", "jk", "omw", "rofl", "wtf", "afaik", "asap", "bff", "ftw", "lmao",
#     "ngl", "nsfw", "tl;dr", "dm", "rt", "pm", "gg", "wp", "faq"
# }
#
# # Defining replacement code
# FOREIGN_WORD_CODE = "RPLCMNT_FRGNWRD"
#
# # Checking if a word is valid English
# def is_english_word(word):
#     # Check if the word exists in WordNet
#     if wordnet.synsets(word.lower()):
#         return True
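#     # Use SpaCy to check if the word is English
#     # NOTE(review): Token.lang_ is the language of the loaded pipeline's vocabulary,
#     # not a per-word language guess. With en_core_web_sm it is "en" for every token,
#     # so this check passes for any alphabetic word and foreign words are never flagged.
#     doc = nlp(word)
#     if doc[0].is_alpha and doc[0].lang_ == "en":
#         return True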
#     # Check against internet acronyms
#     if word.lower() in internet_acronyms:
#         return True
#     return False
#
# # Detecting and replacing foreign words
# def detect_and_replace(row):
#     # Ensure the necessary columns exist
#     if 'grammar_corrected_text' not in row or 'cleaned_text' not in row or 'english' not in row:
#         raise KeyError("One or more required columns are missing: 'grammar_corrected_text', 'cleaned_text', 'english'")
#
#     if row['english'] == 1:  # Processing only English rows
#         corrected_words = re.findall(r'\b\w+\b', row['grammar_corrected_text'])
#         cleaned_words = re.findall(r'\b\w+\b', row['cleaned_text'])
#
#         # Identifying foreign words
#         foreign_words = set()
#         for i, word in enumerate(corrected_words):
#             if word.isdigit():  # Skipping numbers
#                 continue
#             if i > 0 and word[0].isupper():  # Skipping proper nouns mid-sentence
#                 continue
#             if is_english_word(word):
#                 continue
#             foreign_words.add(word)
#
#         # Replacing in cleaned_text
#         replaced_text = [
#             FOREIGN_WORD_CODE if word in foreign_words else word for word in cleaned_words
#         ]
#         return ' '.join(replaced_text)
#     else:
#         return "RPLCMNT_NNGLSH"  # Handling non-English rows
#
# # Verifying the presence of required columns
# required_columns = ['grammar_corrected_text', 'cleaned_text', 'english']
# missing_columns = [col for col in required_columns if col not in df.columns]
# if missing_columns:
#     raise KeyError(f"The following required columns are missing from the DataFrame: {missing_columns}")
#
# # Verifying columns before processing
# print("Columns before processing:", df.columns)
#
# # Applying replacement logic
# df['replacement'] = df.apply(detect_and_replace, axis=1)
#
# # Verifying columns after processing
# print("Columns after processing:", df.columns)
#
# # Saving updated dataset
# df.to_csv("cleaned_train_split_2_Ambra_exp3_optimized.csv", index=False)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gvesc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Columns before processing: Index(['Unnamed: 0', 'auhtor_ID', 'post', 'nationality', 'cleaned_text',
       'original_word_count', 'cleaned_word_count', 'word_count_difference',
       'language', 'english', 'replacement', 'grammar_corrected_text'],
      dtype='object')
Columns after processing: Index(['Unnamed: 0', 'auhtor_ID', 'post', 'nationality', 'cleaned_text',
       'original_word_count', 'cleaned_word_count', 'word_count_difference',
       'language', 'english', 'replacement', 'grammar_corrected_text'],
      dtype='object')


In [4]:
# df

Unnamed: 0.1,Unnamed: 0,auhtor_ID,post,nationality,cleaned_text,original_word_count,cleaned_word_count,word_count_difference,language,english,replacement,grammar_corrected_text
0,0,t2_1e98hr7q,paperclip een heel eind. Ik heb het tot nu toe...,The Netherlands,paperclip een heel eind. Ik heb het tot nu toe...,1500,1472,28,nl,0,RPLCMNT_NNGLSH,
1,1,t2_raxhwbba,2: Band of Thieves. Sly 3: Honour Among Thieve...,United Kingdom,2: Band of Thieves. Sly 3: Honour Among Thieve...,1500,1481,19,en,1,2 Band of Thieves Sly 3 Honour Among Thieves R...,2: Band of Thieves. Sly 3: Honor Among Thieves...
2,2,t2_3fw68gzc,фалшиви и ги забраниха. След ден затвориха и л...,Bulgaria,фалшиви и ги забраниха. След ден затвориха и л...,1500,1493,7,en,1,фалшиви и ги забраниха След ден затвориха и ла...,Фалшиви и ги забраниха. След ден затвориха и л...
3,3,t2_qxtt1jsp,it the Critical Drinker who did it? Cartoon? L...,Portugal,it the Critical Drinker who did it? Cartoon? L...,539,521,18,en,1,it the Critical Drinker who did it Cartoon Lol...,It the Critical Drinker who did it? Cartoon? L...
4,4,t2_10jg8ipm,showarok nolo'atli Won apa-te kwo'atli sho asi...,Spain,showarok nolo'atli Won apa-te kwo'atli sho asi...,1500,1489,11,en,1,showarok nolo atli Won apa te kwo atli sho asi...,Showarok solo'tali Won apart quo'tali who Asia...
...,...,...,...,...,...,...,...,...,...,...,...,...
15346,15346,t2_422adf8q,Wenn bis zum Ausbruch der hypothetischen Epsil...,Austria,Wenn bis zum Ausbruch der hypothetischen Epsil...,1500,1489,11,de,0,RPLCMNT_NNGLSH,
15347,15347,t2_ofl1u5f,"was handed to them, that is then dropped by th...",Romania,"was handed to them, that is then dropped by th...",1500,1484,16,en,1,was handed to them that is then dropped by the...,"Was handed to them, that is then dropped by th..."
15348,15348,t2_3edl7,experience this when you do assembly programmi...,Germany,experience this when you do assembly programmi...,1500,1465,35,en,1,experience this when you do assembly programmi...,Experience this when you do assembly programmi...
15349,15349,t2_au7t1,us for a while now (such as [this](url from 20...,United Kingdom,us for a while now (such as [this]( from 2018)...,1500,1481,19,en,1,us for a while now such as this from 2018 I ve...,Us for a while now (such as [this](from 2018)....


In [42]:
#print(df.columns)

Index(['Unnamed: 0', 'auhtor_ID', 'post', 'nationality', 'cleaned_text',
       'original_word_count', 'cleaned_word_count', 'word_count_difference'],
      dtype='object')
