In [35]:
!pip install emoji contractions

[0m

In [36]:
import re
import nltk
import emoji
import pandas as pd
import contractions

In [37]:
import json

# Load the JSON file into a Python dictionary.
# `emotion_dict` is a notebook-level global; presumably it maps emoticon
# strings to emotion words, which is the shape
# replace_emoticons_with_emotions() iterates over -- verify against the file.
with open('data/emoticon_dict.json', 'r') as file:
    emotion_dict = json.load(file)

def replace_emoticons_with_emotions(text, emotion_dict):
    """Replace every emoticon found in ``text`` with its mapped replacement.

    Args:
        text: Input string.
        emotion_dict: Mapping of emoticon string -> replacement string.

    Returns:
        ``text`` with every occurrence of each emoticon key substituted.
    """
    # Longest emoticons first, so a short one such as ":)" cannot consume the
    # prefix of a longer one such as ":))" and leave a stray ")" behind.
    # sorted() is stable, so same-length keys keep their mapping order.
    for emoticon in sorted(emotion_dict, key=len, reverse=True):
        if not emoticon:
            # str.replace('', x) would insert x between every character.
            continue
        text = text.replace(emoticon, emotion_dict[emoticon])
    return text

In [38]:
def convert_emojis_to_text(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

def contains_emoji(text):
    """Return True when ``text`` contains at least one emoji, otherwise False.

    ``emoji.get_emoji_regexp()`` was removed in emoji 2.0, and this notebook
    installs the package unpinned, so the check uses ``emoji.emoji_count``
    instead.
    """
    return emoji.emoji_count(text) > 0

# Example usage: convert_emojis_to_text() rewrites the two emojis as
# ":name:" tokens; the ":)" emoticon is left untouched (see the printed output).
text = "I'm so happy :) but I'm also frustrated 😠 and confused 😕."
text_with_emojis_converted = convert_emojis_to_text(text)
print(text_with_emojis_converted)

I'm so happy :) but I'm also frustrated :angry_face: and confused :confused_face:.


In [39]:
import re
import csv

def create_dict_from_csv(file_name):
    """Build a dictionary from the first two columns of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    with fewer than two columns are ignored. When a key appears more than
    once, the last row wins.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        dict mapping each row's first column to its second column; empty if
        the file has no data rows.
    """
    dictionary = {}
    # newline='' is what the csv module expects, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file_name, mode='r', newline='') as file:
        csv_reader = csv.reader(file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(csv_reader, None)
        for row in csv_reader:
            if len(row) < 2:
                # csv.reader yields [] for blank lines; short rows have no value.
                continue
            dictionary[row[0]] = row[1]
    return dictionary

# Notebook-level lookup read by expand_abbreviations(); built from the first
# two columns of the CSV (presumably abbreviation -> expanded form -- verify
# against data/abbv.csv).
abbreviations_dict = create_dict_from_csv('data/abbv.csv')


In [40]:
def expand_abbreviations(text, abbreviations=None):
    """Replace whole-word abbreviations in ``text`` with their expansions.

    Args:
        text: Input string.
        abbreviations: Optional mapping of abbreviation -> expansion. When
            omitted, the notebook-level ``abbreviations_dict`` is used.

    Returns:
        ``text`` with every stand-alone abbreviation expanded. Entries are
        applied one after another in mapping order.
    """
    if abbreviations is None:
        abbreviations = abbreviations_dict

    for abbr, full_form in abbreviations.items():
        if not abbr:
            # An empty key would match at every position.
            continue
        # re.escape keeps abbreviations such as "c++" or "w/" from being read
        # as regex syntax. The lookarounds mean "not glued to a word
        # character"; for purely alphanumeric abbreviations this is the same
        # as \b...\b, and it also works when the abbreviation starts or ends
        # with punctuation.
        pattern = r'(?<!\w)' + re.escape(abbr) + r'(?!\w)'
        # A callable replacement inserts the expansion literally, so
        # backslashes in it are not treated as group references.
        text = re.sub(pattern, lambda _match, _full=full_form: _full, text)
    return text

In [41]:
def custom_correct_spelling(text, correct_words):
    """Apply each misspelling -> correction pair in ``correct_words`` to ``text``.

    Matching is plain substring replacement, applied in mapping order, so a
    misspelling embedded in a longer word is replaced as well.
    """
    for misspelled, corrected in correct_words.items():
        if misspelled not in text:
            continue
        text = text.replace(misspelled, corrected)
    return text


# Known misspellings and the spelling they should be corrected to.
correct_words = {"hapy": "happy"}

In [42]:
import re

def remove_repeated_chars(s):
    """Shorten any run of three or more identical word characters to two."""
    long_run = re.compile(r'(\w)\1{2,}')
    shortened = long_run.sub(r'\1\1', s)
    return shortened

def remove_spaces(text):
    """Delete the single space between two adjacent one-character words.

    This joins text typed as "e v e r y t h i n g" back into one word.
    """
    # A space with a stand-alone word character on each side.
    gap_between_single_chars = re.compile(r"(?<=\b\w) (?=\w\b)")
    return gap_between_single_chars.sub('', text)

In [43]:
def remove_hyphenated_capitalized_words(text):
    """Remove the hyphens inside chains of capitalized words.

    For example "Well-Known" becomes "WellKnown". A hyphen that is followed by
    a lower-case word is left in place.
    """
    def _strip_hyphens(match):
        return match.group(0).replace('-', '')

    capitalized_chain = re.compile(r'\b([A-Z][a-z]*-)*([A-Z][a-z]*)\b')
    return capitalized_chain.sub(_strip_hyphens, text)

def normalize_hashtags(hashtags):
    """Return a new list in which every capital letter of each hashtag is preceded by a space."""
    capital_letter = re.compile(r'([A-Z])')
    return [capital_letter.sub(r' \1', tag) for tag in hashtags]

In [44]:
def preprocess_text(text):
    """Clean one raw tweet into lower-case text without punctuation.

    Steps, in order: split CamelCase hashtags, drop URLs and @mentions,
    flatten newlines, rewrite "&amp;", turn periods into spaces, join
    hyphenated capitalized words, lower-case, expand abbreviations, join
    letters typed with spaces between them, expand contractions, convert
    emojis to their text names, replace "/" with " or ", replace every other
    non-word character with a space, shorten repeated characters, apply the
    custom spelling corrections, and drop a stand-alone "nan" token.

    Args:
        text: Raw tweet text. Any non-string value (for example the float NaN
            pandas uses for a missing cell) yields an empty string.

    Returns:
        The cleaned string.
    """
    # The regex calls below raise TypeError on non-strings such as float NaN.
    if not isinstance(text, str):
        return ""

    # Split hashtags into separate words. Substituting at the '#tag' match
    # itself leaves other occurrences of the same word in the tweet alone;
    # the '#' is kept here and removed with the other special characters.
    text = re.sub(
        r'#(\w+)',
        lambda match: '#' + normalize_hashtags([match.group(1)])[0],
        text,
    )

    # Remove URLs. The dot after "www" is escaped and anchored at a word
    # boundary, so ordinary text such as "awww so" is no longer swallowed.
    text = re.sub(r'http\S+|\bwww\.\S+|\S+\.ly|\S+\.ph|\S+\.net', '', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove newlines: both the literal two-character sequence "\n" and real
    # line breaks.
    text = text.replace(r"\n", " ").replace("\n", " ")

    # Replace &amp; with "and"
    text = text.replace(r"&amp;", "and")

    # Periods become spaces
    text = re.sub(r'\.', ' ', text)

    # Join hyphenated capitalized words
    text = remove_hyphenated_capitalized_words(text)

    # Lower-case everything
    text = text.lower()

    # Expand abbreviations
    text = expand_abbreviations(text)

    # Remove spaces between letters like e v e r y t h i n g
    text = remove_spaces(text)

    # Expand contractions, e.g. don't -> do not
    text = contractions.fix(text)

    # Convert emojis to their textual names. This has to run before the
    # special-character pass below, which would otherwise replace every emoji
    # with a space and leave nothing to convert. demojize() leaves text
    # without emojis unchanged, so no separate check is needed.
    text = convert_emojis_to_text(text)

    # Replace "/" with " or ", then every remaining non-word character with a space
    text = re.sub(r'\/', ' or ', text)
    text = re.sub(r'\W', ' ', text)

    # Shorten repeated characters, e.g. happppyyyyy -> happyy
    text = remove_repeated_chars(text)

    # Apply the custom spelling corrections
    text = custom_correct_spelling(text, correct_words)

    # Drop a stand-alone "nan" token only; a plain substring replace would
    # also damage words that merely contain it, such as "banana".
    text = re.sub(r'\bnan\b', '', text)

    return text


In [45]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on: 'punkt' (tokenizer
# models), 'wordnet' and 'omw-1.4' (lemmatizer data) and 'stopwords'.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [46]:
def load_stopwords_from_csv(file_path):
    """Read the CSV at ``file_path`` and return its ``word`` column as a list."""
    frame = pd.read_csv(file_path)
    return frame['word'].tolist()

def customize_stopwords(stopwords):
    """Return ``stopwords`` without the interjections, negations and amplifiers.

    The words to keep out of the stop-word list are read from the ``word``
    column of three CSV files under ``algo/``.

    Args:
        stopwords: Iterable of stop words to filter.

    Returns:
        A new list holding, in their original order, the entries of
        ``stopwords`` that appear in none of the three CSV files.
    """
    # Collect the words into a set: membership tests are O(1), where scanning
    # a combined list would be repeated for every stop word.
    words_to_keep = set()
    for csv_path in (
        'algo/interjections.csv',
        'algo/negations.csv',
        'algo/amplifiers.csv',
    ):
        words_to_keep.update(load_stopwords_from_csv(csv_path))

    return [word for word in stopwords if word not in words_to_keep]

In [47]:
# WordNet lemmatizer shared by every tokenize() call.
lemmatizer = WordNetLemmatizer()
# Lexicon table; tokenize() checks tokens against its 'word' column to decide
# which ones are kept un-lemmatized.
emotionlex_df = pd.read_csv('algo/emolex_words.csv')

# Per-kernel cache of the lookup sets tokenize() needs. It is filled on the
# first call and reset whenever this cell is re-run, so re-run the cell after
# changing the stop-word CSVs or emotionlex_df.
_TOKENIZE_LOOKUPS = {}


def _tokenize_lookups():
    """Build once, then return, the stop-word set and the lexicon word set."""
    if not _TOKENIZE_LOOKUPS:
        # Compute both before storing either, so a failure leaves the cache empty.
        stop_set = frozenset(customize_stopwords(stopwords.words('english')))
        lexicon_words = frozenset(emotionlex_df['word'].tolist())
        _TOKENIZE_LOOKUPS['stopwords'] = stop_set
        _TOKENIZE_LOOKUPS['lexicon'] = lexicon_words
    return _TOKENIZE_LOOKUPS['stopwords'], _TOKENIZE_LOOKUPS['lexicon']


def tokenize(text):
    """Tokenize ``text``, drop stop words and lemmatize the remaining tokens.

    Tokens found in the ``word`` column of ``emotionlex_df`` are kept exactly
    as written; every other token goes through the WordNet lemmatizer.

    Args:
        text: Pre-processed text to tokenize.

    Returns:
        List of token strings in their original order.
    """
    word_tokens = word_tokenize(text)

    # Building these per call meant re-reading three CSV files for every text
    # and rebuilding the whole lexicon list for every token.
    stop_set, lexicon_words = _tokenize_lookups()

    lemmatized_tokens = []
    for token in word_tokens:
        if token in stop_set:
            continue
        if token in lexicon_words:
            # Keep lexicon words unchanged so they still match the lexicon later.
            lemmatized_tokens.append(token)
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(token))

    return lemmatized_tokens


Applying preprocessing and tokenization to the datasets

In [48]:
# Read the training split, clean and tokenize every tweet, then re-join the
# tokens into one space-separated string per row.
df = pd.read_csv('dataset/tweets-train.csv')
df['processed'] = (
    df['Tweet']
    .apply(preprocess_text)
    .apply(tokenize)
    .apply(lambda tokens: ' '.join(tokens).replace('\\n', ''))
)
df = df.drop(columns=['Tweet'])

# Put 'processed' first; the remaining columns keep their existing order.
all_columns = ['processed'] + [name for name in df.columns.tolist() if name != 'processed']
df = df.reindex(columns=all_columns)

df.to_csv('processed/clean-train.csv', index=False)

In [49]:
# Read the dev split, clean and tokenize every tweet, then re-join the tokens
# into one space-separated string per row.
df = pd.read_csv('dataset/tweets-dev.csv')
df['processed'] = (
    df['Tweet']
    .apply(preprocess_text)
    .apply(tokenize)
    .apply(lambda tokens: ' '.join(tokens).replace('\\n', ''))
)
df = df.drop(columns=['Tweet'])

# Put 'processed' first; the remaining columns keep their existing order.
all_columns = ['processed'] + [name for name in df.columns.tolist() if name != 'processed']
df = df.reindex(columns=all_columns)

df.to_csv('processed/clean-dev.csv', index=False)

In [50]:
# Read the test split and run it through the same steps as the train and dev
# splits: clean, tokenize, then re-join the tokens into one string per row.
# The earlier extra remove_words(x, words_to_remove) step is dropped: neither
# name is defined in this notebook, so the cell failed on a fresh kernel, and
# the step was not applied to the other splits.
df = pd.read_csv('dataset/tweets-test.csv')
df['processed'] = df['Tweet'].apply(preprocess_text)
df['processed'] = df['processed'].apply(tokenize)
df['processed'] = df['processed'].apply(lambda x: ' '.join(x).replace('\\n', ''))
df = df.drop(columns=['Tweet'])

# Put 'processed' first; the remaining columns keep their existing order.
all_columns = df.columns.tolist()
all_columns.insert(0, all_columns.pop(all_columns.index('processed')))
df = df.reindex(columns=all_columns)

df.to_csv('processed/clean-test.csv', index=False)