# Normalization

In [46]:
import pandas as pd
import numpy as np
import re
import emoji
import string
import contractions
from functools import reduce

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

In [47]:
# Stop words used for text cleaning: NLTK's English list extended with the extra token "mkr"
stop_words = set(stopwords.words('english')) | {"mkr"}

# Module-level lemmatizer and stemmer instances
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()


In [48]:
#Replace all textual emojis (ASCII emoticons) in the string with the empty string
def remove_textual_emojis(tweet):
    """Strip ASCII emoticons such as ``:)``, ``:'(``, ``;)``, ``-.-``, ``^_^`` and ``x(``.

    Only the characters of the emoticon itself are removed; surrounding
    whitespace is left untouched, so callers that need tidy spacing must
    normalize it afterwards. Unicode (pictographic) emojis are not handled
    by this function.

    Args:
        tweet: Text to clean.

    Returns:
        The text with every recognized emoticon removed.
    """
    return re.sub(
        r":[DdPpOo()]"   # :D :P :O :) :( -- both letter cases, since callers may lowercase first
        r"|:3(?!\d)"     # :3, but not the ":3" found inside a time such as 12:30
        r"|:'[()]"       # :') :'(
        r"|;[()]"        # ;) ;(
        r"|-[._]-"       # -.- -_-  (literal dot/underscore, not "any character")
        r"|\^[._]\^"     # ^.^ ^_^  (carets must be escaped, otherwise they are anchors)
        r"|[xX][()]",    # x) x( X) X(
        '',
        tweet,
    )


#Replace links, mentions (@) and a leading "RT" marker with the empty string
def remove_links_mentions(tweet):
    """Remove URLs, ``@mentions`` and a leading retweet marker from a tweet.

    A token is removed up to the next whitespace when it starts with ``@``,
    ``http://``, ``https://`` or ``www.``. The ``www.`` prefix must start a
    word and include the dot, so ordinary words that merely contain "www"
    (e.g. "awwww") are kept. ``RT`` is removed only when it is a whole word
    at the very start of the text, so words such as "RTX" are kept.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text without links, mentions and the leading ``RT``; surrounding
        whitespace is not normalized.
    """
    return re.sub(r"(?:@|https?://|\bwww\.)\S+|^RT\b", '', tweet)


#Drop the run of hashtags that closes the tweet; for every other hashtag keep the word and drop the '#'
def remove_hashtag(tweet):
    """Remove trailing hashtags entirely and un-hash the remaining ones.

    A trailing run is one or more whitespace-preceded ``#tag`` tokens
    followed only by optional whitespace up to the end of the text.

    Args:
        tweet: Text to clean.

    Returns:
        The text without the trailing hashtag run and with ``#word``
        replaced by ``word`` elsewhere.
    """
    trailing_tags = re.compile(r"(\s+#[\w-]+)+\s*$")
    inline_tag = re.compile(r"#([\w-]+)")

    without_trailing = trailing_tags.sub('', tweet)
    return inline_tag.sub(r'\1', without_trailing)


#Collapse runs of 2+ whitespace characters into one space and trim one whitespace character at each end
def remove_spaces(tweet):
    """Normalize whitespace in a tweet.

    The substitutions are applied in order: whitespace runs are collapsed
    first, so at most one whitespace character can remain at either end
    for the following two steps to remove.

    Args:
        tweet: Text to clean.

    Returns:
        The text with collapsed inner whitespace and no leading or
        trailing whitespace.
    """
    substitutions = (
        (r"\s{2,}", ' '),  # collapse whitespace runs
        (r"^\s", ''),      # leading whitespace
        (r"\s$", ''),      # trailing whitespace
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet


#Remove non-ASCII characters (this includes non-textual emojis)
def remove_not_ASCII(tweet):
    """Drop every character that cannot be encoded as ASCII.

    Args:
        tweet: Text to clean.

    Returns:
        The text restricted to its ASCII characters; removed characters
        leave no placeholder behind.
    """
    ascii_bytes = tweet.encode("ascii", errors="ignore")
    return ascii_bytes.decode()


#Expand contractions (e.g.: can't => cannot)
def remove_contractions(tweet):
    """Expand contractions in the tweet using the ``contractions`` package.

    Slang expansion is enabled (``slang=True``).

    Args:
        tweet: Text to expand.

    Returns:
        The text as returned by ``contractions.fix``.
    """
    expanded = contractions.fix(tweet, slang=True)
    return expanded


#Remove all stopwords
def remove_stopwords(tweet):
    """Tokenize the tweet and drop every token found in ``stop_words``.

    The comparison is an exact (case-sensitive) membership test against
    the module-level ``stop_words`` set.

    Args:
        tweet: Text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    return ' '.join(
        token for token in word_tokenize(tweet) if token not in stop_words
    )


#Remove punctuation symbols
def remove_punctuation(tweet):
    """Delete every ``string.punctuation`` character, then normalize whitespace.

    Args:
        tweet: Text to clean.

    Returns:
        The text without punctuation, passed through ``remove_spaces`` so
        gaps left by removed symbols are collapsed.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = tweet.translate(punctuation_table)
    return remove_spaces(without_punctuation)


#Lemmatization
def lemmatization(tweet):
    """Lemmatize every token of the tweet with the module-level lemmatizer.

    ``WordNetLemmatizer.lemmatize`` works on a single word, so the tweet is
    tokenized first and each token is lemmatized on its own; passing the
    whole tweet in one call would look it up as one word.

    No part-of-speech information is supplied, so the lemmatizer's default
    POS is used for every token.

    Args:
        tweet: Text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces ("" for an empty tweet).
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(tweet))


#Stemming
def stemming(tweet):
    """Stem every token of the tweet with the module-level Porter stemmer.

    Args:
        tweet: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces, so word boundaries are
        preserved ("" for an empty tweet).
    """
    # Join with a space: concatenating the stems directly would fuse all
    # words of the tweet into one token.
    return ' '.join(ps.stem(token) for token in word_tokenize(tweet))


#Shorten runs of 3+ identical characters to two (e.g.: coooll => cooll)
def remove_elongated_words(tweet):
    """Reduce any run of a repeated character to exactly two occurrences.

    Digits are excluded so that numbers keep their value (``10000`` is not
    turned into ``100``). As with ``.``, newlines are never matched.

    Args:
        tweet: Text to clean.

    Returns:
        The text with elongated character runs shortened to two characters.
    """
    return re.sub(r'([^\d\n])\1+', r'\1\1', tweet)


#Remove the HTML entity &amp; and the & and $ symbols
def remove_special_characters(tweet):
    """Remove ``&amp;`` entities and stray ``&`` / ``$`` characters.

    The entity is matched as a whole (``&amp;``), so the letters "amp"
    inside ordinary words such as "example" or "camp" are left alone and
    no dangling ``;`` remains after the entity is removed.

    Args:
        tweet: Text to clean.

    Returns:
        The text without ``&amp;``, ``&`` and ``$``; whitespace is not
        normalized.
    """
    return re.sub(r'&amp;|[&$]', '', tweet)


#Blank out tweets that have fewer than min_words whitespace-separated words
def remove_short_tweets(tweet, min_words=3):
    """Return the tweet unchanged if it is long enough, otherwise "".

    Args:
        tweet: Text to check.
        min_words: Minimum number of whitespace-separated words required
            to keep the tweet.

    Returns:
        ``tweet`` itself when it has at least ``min_words`` words, else
        the empty string.
    """
    if len(tweet.split()) < min_words:
        return ""
    return tweet

In [49]:
def lemmatization_with_pos(sentence):
    """Lemmatize each token of the sentence using its part-of-speech tag.

    Tokens are tagged with ``pos_tag``; the first letter of each tag is
    mapped to a WordNet POS and passed to the lemmatizer. Tokens whose tag
    has no WordNet counterpart are kept as they are.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # First letter of the tag -> WordNet POS. Adjective tags start with "J"
    # (JJ, JJR, JJS) whereas WordNet names adjectives "a", so the letter
    # cannot be used as-is for adjectives.
    tag_initial_to_wordnet = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

    wnl = WordNetLemmatizer()
    lemmatized_words = []

    for word, tag in pos_tag(word_tokenize(sentence)):
        wntag = tag_initial_to_wordnet.get(tag[0].lower())
        if wntag is None:
            # No WordNet POS for this tag: leave the token untouched
            lemmatized_words.append(word)
        else:
            lemmatized_words.append(wnl.lemmatize(word, pos=wntag))

    return ' '.join(lemmatized_words)

In [50]:
def clean_normalized_df(df):
    """Drop rows emptied by normalization, then drop duplicate rows.

    Args:
        df: DataFrame with a ``tweet_text`` column.

    Returns:
        A new DataFrame without rows whose ``tweet_text`` is the empty
        string and without fully duplicated rows. The original index labels
        of the surviving rows are kept.
    """
    # Delete tweets left empty by the normalization (the original author
    # noted that this removed 273 entries on their data). A boolean mask is
    # used instead of dropping by index label so that, when index labels
    # are not unique, non-empty rows sharing a label with an empty one
    # are not removed as well.
    df = df[df.tweet_text != '']

    # Remove rows that became identical after the normalization
    return df.drop_duplicates()

In [51]:
def Bayes_normalization(tweet):
    """Run the full Naive Bayes cleaning pipeline on a single tweet.

    The steps are applied strictly in the order listed below. Stemming is
    not part of the pipeline; POS-aware lemmatization is used instead.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet, or "" when fewer than 3 words remain.
    """
    steps = (
        remove_links_mentions,
        str.lower,
        remove_hashtag,
        remove_special_characters,
        remove_spaces,
        remove_textual_emojis,
        remove_not_ASCII,
        remove_contractions,
        remove_stopwords,
        remove_punctuation,
        remove_elongated_words,
        lemmatization_with_pos,
    )
    for step in steps:
        tweet = step(tweet)

    return remove_short_tweets(tweet, min_words=3)

def Bayes_preprocessing(df):
    """Normalize the ``tweet_text`` column for Naive Bayes and clean the frame.

    Note that the ``tweet_text`` column of the DataFrame passed in is
    overwritten in place before the cleaned frame is built.

    Args:
        df: DataFrame with a ``tweet_text`` column.

    Returns:
        The result of ``clean_normalized_df`` on the normalized frame.
    """
    normalized_text = df['tweet_text'].apply(Bayes_normalization)
    df['tweet_text'] = normalized_text
    return clean_normalized_df(df)

In [52]:
def Transformers_normalization(tweet):
    """Run the light cleaning pipeline used for the Transformers models.

    Links, mentions, hashtag markers, special characters, textual emojis
    and non-ASCII characters are removed and the text is lowercased.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet; "" when nothing but whitespace is left.
    """
    tweet = remove_links_mentions(tweet)
    tweet = tweet.lower()
    tweet = remove_hashtag(tweet)
    tweet = remove_special_characters(tweet)
    tweet = remove_textual_emojis(tweet)
    tweet = remove_not_ASCII(tweet)

    # Whitespace is normalized last: the removals above leave gaps behind,
    # and a tweet reduced to whitespace must become "" so that
    # clean_normalized_df can drop it.
    return remove_spaces(tweet)

def Transformers_preprocessing(df):
    """Normalize the ``tweet_text`` column for Transformers and clean the frame.

    Note that the ``tweet_text`` column of the DataFrame passed in is
    overwritten in place before the cleaned frame is built.

    Args:
        df: DataFrame with a ``tweet_text`` column.

    Returns:
        The result of ``clean_normalized_df`` on the normalized frame.
    """
    normalized_text = df['tweet_text'].apply(Transformers_normalization)
    df['tweet_text'] = normalized_text
    return clean_normalized_df(df)

In [53]:
def LSTM_normalization(tweet):
    """Return the tweet unchanged; no LSTM-specific normalization is applied yet."""
    #Hi Chiara, add whatever you need here
    return tweet

def LSTM_preprocessing(df):
    """Normalize the ``tweet_text`` column for the LSTM model and clean the frame.

    Note that the ``tweet_text`` column of the DataFrame passed in is
    overwritten in place before the cleaned frame is built.

    Args:
        df: DataFrame with a ``tweet_text`` column.

    Returns:
        The result of ``clean_normalized_df`` on the normalized frame.
    """
    normalized_text = df['tweet_text'].apply(LSTM_normalization)
    df['tweet_text'] = normalized_text
    return clean_normalized_df(df)

In [54]:
# CSV files to preprocess (a set, so the iteration order is not fixed)
pool = {
    "../../data/train_tweets.csv",
    "../../data/eval_tweets.csv",
    "../../data/test_tweets.csv",
}


In [55]:
# Write the Naive Bayes variant of every CSV in the pool next to the original file
for file in pool:
    df = Bayes_preprocessing(pd.read_csv(file))
    file = file.split(".csv")[0]  # path without the ".csv" extension
    df.to_csv(f"{file}_Naive_Bayes.csv", index=False)
    




In [56]:
# Write the LSTM variant of every CSV in the pool next to the original file
for file in pool:
    df = LSTM_preprocessing(pd.read_csv(file))
    file = file.split(".csv")[0]  # path without the ".csv" extension
    df.to_csv(f"{file}_LSTM.csv", index=False)

In [57]:
# Write the Transformers variant of every CSV in the pool next to the original file
for file in pool:
    df = Transformers_preprocessing(pd.read_csv(file))
    file = file.split(".csv")[0]  # path without the ".csv" extension
    df.to_csv(f"{file}_Transformers.csv", index=False)