# 2. Normalization

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import string
import contractions
from functools import reduce

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
#Load the tweets dataset from the .csv file
df = pd.read_csv('../../data/updated_tweets.csv')

#Stop words used for text cleaning: NLTK's English list plus the extra token "mkr"
stop_words = set(stopwords.words('english')) | {"mkr"}

#Shared NLTK lemmatizer and stemmer instances used by the helpers below
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
#Remove textual (ASCII) emoticons from the string
def remove_textual_emojis(tweet):
    """Return *tweet* with common text emoticons replaced by the empty string.

    Recognised emoticons: ``:D :d :P :p :3 :O :o :( :)``, ``:'(`` / ``:')``,
    ``;(`` / ``;)``, ``-.-`` / ``-_-``, ``^.^`` / ``^_^`` and ``x(`` / ``x)``.

    Unicode (non-textual) emoji are not handled here.

    NOTE: patterns are matched anywhere in the text, so e.g. the ``:3`` inside
    ``12:30`` is removed as well.
    """
    return re.sub(
        # ``d`` is listed next to ``D`` so ``:D`` is still caught when the
        # caller has already lowercased the text.
        r"(:[DdPp3Oo()])"
        r"|(:'[)(])"
        r"|(;[)(])"
        # The middle character is restricted to ``.``/``_``; an unescaped
        # ``.`` would delete any "-?-" sequence such as the "-a-" in "well-a-day".
        r"|(-[._]-)"
        # The carets are escaped: unescaped they are anchors and never match.
        r"|(\^[._]\^)"
        r"|(x[()])",
        '',
        tweet,
    )


#Remove links, mentions (@) and the leading retweet marker
def remove_links_mentions(tweet):
    """Return *tweet* without URLs, @mentions and a leading ``RT`` marker.

    Removed spans:
      * any run of non-whitespace starting with ``@``, ``http://``,
        ``https://`` or ``www``;
      * the token ``RT`` at the very start of the text.

    Surrounding whitespace is left in place.
    """
    # ``\b`` keeps words that merely start with "RT" (e.g. "RTX") intact.
    return re.sub(r"((?:@|https?://|www)\S+)|(^RT\b)", '', tweet)


#Drop the hashtags trailing the sentence and strip the # symbol from the others
def remove_hashtag(tweet):
    """Return *tweet* with trailing hashtags removed and inline hashtags un-hashed.

    A run of whitespace-preceded hashtags at the end of the text is deleted
    entirely; every remaining ``#tag`` is replaced by ``tag``.
    """
    trailing_tags = r"(\s+#[\w-]+)+\s*$"
    inline_tag = r"#([\w-]+)"

    without_trailing = re.sub(trailing_tags, '', tweet)
    return re.sub(inline_tag, r'\1', without_trailing)


#Collapse runs of 2+ whitespace characters and trim the ends of the sentence
def remove_spaces(tweet):
    """Return *tweet* with whitespace runs collapsed and the ends trimmed.

    Every run of two or more whitespace characters becomes a single space;
    afterwards one whitespace character is removed from the start and from
    the end of the text, if present. A lone whitespace character inside the
    text (e.g. a single tab) is kept as it is.
    """
    collapsed = re.sub(r"\s{2,}", ' ', tweet)
    # After collapsing, at most one whitespace character can sit at each end,
    # so a single pass over both alternatives trims the text.
    return re.sub(r"^\s|\s$", '', collapsed)


#Remove non-ASCII characters (this includes non-textual emojis)
def remove_not_ASCII(tweet):
    """Return *tweet* with every character outside the ASCII range dropped."""
    # Characters that cannot be encoded as ASCII are silently skipped.
    ascii_bytes = tweet.encode("ascii", "ignore")
    return ascii_bytes.decode()


#Expand contractions (e.g.: can't => cannot)
def remove_contractions(tweet):
    """Return *tweet* with contractions expanded by ``contractions.fix``.

    The call is made with ``slang=True``.
    """
    expanded = contractions.fix(tweet, slang=True)
    return expanded


#Remove all stopwords
def remove_stopwords(tweet):
    """Return *tweet* without the tokens listed in ``stop_words``.

    The text is split with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded and the remaining ones are
    joined back with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(tweet) if token not in stop_words]
    return ' '.join(kept_tokens)


#Remove punctuation symbols
def remove_punctuation(tweet):
    """Return *tweet* without ASCII punctuation, with whitespace cleaned up.

    Every character of ``string.punctuation`` is deleted, then the result is
    passed through ``remove_spaces``.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = tweet.translate(punctuation_table)
    return remove_spaces(without_punctuation)


#Lemmatization
def lemmatization(tweet):
    """Return *tweet* with every word replaced by its lemma.

    ``lemmatizer.lemmatize`` works on a single word, so handing it the whole
    tweet treated a multi-word text as one (unknown) word. The text is
    therefore split with ``word_tokenize`` first, each token is lemmatized on
    its own, and the lemmas are joined back with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(tweet))


#Stemming
def stemming(tweet):
    """Return *tweet* with every word replaced by its Porter stem.

    The text is split with ``word_tokenize`` and each token is stemmed with
    ``ps.stem``. The stems are joined with single spaces; concatenating them
    with no separator would glue all the words into one.
    """
    return ' '.join(ps.stem(token) for token in word_tokenize(tweet))


#Shorten characters repeated 3+ times to exactly two (e.g.: coooll => cooll)
def remove_elongated_words(tweet):
    """Return *tweet* with elongated character runs reduced to two characters.

    Any character repeated two or more times in a row is replaced by exactly
    two copies of it, so ``"coooll"`` becomes ``"cooll"``.

    Digits are left untouched: collapsing them would change numbers
    (``"1000"`` would turn into ``"100"``).
    """
    # ``[^\d\n]`` keeps the "." behaviour (no newline) while skipping digits.
    return re.sub(r'([^\d\n])\1+', r'\1\1', tweet)


#Remove & and $ symbols
def remove_special_characters(tweet):
    """Return *tweet* with every ``&`` and ``$`` character deleted."""
    unwanted = str.maketrans("", "", "&$")
    return tweet.translate(unwanted)

In [None]:
def normalize_tweet(tweet):
    """Run the full normalization pipeline on *tweet* and return the result.

    The steps are applied in this order: links/mentions removal, lowercasing,
    hashtag handling, ``&``/``$`` removal, whitespace cleanup, textual emoji
    removal, non-ASCII removal, contraction expansion, stopword removal,
    punctuation removal, elongated-word shortening and lemmatization.

    ``stemming`` is not part of the pipeline.
    """
    pipeline = (
        remove_links_mentions,
        str.lower,
        remove_hashtag,
        remove_special_characters,
        remove_spaces,
        remove_textual_emojis,
        remove_not_ASCII,
        remove_contractions,
        remove_stopwords,
        remove_punctuation,
        remove_elongated_words,
        lemmatization,
    )

    # Each step takes the text produced by the previous one.
    for step in pipeline:
        tweet = step(tweet)

    return tweet

In [None]:
#Normalize every tweet and store the result back in the tweet_text column
df["tweet_text"] = list(map(normalize_tweet, df["tweet_text"]))

In [None]:
#Drop the tweets left empty by the normalization (273 entries were removed)
empty_tweet_labels = df.loc[df["tweet_text"] == r''].index
df = df.drop(empty_tweet_labels)

In [None]:
#Export the normalized tweets (the index column is not written)
normalized_csv_path = r"../../data/normalized_tweets.csv"
df.to_csv(normalized_csv_path, index=False)