# 1. Duplicates analysis

In [1]:
import pandas as pd
import numpy as np

# Load the raw tweets dataset (path is relative to the notebook's folder)
df = pd.read_csv(filepath_or_buffer='../../data/tweets.csv')

Check duplicated rows

In [2]:
# 36 duplicated entries on both tweet_text and cyberbullying_type
df.duplicated(keep='first').sum()

36

In [3]:
# Duplicated with equal tweet_text and equal cyberbullying_type (DROP)
df.loc[df.duplicated(keep=False)]

Unnamed: 0,tweet_text,cyberbullying_type
829,Our pancakes are selling like hotcakes Shaz - ...,not_cyberbullying
1712,This is the opportunity to prove ourselves lik...,not_cyberbullying
1758,Our pancakes are selling like hotcakes Shaz - ...,not_cyberbullying
1984,@TVWEEKmag: There is only 1 way to stay in the...,not_cyberbullying
2611,It wouldn't be fair. Kat knows NOTHING of fair...,not_cyberbullying
...,...,...
20604,A Pakistani court has sentenced 86 members of ...,religion
41403,"Still, Davis, who is gay, said he pays a socia...",ethnicity
46915,Racism won't stop as long as u stil select ur ...,ethnicity
46962,"Still, Davis, who is gay, said he pays a socia...",ethnicity


In [4]:
# Keep the first occurrence of each fully identical row
df = df.drop_duplicates(keep='first')

In [5]:
# df.to_csv(r"../../data/updated_tweets.csv", index=False)

### Now, we also observe tweets that are identical in tweet text but differ in cyberbullying type

In [6]:
# Count rows whose tweet_text repeats an earlier row, whatever the label
df.duplicated(subset='tweet_text').sum()

1639

In [7]:
# All rows whose tweet_text occurs more than once (every occurrence is kept)
repeated_text_mask = df.duplicated(subset=['tweet_text'], keep=False)
d = df[repeated_text_mask]
len(d)

3278

At this point, we have found 1639 pairs of duplicates on tweet_text. Let's observe how the 3278 rows involved are distributed across the cyberbullying types.

In [8]:
# Number of duplicated-text rows labelled 'religion'
d.loc[d['cyberbullying_type'] == 'religion'].shape[0]

6

In [9]:
# Number of duplicated-text rows labelled 'ethnicity'
d.loc[d['cyberbullying_type'] == 'ethnicity'].shape[0]

7

In [10]:
# Number of duplicated-text rows labelled 'age'
d.loc[d['cyberbullying_type'] == 'age'].shape[0]

0

In [11]:
# Number of duplicated-text rows labelled 'gender'
d.loc[d['cyberbullying_type'] == 'gender'].shape[0]

176

In [12]:
# Number of duplicated-text rows labelled 'not_cyberbullying'
d.loc[d['cyberbullying_type'] == 'not_cyberbullying'].shape[0]

1509

In [13]:
# Number of duplicated-text rows labelled 'other_cyberbullying'
d.loc[d['cyberbullying_type'] == 'other_cyberbullying'].shape[0]

1580

Now, we ask ourselves: how many of these pairs are composed of one 'other_cyberbullying' tweet and one 'not_cyberbullying' tweet?

In [14]:
# Keep only the duplicated-text rows carrying one of the two labels of interest,
# then count how many of them repeat the tweet_text of an earlier row.
two_label_mask = d['cyberbullying_type'].isin(['other_cyberbullying', 'not_cyberbullying'])
dataset = d[two_label_mask]
dataset.duplicated(subset=['tweet_text']).sum()

1456

In [15]:
#d[d['tweet_text'] == "@stockputout everything but mostly my priest"]

In [16]:
#d[d['tweet_text'] == "@Jason_Gio meh. :P  thanks for the heads up, but not too concerned about another angry dude on twitter."]

# 2. Normalization

In [17]:
pip install emoji contractions

Note: you may need to restart the kernel to use updated packages.


In [18]:
# Download the NLTK resources needed by the normalization helpers below:
# the stop-word lists (read through stopwords.words), plus 'punkt' and 'wordnet'
# (presumably the data behind word_tokenize and WordNetLemmatizer - verify against
# the installed NLTK version).
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alessia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alessia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alessia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
import pandas as pd
import numpy as np
import re
import emoji
import string
import contractions
from functools import reduce

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [20]:
#Opening .csv file (disabled: the DataFrame built in section 1 is reused as-is)
#df = pd.read_csv('../../data/updated_tweets.csv')

#Stop words for text cleaning: NLTK's English list extended with the token "mkr"
stop_words = set(stopwords.words('english')) | {"mkr"}

#Shared stemmer and lemmatizer instances used by the helper functions below
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [21]:
#Remove all textual emojis (emoticons) from the string
def remove_textual_emojis(tweet):
    """Return ``tweet`` with ASCII emoticons removed.

    Recognised emoticons: ``:D :P :3 :O :( :)``, ``:'(`` / ``:')``,
    ``;)`` / ``;(``, ``-.-``, ``^_^`` / ``^.^`` and ``x(`` / ``x)``.

    Matching is case-insensitive, so the function gives the same result
    whether it runs before or after the text has been lower-cased
    (``:D`` and ``:d`` are both removed).
    """
    # The dot in "-.-" and the carets in "^_^" are escaped so they are matched
    # literally; unescaped, "-.-" also hit things like "-a-" and "^[._]^" used
    # "^" as a start-of-string anchor and could never match.
    emoticon_pattern = r"(:[DP3O\(\)])|(:'[\)\(])|(;[\)\(])|(-\.-)|(\^[._]\^)|(x[\(\)])"
    return re.sub(emoticon_pattern, '', tweet, flags=re.IGNORECASE)


#Strip links, mentions (@) and a leading "RT" marker from the tweet
def remove_links_mentions(tweet):
    """Return ``tweet`` without mentions, links and a leading ``RT``.

    Any run of non-space characters starting with ``@``, ``http://``,
    ``https://`` or ``www`` is deleted, as is ``RT`` at the very start of
    the string. Surrounding whitespace is left untouched.
    """
    link_mention_pattern = r"((?:\@|https?\:\/\/|www)\S+)|(^RT)"
    cleaned = re.sub(link_mention_pattern, '', tweet)
    return cleaned


#Drop the hashtags that close the sentence; for all the others drop only the # symbol
def remove_hashtag(tweet):
    """Return ``tweet`` with trailing hashtags removed and inline ones unwrapped.

    A trailing run of whitespace-separated hashtags is deleted entirely;
    every remaining ``#tag`` is replaced by ``tag``.
    """
    without_trailing_tags = re.sub(r"(\s+#[\w-]+)+\s*$", '', tweet)
    unwrapped = re.sub(r"#([\w-]+)", r'\1', without_trailing_tags)
    return unwrapped


#Collapse runs of 2+ whitespace characters and trim the sentence at both ends
def remove_spaces(tweet):
    """Return ``tweet`` with whitespace runs collapsed and the ends trimmed.

    Each run of two or more whitespace characters becomes a single space;
    then one whitespace character is removed from the start and from the
    end of the string, if present.
    """
    substitutions = (
        (r"\s{2,}", ' '),   # collapse runs
        (r"^\s", ''),       # leading whitespace
        (r"\s$", ''),       # trailing whitespace
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet


#Drop every non-ASCII character (this includes graphical, non-textual emojis)
def remove_not_ASCII(tweet):
    """Return ``tweet`` with all characters outside the ASCII range removed."""
    ascii_only_bytes = tweet.encode("ascii", errors="ignore")
    return ascii_only_bytes.decode()


#Expand contractions (e.g.: can't => cannot)
def remove_contractions(tweet):
    """Return ``tweet`` with contractions expanded via ``contractions.fix``.

    The call is made with ``slang=True``.
    """
    expanded = contractions.fix(tweet, slang=True)
    return expanded


#Remove all stopwords
def remove_stopwords(tweet):
    """Return ``tweet`` re-joined with single spaces, without stop words.

    The text is split with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded (exact, case-sensitive
    membership test).
    """
    kept_tokens = []
    for token in word_tokenize(tweet):
        # keep only tokens that are not listed as stop words
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


#Remove punctuation symbols
def remove_punctuation(tweet):
    """Return ``tweet`` without ``string.punctuation`` characters.

    The result is passed through ``remove_spaces`` to tidy the whitespace
    left behind by the deleted symbols.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = tweet.translate(punctuation_table)
    return remove_spaces(without_punctuation)


#Lemmatization
def lemmatization(tweet):
    """Return ``tweet`` with every whitespace-separated word lemmatized.

    ``lemmatizer.lemmatize`` works on a single word, so the tweet is split
    into words, each word is lemmatized on its own (with the lemmatizer's
    default part-of-speech setting) and the results are joined back with
    single spaces. Passing the whole tweet in one call, as was done before,
    treats the sentence as one unknown "word" and returns it unchanged.

    An empty or whitespace-only tweet yields an empty string.
    """
    return ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())


#Stemming
def stemming(tweet):
    """Return ``tweet`` with every token replaced by its Porter stem.

    The tweet is tokenized with ``word_tokenize`` and the stems are joined
    with single spaces. The previous fold concatenated the stems with no
    separator, which fused the whole tweet into one word.

    An empty tweet yields an empty string.
    """
    return ' '.join(ps.stem(token) for token in word_tokenize(tweet))


#Shorten elongated words: 3+ repetitions of a character become 2 (e.g.: coooll => cooll)
def remove_elongated_words(tweet):
    """Return ``tweet`` with character runs of three or more cut down to two.

    Digits are left alone so that numbers keep their value: without this
    exclusion ``2000`` would be rewritten as ``200``. Runs of exactly two
    characters are already in their final form and are not touched.
    """
    # [^\d\n]: any character except digits (and newline, which the former
    # "." pattern did not match either).
    return re.sub(r'([^\d\n])\1{2,}', r'\1\1', tweet)


#Remove & and $ symbols
def remove_special_characters(tweet):
    """Return ``tweet`` with every ``&`` and ``$`` character deleted."""
    special_characters = r'[&$]'
    stripped = re.sub(special_characters, '', tweet)
    return stripped


def remove_short_tweets(tweet, min_words=3):
    """Return ``tweet`` unchanged, or ``""`` when it is too short.

    A tweet is too short when it has fewer than ``min_words``
    whitespace-separated words.
    """
    word_count = len(tweet.split())
    if word_count < min_words:
        return ""
    return tweet

In [22]:
def normalize_tweet(tweet):
    """Run the full cleaning pipeline on one tweet and return the result.

    The steps are applied in the order listed below. The final step returns
    an empty string when fewer than three words are left.
    """
    cleaning_steps = (
        remove_links_mentions,
        str.lower,
        remove_hashtag,
        remove_special_characters,
        remove_spaces,
        remove_textual_emojis,
        remove_not_ASCII,
        remove_contractions,
        remove_stopwords,
        remove_punctuation,
        remove_elongated_words,
        lemmatization,
        # stemming is currently disabled (it was left commented out)
    )
    for step in cleaning_steps:
        tweet = step(tweet)

    return remove_short_tweets(tweet, min_words=3)

In [23]:
#Apply normalization to all tweets (element-wise over the tweet_text column)
df['tweet_text'] = df['tweet_text'].map(normalize_tweet)

In [24]:
#Delete the tweets left empty by the normalization (273 entries removed)
df = df.loc[df['tweet_text'] != '']

In [25]:
#Export the normalized tweets (the DataFrame index is not written to the file)
df.to_csv(path_or_buf=r"../../data/normalized_tweets.csv", index=False)