In [1]:
import pandas as pd
import csv
import nltk
import re
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
from tokenizer import *
from nltk.corpus import stopwords

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import wordninja

from nltk.stem import SnowballStemmer

import ast

from bs4 import BeautifulSoup

import emoji
import unicodedata

import gzip

import spacy_udpipe
import spacy
import language_tool_python

import sys
import os
sys.path.append('..')

In [2]:
pd.set_option("display.max_colwidth", None)

In [3]:
#Udpipe
spacy_udpipe.download("it")
nlp = spacy_udpipe.load("it")

Already downloaded a model for the 'it' language


In [4]:
# names of files to read from
train_val_AB_TSV = '../../../SaRaH/dataset/haspeede2/raw/haspeede2_dev/haspeede2_dev_taskAB.tsv'

italian_words = '../../../SaRaH/dataset/words/parole_uniche.txt'
italian_gzip = '../../../SaRaH/dataset/words/italian_words.txt.gz'
bad_words = '../../../SaRaH/dataset/words/lista_badwords.txt'

test_tweets_AB_TSV = '../../../SaRaH/dataset/haspeede2/raw/haspeede2_test/haspeede2_test_taskAB-tweets.tsv'
test_news_AB_TSV = '../../../SaRaH/dataset/haspeede2/raw/haspeede2_test/haspeede2-test_taskAB-news.tsv'

reference_tweets_AB_TSV = '../../../SaRaH/dataset/haspeede2/raw/haspeede2_reference/haspeede2_reference_taskAB-tweets.tsv'
reference_news_AB_TSV = '../../../SaRaH/dataset/haspeede2/raw/haspeede2_reference/haspeede2_reference_taskAB-news.tsv'

In [5]:
#Wordninja
lm = wordninja.LanguageModel(italian_gzip)

In [6]:
#Italian dictionary
f1 = open(italian_words, 'r', encoding='utf8')

italian_dict = [] #list of lowercase words

for x in f1:
    y = x.rstrip()
    y = y.lower()
    if y != '':
        italian_dict.append(y)

In [7]:
#Bad Words
f2 = open(bad_words, 'r', encoding='utf8')

bad_words_dict = [] #list of lowercase words

for x in f2:
    y = x.rstrip()
    y = y.lower()
    if y != '':
        bad_words_dict.append(y)

In [8]:
#Dataset
df = pd.read_csv(train_val_AB_TSV, sep='\t')
df1 = pd.read_csv(train_val_AB_TSV, sep='\t')

In [9]:
df.rename(columns={"text ": "text"}, inplace=True) #the text column is identified by 'text ' (with a space at the end), change
df1.rename(columns={"text ": "text"}, inplace=True)

In [10]:
df

Unnamed: 0,id,text,hs,stereotype
0,2066,"È terrorismo anche questo, per mettere in uno stato di soggezione le persone e renderle innocue, mentre qualcuno... URL",0,0
1,2045,@user @user infatti finché ci hanno guadagnato con i campi #rom tutto era ok con #Alemanno #Ipocriti,0,0
2,61,"Corriere: Tangenti, Mafia Capitale dimenticataMazzette su buche e campi rom URL #roma",0,0
3,1259,"@user ad uno ad uno, perché quando i migranti israeliti arrivarono in terra di Canaan fecero fuori tutti i Canaaniti.",0,0
4,949,Il divertimento del giorno? Trovare i patrioti italiani che inneggiano contro i rom facendo la spesa alla #Lidl (multinazionale tedesca).,0,0
...,...,...,...,...
6832,9340,"Gli stati nazionali devono essere pronti a rinunciare alla propria sovranità. Lo ha detto la Merkel , che ha aggiunto che gli stati nazionali non devono ascoltare la volontà dei loro cittadini quando si tratta di questioni che riguardano immigrazione, confini, o persino sovranità URL",0,0
6833,9121,"Il ministro dell'interno della Germania #HorstSeehofer,sta facendo la proposta di dare soldi agli immigrati che vogliono tornare a casa e aiutarli a creare un'attività a casa loro e fare business con la Germania.Chi paga?Una parte i crucchi e il resto l'Europa, cioè io e voi!",0,0
6834,8549,"#Salvini: In Italia troppi si sono montati la testa, io ringrazio Dio e voi per questi mesi straordinari. Vi raccontavano che su immigrazione non si poteva fare nulla, è bastato usare buonsenso e coraggio. #iocisono #piazzadelpopolo",0,0
6835,9240,@user @user Chi giubila in buona fede non ha capito niente. Purtroppo credo che i più non siano in buona fede. I migranti sono un grosso business e chi finora li ha voluti non vuole perdere questo guadagno,0,0


<h3> Preprocessing

<h3> Removing URLs

In [11]:
def clean_url(text):
    return re.sub(r'URL', ' ', text)

In [12]:
df['text'] = df['text'].apply(clean_url)
df1['text'] = df['text'].apply(clean_url)

<h3> Removing Tags

In [13]:
def clean_tag(text):
    return re.sub(r'@user', ' ', text)

In [14]:
df['text'] = df['text'].apply(clean_tag)
df1['text'] = df['text'].apply(clean_tag)

<h3> Feature extraction: length of the comment

In [15]:
def text_length(text):
    return len(text)

In [16]:
df['text_length'] = df['text'].apply(text_length)

<h3> Normalizing emoticons

In [17]:
#https://github.com/cbaziotis/ekphrasis

text_processor = TextPreProcessor(
    fix_html=True,  # fix HTML tokens
    
    #unpack_hashtags=True,  # perform word segmentation on hashtags
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading english - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [18]:
def normalize_emoticon(text):
    return " ".join(text_processor.pre_process_doc(text))

In [19]:
df['text'] = df['text'].apply(normalize_emoticon)

<h3> Converting ':' into 'double_dots'

In [20]:
def convert_double_dots(text):
    return re.sub(r':', ' double_dots ', text)

In [21]:
df['text'] = df['text'].apply(convert_double_dots)

<h3> Translation of emoji

In [22]:
def translate_emoticon(text):
    text_result = emoji.demojize(text, language='it')
    return text_result

In [23]:
df['text'] = df['text'].apply(translate_emoticon)

<h3> Removing : (especially from emoji translation)

In [24]:
def clean_two_dots(text):
    return re.sub(r':', ' ', text)

In [25]:
df['text'] = df['text'].apply(clean_two_dots)

<h3> Converting 'double_dots' into ':'

In [26]:
def reconvert_double_dots(text):
    return re.sub(r'double_dots', ' : ', text)

In [27]:
df['text'] = df['text'].apply(reconvert_double_dots)

<h3> Add space before #

In [28]:
def add_space_before_hashtag(text): ##io#vado -> # #io #vado
    words = text.split()
    newwords = []
    for word in words:
        for i in range(0, len(word)):
            if i != 0:
                if word[i] == '#':
                    word = add_space_before_hashtag(word[:i]) + ' ' + add_space_before_hashtag(word[i:])
        newwords.append(word)
    return ' '.join(newwords)

In [29]:
df['text'] = df['text'].apply(add_space_before_hashtag)

<h3> Feature extraction: number of hashtags

In [30]:
def find_hashtags(text):
    result = re.findall(r'#\S+', text)
    return len(result)

In [31]:
df['hashtags'] = df['text'].apply(find_hashtags)

In [32]:
def normalize_hashtags(text):
    return re.sub(r'#\S+', '#hashtag', text)

In [33]:
df1['text'] = df1['text'].apply(normalize_hashtags)

<h3> Replacing hashtags

In [34]:
hashtag_words = ['lidl', 'roma', 'caritas', 'syria', 'isis', 'm5s', 'apatridi', 'brexit', 'sinta', 'msna', 'yacme',
                 'ckan', 'dimartedi', 'karak', 'cojoni', 'uae', 'scampia', 'onsci', 'hamas', 'ncd', 'olbes', 'fdian',
                 'acquarius', 'aquarius', 'macron', 'barbarians', 'kyenge', 'kienge', 'mef', 'muslim', 'error', 'soros',
                 'italexit', 'sprar', 'ahvaz', 'nsa', 'enez', 'daspo', 'cpr', 'desire', 'boldrina', 'msf', 'belgium',
                 'piddino', 'piddina', 'fdi', 'zarzis', 'eliminiamolo', 'strasbourg', 'isee', 'sophia', 'unit', 'oeshh',
                 'porrajmos', 'dibba', 'ciociaria', 'cie', 'junker', 'is', 'syriza', 'linate', 'raqqa', 'ama', 'cesedi',
                 'aicds', 'heidelberg', 'ffoo', 'cvd', 'forex', 'docufilm', 'reyn', 'hooligans', 'anpal', 'rdc', 'rohingya',
                 'nwo', 'def', 'cattivisti', 'vauro', 'sorosiane', 'libya', 'censis']

In [35]:
def add_space(text):
    words = text.split()
    newwords = []
    for word in words:
        for i in range(0, len(word)):
            if i != len(word)-1 and word[i] != ' ':
                if word[i].islower() and word[i+1].isupper():
                    word = word[:i+1] + ' ' + word[i+1:]
        newwords.append(word)
    return ' '.join(newwords)

In [36]:
def replace_hashtags(text):
    
    text = ' ' + text + ' '
    result = re.findall(r'#\S+', text)
    
    for word in result:
        new_word = '< '
        if word[1:].lower() not in hashtag_words:
            spaced_word = add_space(word)
            splitted = lm.split(spaced_word)
            
            for i in range(0, len(splitted)):
                if i == 0:
                    new_word = new_word + splitted[i]
                else:
                    new_word = new_word + ' ' + splitted[i]
        else:
            new_word = new_word + word[1:]
        new_word = new_word + ' >'
        
        text = text.replace(word, new_word)
        
    return text

In [37]:
df['text'] = df['text'].apply(replace_hashtags)

<h3> Fixing Hashtags

In [38]:
fixed_hashtags = {'je sui s Charlie':'je suis Charlie', 'libere t cogita n s':'libere t cogitans', 'm a g g i o':'maggio', 
                  'ha te speech':'hate speech', 'w ed din g tour i sm':'wedding tourism', 'in comin g':'incoming',
                  'dimarte d':'dimarted', 'dall a vostra':'dalla vostra', 'Trattati Rom a':'Trattati Roma',
                  'I u venta':'Iuventa', 'Woolf e':'Woolfe', 'Attuali t':'Attualit', 'Morni ng':'Morning',
                  'Fort Lau derda l e':'Fort Lauderdale', 'mi gran ts':'migrants', 'a p r i l e':'aprile',
                  'E n e r g y':'Energy', 'I g les':'Igles', 'Christ mas':'Christmas', 'Sud Tiro l':'Sud Tirol',
                  'Paler m':'Palerm', 'esma red ze po va':'esma redzepova', 'gip sy':'gipsy', 'auster it':'austerit',
                  'immigrati on ban':'immigration ban', 'Financial Time s':'Financial Times', 'metro rom a':'metro roma',
                  'Su ed deutsch e Ze i tung':'Sueddeutsche Zeitung', 'porta aporta':'porta a porta', 'terro r':'terror',
                  'immi gran ts':'immigrants', 'giornata dell a memoria':'giornata della memoria', 'm a r z o':'marzo',
                  'dusse l dor f':'dusseldorf', 'riscopriamo l i':'riscopriamoli', 'ultimo ra':'ultima ora',
                  'Cercate l i':'Cercateli', 'Islam op ho bia':'Islamophobia', 'd i c e m b r e':'dicembre',
                  'g e n n a i o':'gennaio', 'f e b b r a i o':'febbraio', 'g i u g n o':'giugno', 'l u g l i o':'luglio',
                  'a g o s t o':'agosto', 's e t t e m b r e':'settembre', 'o t t o b r e':'ottobre',
                  'n o v e m b r e':'novembre', 'ta gad al a 7':'tagada la 7', 'Coffe e break l a 7':'Coffee break la 7',
                  'A S Rom a':'Associazione Sportiva Roma', 'Stadio Dell a Rom a':'Stadio Della Roma',
                  'best songo f movie':'best song of movie', 'un avita in metro':'una vita in metro', 'l e iene':'le iene',
                  'l a zanzara':'la zanzara', 'charlie h e b do':'charlie hebdo', 'Loo king':'Looking',
                  'Rom a Non Alza Muri':'Roma Non Alza Muri', 'r e fu gee s':'refugees', 'non una dime mo':'non una di meno',
                  'i t a l i a':'italia', 'r ss':'rss', 'at tack':'attack', 'a t t u a l i t a':'attualità',
                  'no tengo din ero':'no tengo dinero', 'Hi jr i':'Hijri', 'Asl i Erdogan':'Asli Erdogan',
                  'i si s fu c k you':'isis fuck you', 'L a Ru spetta':'La Ruspetta', 'Go o g l e Aler ts':'Google Alerts',
                  'A mi s':'Amis', 'Bal on Mundial':'Balon Mundial', 'l i dl':'lidl', 'racis m':'racism ',
                  'C y ber':'Cyber', 'Sili con Valle y':'Silicon Valley', 'Medio Campi dan':'Medio Campidan',
                  'w e l comere fu gee s':'welcome refugees', 'Ta gada':'Tagada', 'Grande sy n the':'Grande synthe',
                  'contentivo i':'contenti voi', 'at tack s':'attacks', 'REY N':'REYN', 'Marine L e Pen':'Marine Le Pen',
                  't ru e story':'true story', 'brex it':'brexit', 'delledonne':'delle donne', 'fu c kis i s':'fuck isis',
                  'islam i s the pro ble m':'islam is the problem', 'l a gabbia':'la gabbia', 'fu c k islam':'fuck islam',
                  'fu c k':'fuck', 'fu c k musli ms':'fuck muslims', 'sapeva telo':'sapevatelo', 'R in y':'Riny',
                  'A f g han Con f':'Afghan Con f', 'Ch al i e H e b do':'Chelie Hebdo', 's k y':'sky', 'H e b do':'Hebdo',
                  'S ho ot in g':'Shooting', 'Islam i c State':'Islamic State', 'l a 7':'la 7', 'Daisy Os akue':'Daisy Osakue',
                  'laria che tira':'l aria che tira', 'i phon e X S max':'iphone XSmax', 'L e Ga':'lega',
                  'laria che tirala':'l aria che tira la', 'Casa po un d':'Casa pound', 'R M C NEWS':'RMC NEWS',
                  'm i l i o n i':'milioni', 'un altro cucchia in odi merda':'un altro cucchiaino di merda',
                  'omnibus l a 7':'omnibus la 7', 'job sa c t':'jobs act', 'Mi grati on':'Migration',
                  'Movi men t Onesti':'Moviment Onesti', 'none larena':'non è l arena' ,'Non Un ad i Meno':'Non Una di Meno',
                  'Fil c ams Collettiva':'Filcams Collettiva', 'Time s':'Times', 'ci vi l t allo sbando':'civiltà allo sbando',
                  'Am ne st y International':'Amnesty International', 'C H I U D E T E':'CHIUDETE', 'Open Arm s':'Open Arms',
                  'Gilet s J a un e s':'Gilets Jaunes', 'Mi grant I':'MigrantI', 'Horst Se e hofer':'Horst Seehofer',
                  '5s':'cinque stelle', 'rd c':'rdc', 'piazza delpopolo':'piazza del popolo'}

In [39]:
def hashtag_fix(text):
    for word in fixed_hashtags:
        #text = text.replace(word, fixed_hashtags[word])
        text = re.sub(re.escape(word), fixed_hashtags[word], text, flags=re.IGNORECASE)
    return text

In [40]:
df['text'] = df['text'].apply(hashtag_fix)

<h3> Normalizing Numbers

In [41]:
def normalize_numbers(text):
    try:
        val = int(text)
    except:
        text = re.sub('\d', '@Dg', text)
        return text
    if val >= 0 and val < 2100:
        return str(val)
    else:
        return "DIGLEN_" + str(len(str(val)))

In [42]:
df['text'] = df['text'].apply(normalize_numbers)
df1['text'] = df['text'].apply(normalize_numbers)

<h3> Removing _, \\n, \\ and /

In [43]:
def clean_some_punctuation(text):
    text = ' ' + text + ' '
    text = re.sub(r'\\n', '. ', text)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'/', ' ', text)
    return re.sub(r'_', ' ', text) 

In [44]:
df['text'] = df['text'].apply(normalize_numbers)
df1['text'] = df['text'].apply(normalize_numbers)

<h3> Add space between lowercase and uppercase

In [45]:
def add_space(text):
    words = text.split()
    newwords = []
    for word in words:
        for i in range(0, len(word)):
            if i != len(word)-1 and word[i] != ' ':
                if word[i].islower() and word[i+1].isupper():
                    word = word[:i+1] + ' ' + word[i+1:]
        newwords.append(word)
    return ' '.join(newwords)

In [46]:
df['text'] = df['text'].apply(add_space)
df1['text'] = df1['text'].apply(add_space)

<h3> Convert all emoticons written in text

In [47]:
emoticons_text = {
    '<kiss>': 'bacio',
    '<happy>': 'felice',
    '<laugh>': 'risata',
    '<sad>': 'triste',
    '<surprise>': 'sorpreso',
    '<wink>': 'occhiolino',
    '<tong>': 'faccia con lingua',
    '<annoyed>': 'annoiato',
    '<seallips>': 'labbra sigillate',
    '<angel>': 'angelo',
    '<devil>': 'diavolo',
    '<highfive>' : 'batti il cinque',
    '<heart>': 'cuore',
    '<user>' : 'persona',
}

In [48]:
def clean_emoticon_text(text):
    text_words = text.split()
    new_words  = [emoticons_text.get(ele, ele) for ele in text_words]
    return ' '.join(new_words)

In [49]:
df['text'] = df['text'].apply(clean_emoticon_text)

<h3> Feature extraction: percentage of words written in CAPS-LOCK

In [50]:
def caps_lock_words(text):
    words = text.split()
    count_caps_lock = 0
    number_of_words = len(words)
    
    for word in words:
        if word.isupper() == True:
            count_caps_lock = count_caps_lock + 1
            
    return ((count_caps_lock*100)//number_of_words)

In [51]:
df['%CAPS-LOCK words'] = df['text'].apply(caps_lock_words)

<h3> Normalizing Words

In [52]:
def normalize_text(text):
    words = text.split()
    result_words = []
    
    for word in words:
        if len(word) > 26:
            return "__LONG-LONG__"
        new_word = normalize_numbers(word)
        if new_word != word:
            word = new_word
        if word[0].isupper():
            word = word.capitalize()
        else:
            word = word.lower()
        result_words.append(word)
        
    return ' '.join(result_words)

In [53]:
df['text'] = df['text'].apply(normalize_text)
df1['text'] = df['text'].apply(normalize_text)

<h3> Feature extraction: number of ‘!’ inside the comment

In [54]:
def esclamations(text):
    return text.count('!')

In [55]:
df['esclamations'] = df['text'].apply(esclamations)

<h3> Feature extraction: number of ‘?’ inside the comment

In [56]:
def questions(text):
    return text.count('?')

In [57]:
df['questions'] = df['text'].apply(questions)

<h3> Cleaning Censured Bad Words

In [58]:
def clean_censured_bad_words(text):
    text = " " + text + " "
    text = re.sub(r' c[.x*@%#$^]+i ', ' coglioni ', text)
    text = re.sub(r' c[.x*@%#$^]+e ', ' coglione ', text)
    text = re.sub(r' c[.x*@%#$^]+o ', ' cazzo ', text) 
    text = re.sub(r' c[.x*@%#$^]+i ', ' cazzi ', text) 
    text = re.sub(r' m[.x*@%#$^]+a ', ' merda ', text) 
    text = re.sub(r' m[.x*@%#$^]+e ', ' merde ', text) 
    text = re.sub(r' c[.x*@%#$^]+ulo ', ' culo ', text) 
    text = re.sub(r' p[.x*@%#$^]+a ', ' puttana ', text)
    text = re.sub(r' p[.x*@%#$^]+e ', ' puttane ', text)
    text = re.sub(r' t[.x*@%#$^]+a ', ' troia ', text)
    text = re.sub(r' t[.x*@%#$^]+e ', ' troie ', text)
    text = re.sub(r' s[.x*@%#$^]+o ', ' stronzo ', text)
    text = re.sub(r' s[.x*@%#$^]+i ', ' stronzi ', text)
    return text

In [59]:
df['text'] = df['text'].apply(clean_censured_bad_words)
df1['text'] = df1['text'].apply(clean_censured_bad_words)

<h3> Removing #

In [60]:
def clean_hashtag_symbol(text):
    text = ' ' + text + ' '
    return re.sub(r'#', ' ', text)

In [61]:
df['text'] = df['text'].apply(clean_hashtag_symbol)

<h3> Removing laughs

In [62]:
laughs = ['ah', 'eh', 'he' 'ih', 'hi'] #non elimina ahahahah, ma solo ah
vowels = ['a', 'e', 'i', 'o', 'u']

def clean_laughs(text):
    #s = "ahahahah ho fame io, eh eh" -> " ho fame io,"
    text_words = text.split()
    new_words  = [word for word in text_words if word.lower() not in laughs]
    
    new_text = ' '.join(new_words)
    
    for i in new_words:
        j = i.lower()
        for k in vowels:
            if ('h' in j) and (len(j) >= 4):
                if (len(j) - 2) <= (j.count(k) + j.count('h')):
                    new_text = new_text.replace(i, '')
    
    return new_text

In [63]:
df['text'] = df['text'].apply(clean_laughs)
df1['text'] = df1['text'].apply(clean_laughs)

<h3> Removing nearby equal vowels

In [64]:
correct_words_vowels = ['coop', 'facebook', 'canaan', 'canaaniti', 'tweet', 'voodoo', 'book', 'isee', 'speech', 'woolfe',
                        'coffee', 'ffoo', 'refugees', 'google', 'shooting', 'hooligans', 'desiree', 'retweeted', 'microaree',
                        'keep']

In [65]:
vowels = ['a', 'e', 'i', 'o', 'u']

def clean_vowels(text):
    new_text = text
    words = text.split()
    
    for word in words:
        if word.lower() not in italian_dict and word.lower() not in correct_words_vowels:
        #if word.lower() not in correct_words_vowels:
            new_string = word[0]
            for i in range(1, len(word)):
                if word[i].lower() not in vowels:
                    new_string = new_string + word[i]
                else:
                    if(word[i].lower() != word[i-1].lower()):
                        new_string = new_string + word[i] 

            new_text = new_text.replace(word, new_string)
        
    return new_text

In [66]:
df['text'] = df['text'].apply(clean_vowels)
df1['text'] = df1['text'].apply(clean_vowels)

<h3> Removing nearby equal consonants if they are more than 2

In [67]:
consonants = ['b','c','d','f','g','h','k','l','m','n','p','q','r','s','t','v','x','y','z']

def clean_consonants(text):
    new_text = text
    words = text.split()
    
    for word in words:
        if len(word) > 2: #nn non viene cambiato
            new_string = word[0]
            for i in range(1, len(word)):
                if word[i].lower() not in consonants:
                    new_string = new_string + word[i]
                else:
                    if(word[i].lower() != word[i-1].lower()):
                        new_string = new_string + word[i]
                    elif i>=2 and (word[i].lower() != word[i-2].lower()):
                        new_string = new_string + word[i]

            new_text = new_text.replace(word, new_string)
        
    return new_text

In [68]:
df['text'] = df['text'].apply(clean_consonants)
df1['text'] = df1['text'].apply(clean_consonants)

<h3> Sticking the apostrophe (text)

In [69]:
def stick_apostrophe_text(text):
    text = re.sub(r" ’", "’", text)
    return re.sub(r" '", "'", text)

In [70]:
df['text'] = df['text'].apply(stick_apostrophe_text)
df1['text'] = df1['text'].apply(stick_apostrophe_text)

<h3> Feature extraction: lemma

In [71]:
def lemma(text):
    lemmas = []
    
    doc = nlp(text)
    
    for token in doc:
        lemmas.append(token.lemma_)
        
    return lemmas

In [72]:
df['lemma'] = df['text'].apply(lemma)

<h3> Feature extraction: PoS 

<p> Words can be grouped into classes called Part of Speech (PoS) or morphological classes. 
<p> Traditional grammar provides for a few types of PoS (noun, verb, adjective, preposition, adverb, conjunction, etc.). 
<p> La PoS di una parola fornisce informazione fondamentale per determinare il ruolo della parola stessa e di quelle vicine nella frase.
<p> To see what PoS tag means, we can use spacy.explain().

In [73]:
def pos(text):
    pos_list = []
    
    doc = nlp(text)
    
    for token in doc:
        pos_list.append(token.pos_)
        
    return pos_list

In [74]:
df['pos'] = df1['text'].apply(pos)

<h3> Feature extraction: Dep

To see what PoS tag means, we can use spacy.explain().

In [75]:
def dep(text):
    dep_list = []
    
    doc = nlp(text)
    
    for token in doc:
        dep_list.append(token.dep_)
        
    return dep_list

In [76]:
df['dep'] = df1['text'].apply(dep)

<h3> Feature extraction: Word Polarity

The Italian Lexicon of Sentiments was developed semi-automatically by ItalWordNet v.2 from a list of 1,000 manually checked keywords. It contains 24,293 lexical entries annotated with positive/negative/neutral polarity. It is distributed in LMF format.

<ul>
    <li> Tag 'LexicalEntry' with attribute 'id' from 'id_0' to 'id_25097'
    <li> Tag 'Lemma' with attribute 'writtenForm' containing the lemma of the word (example: 'di_cassetta')
    <li> Tag 'Sentiment' with attribute 'polarity' ('negative'/'neutral'/'positive')
</ul>

In [77]:
# Reading the data inside the xml 
# file to a variable under the name  
# data 
with open('it-sentiment_lexicon.lmf.xml', 'r') as f: 
    data = f.read() 

# Passing the stored data inside 
# the beautifulsoup parser, storing 
# the returned object  
Bs_data = BeautifulSoup(data, "xml") 

word_polarity = {}

lemma_unique = Bs_data.find_all('Lemma')         #Finding all instances of tag 'Lemma'
sentiment_unique = Bs_data.find_all('Sentiment') 

if len(lemma_unique) != len(sentiment_unique):
    print('ERRORE')

for i in range(0, len(lemma_unique)):
    word = lemma_unique[i].get('writtenForm') #Extracting the data stored in a specific attributes of the 'Lemma' tag
    word = re.sub(r'_', ' ', word)
    
    polarity = sentiment_unique[i].get('polarity')
    
    word_polarity[word] = polarity

In [78]:
def get_word_polarity(lemmas):
    
    polarity = []
    
    for word in lemmas:
        if word in word_polarity:
            polarity.append(word_polarity[word])
        else:
            polarity.append('neutral')
            
    return polarity

In [79]:
df['word_polarity'] = df['lemma'].apply(get_word_polarity)

<h3> Tokenization

In [80]:
def tokenization(text):
    tknzr=SocialTokenizer(lowercase=False)
    return tknzr.tokenize(text)

In [81]:
df['tokens'] = df['text'].apply(tokenization)

<h3> Sticking the apostrophe

In [82]:
pre_char = ['l', 'un', 'dell', 'all', 'dall', 'nell', 'sull', 'c', 'n']
apostrophes = ["'", "’"]

In [83]:
def stick_apostrophe(tokens):
    to_pop = []
    for i in range(0, len(tokens)-1):
        if tokens[i] in pre_char and tokens[i+1] in apostrophes:
            tokens[i] = tokens[i] + "'"
            to_pop.append(i+1)
    
    result_tokens = []
    for i in range(0, len(tokens)):
        if i not in to_pop:
            result_tokens.append(tokens[i])  
        
    return result_tokens

In [84]:
df['tokens'] = df['tokens'].apply(stick_apostrophe)

<h3> Replacement of the abbreviations with the respective words

In [85]:
abbr_word = {'cmq':'comunque', 'gov':'governatori', 'fb':'facebook', 'tw':'twitter', 'juve':'juventus', 'ing':'ingegnere', 
             'sx':'sinistra', 'qdo':'quando', 'rep':'repubblica', 'grz':'grazie', 'ita':'italia', 'mln':'milioni', 
             'mld':'miliardi', 'pke':'perche', 'anke':'anche', 'cm':'come', 'dlla':'della', 'dlle':'delle', 'qst':'questa',
             'ke':'che', 'nn':'non', 'sn':'sono', 'cn':'con', 'xk':'perche', 'xke':'perche', 'art':'articolo',
             'tv':'televisore', '€':'euro', 'xché':'perché', 'xké':'perché', 'pkè':'perché'} 

In [86]:
def replace_abbreviation(tokens):
    result = [] 
    
    for word in tokens:
        if word.lower() in abbr_word:
            result.append(abbr_word[word.lower()])
        else:
            result.append(word)
    
    return result

In [87]:
df['tokens'] = df['tokens'].apply(replace_abbreviation)

<h3> Replacing Acronyms

In [88]:
acronyms = {'unhcr':['alto', 'commissariato', 'nazioni', 'unite', 'rifugiati'], 
            'onu':['organizzazione', 'delle', 'nazioni', 'unite'],
            'fdi':['fratelli', 'italia'], 
            'msna':['minori', 'stranieri', 'accompagnati'], 
            'rdc':['reddito', 'di', 'cittadinanza'],
            'gus':['gruppo', 'umana', 'solidarieta'], 
            'sprar':['sistema', 'protezione', 'richiedenti', 'asilo'],
            'anpi':['associazione', 'nazionale', 'partigiani', 'italia'], 
            'anac':['autorita', 'nazionale', 'anticorruzione'],
            'lgbt':['lesbiche', 'gay', 'bisessuali', 'transgender'], 
            'ln':['lega', 'nord'], 
            'ue':['unione', 'europea'],
            'msf':['medici','senza','frontiere'], 
            'ispi':['istituto','studi','politica','internazionale'],
            'cpr':['centri','permanenza','rimpatri'], 
            'pd':['partito', 'democratico'], 
            'gc':['guardia', 'costiera'],
            'inps':['istituto','nazionale','previdenza','sociale'],
            'cdm':['consiglio', 'dei', 'ministri'], 
            'pdl':['popolo', 'della', 'liberta'], 
            'atac':['azienda', 'tramvie', 'autobus', 'comune', 'roma'],
            'tav':['treno', 'alta', 'velocita'], 
            'isee':['situazione', 'economica', 'equivalente'],
            'usa':['stati', 'uniti', 'd', 'america'], 
            'onlus':['organizzazione', 'lucrativa', 'utilita', 'sociale'],
            'acsim':['associazione', 'centro', 'servizi', 'immigrati', 'marche'], 
            'aids':['sindrome', 'immuno', 'deficienza', 'acquisita'], 
            'eu':['unione', 'europea'],
            'ong':['organizzazione', 'governativa'], 
            'nwo':['nuovo', 'ordine', 'mondiale'],
            'pil':['prodotto', 'interno', 'lordo'], 
            'cgil':['confederazione', 'generale', 'lavoro'],
            'cdt':['corriere', 'ticino'], 
            'ptv':['societa', 'televisiva', 'pakistan'],
            'syriza':['coalizione', 'sinistra', 'radicale'], 
            'fiom':['federazione', 'impiegati', 'operai', 'metallurgici'],
            'lgbtq':['lesbiche', 'gay', 'bisessuali', 'transgender', 'queer'], 
            'rpl':['radio', 'padania', 'libera'],
            'arci':['associazione', 'ricreativa', 'culturale', 'italiana'],
            'ofcs':['osservatorio', 'focus', 'cultura', 'sicurezza'],
            'm5s':['movimento', 'cinque', 'stelle'],
            'wm5s':['movimento', 'cinque', 'stelle'],
            'mef':['ministero', 'dell', 'economia', 'e', 'delle', 'finanze'],
            'cnel':['consiglio', 'nazionale', 'dell', 'economia', 'e', 'del', 'lavoro'],
            'fdian':['fratelli', 'di', 'italia', 'alleanza', 'nazionale'],
            'ecm':['educazione', 'continua', 'in', 'medicina'],
            'cie':['carta', 'di', 'identità', 'elettronica'],
            'tg':['telegiornale'],
            'rai':['radiotelevisione', 'italiana'],
            'anpal':['agenzia', 'nazionale', 'politiche', 'attive', 'lavoro'],
            'def':['documento', 'di', 'economia', 'e', 'finanza'],
            'cr':['consiglio', 'regionale'],
            'ama':['azienda', 'municipale', 'ambiente'],
            'cesedi':['centro', 'servizi', 'didattici'],
            'ffoo':['forze', 'dell', 'ordine'],
            'reyn':['rete', 'per', 'la', 'prima', 'infanzia', 'rom'],
            'rmc':['radio', 'monte', 'carlo'],
            'ddl':['disegno', 'di', 'legge']}

In [89]:
def replace_acronyms(tokens):
    for i in range(0, len(tokens)):
        word = tokens[i]
        if word.lower() in acronyms:
            tokens[i] = acronyms[word.lower()][0]
            if len(acronyms[word.lower()]) > 1:
                for j in range(1, len(acronyms[word.lower()])):
                    tokens.insert(i+j, acronyms[word.lower()][j])
    return tokens

In [90]:
df['tokens'] = df['tokens'].apply(replace_acronyms)

<h3> Replacing other emojis

In [91]:
symbols = {'✔':['segno', 'di', 'spunta'],
           '♻':['simbolo', 'del', 'riciclaggio'],
           '▶':['pulsante', 'di', 'riproduzione'],
           '🖊':['penna', 'a', 'sfera'],
           '❤':['cuore', 'rosso']}

In [92]:
def replace_others_emojis(tokens):
    for i in range(0, len(tokens)):
        word = tokens[i]
        if word in symbols:
            tokens[i] = symbols[word][0]
            if len(symbols[word]) > 1:
                for j in range(1, len(symbols[word])):
                    tokens.insert(i+j, symbols[word][j])
    return tokens

In [93]:
df['tokens'] = df['tokens'].apply(replace_others_emojis)

<h3> Feature extraction: percentage of Bad Words

In [94]:
def percentage_bad_words(tokens):
    n_words = 0
    n_bad_words = 0
    
    for word in tokens:
        if word != '<' and word != '>':
            n_words = n_words + 1
    
    for word in tokens:
        if word.lower() in bad_words_dict:
            n_bad_words = n_bad_words + 1
        
    return ((n_bad_words*100)//n_words)

In [95]:
df['%bad_words'] = df['tokens'].apply(percentage_bad_words)

<h3> Stemming

In [106]:
stemmer = SnowballStemmer('italian')

In [107]:
def stemming(tokens):
    result = []
        
    for word in tokens:
        if word != '<' and word != '>':
            stemmed_word = stemmer.stem(word)
            result.append(stemmed_word)
        else:
            result.append(word)
                
    return result

In [108]:
df['stem'] = df['tokens'].apply(stemming)

<h3> Saving the dataset

In [96]:
#Vedo la differenza tra le varie liste
count = 0

for i in range(0, len(df)):
    row = df.iloc[i]
    len_token = len(row['tokens'])
    len_lemma = len(row['lemma'])
    len_pos = len(row['pos'])
    len_polarity = len(row['word_polarity'])
    
    if (len_token != len_lemma) and (len_token != len_pos) and (len_token != len_polarity):
        count = count + 1
        
print('Rows: {}, differences: {}'.format(len(df), count))

Rows: 6837, differences: 4604


In [109]:
df

Unnamed: 0,id,text,hs,stereotype,text_length,hashtags,%CAPS-LOCK words,esclamations,questions,lemma,pos,dep,word_polarity,tokens,%bad_words,stem
0,2066,"È terrorismo anche questo , per mettere in uno stato di soggezione le persone e renderle innocue , mentre qualcuno . . .",0,0,118,0,4,0,0,"[essere, terrorismo, anche, questo, ,, per, mettere, in, uno, stato, di, soggezione, il, persona, e, rendere, lo, innocuo, ,, mentre, qualcuno, ., ., .]","[AUX, NOUN, ADV, PRON, PUNCT, ADP, VERB, ADP, DET, NOUN, ADP, NOUN, DET, NOUN, CCONJ, VERB, DET, NOUN, PUNCT, SCONJ, PRON, PUNCT, PUNCT, PUNCT]","[cop, ROOT, advmod, nsubj, punct, mark, advcl, case, det, obl, case, nmod, det, obj, cc, conj, det, obj, punct, mark, advcl, punct, punct, punct]","[neutral, negative, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral]","[È, terrorismo, anche, questo, ,, per, mettere, in, uno, stato, di, soggezione, le, persone, e, renderle, innocue, ,, mentre, qualcuno, ., ., .]",0,"[è, terror, anche, quest, ,, per, mett, in, uno, stat, di, soggezion, le, person, e, rend, innocu, ,, mentr, qualcun, ., ., .]"
1,2045,infatti finché ci hanno guadagnato con i campi < rom > tutto era ok con < Alemanno > < Ipocriti >,0,0,93,3,0,0,0,"[infatti, finché, ci, avere, guadagnare, con, il, campo, <, rom, >, tutto, essere, ok, con, <, Alemanno, >, <, ipocriti, >]","[ADV, SCONJ, PRON, AUX, VERB, ADP, DET, NOUN, SYM, NOUN, SYM, PRON, AUX, NOUN, ADP, SYM, PROPN, SYM, SYM, PROPN, SYM]","[advmod, mark, obj, aux, advcl, case, det, obl, nmod, nmod, nmod, nsubj, cop, nmod, case, nmod, flat:name, nsubj, flat:name, flat:name, ROOT]","[neutral, neutral, neutral, negative, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral]","[infatti, finché, ci, hanno, guadagnato, con, i, campi, <, rom, >, tutto, era, ok, con, <, Alemanno, >, <, Ipocriti, >]",0,"[infatt, finc, ci, hann, guadagn, con, i, camp, <, rom, >, tutt, era, ok, con, <, alemann, >, <, ipocr, >]"
2,61,"Corriere : Tangenti , Mafia Capitale dimenticata Mazzette su buche e campi rom < roma >",0,0,84,1,0,0,0,"[Corriere, :, Tangenti, ,, mafia, Capitale, dimenticare, mazzette, su, buca, e, campo, rom, <, roma, >]","[NOUN, PUNCT, PROPN, PUNCT, PROPN, PROPN, VERB, PROPN, ADP, NOUN, CCONJ, NOUN, NOUN, SYM, NOUN, SYM]","[nmod, punct, conj, punct, flat:name, flat:name, acl, flat:name, case, nmod, cc, conj, nmod, nmod, compound, ROOT]","[neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, neutral, negative, neutral, positive, neutral, neutral, neutral, neutral]","[Corriere, :, Tangenti, ,, Mafia, Capitale, dimenticata, Mazzette, su, buche, e, campi, rom, <, roma, >]",0,"[corr, :, tangent, ,, maf, capital, dimentic, mazzett, su, buch, e, camp, rom, <, rom, >]"
3,1259,"ad uno ad uno , perché quando i migranti israeliti arrivarono in terra di Canaan fecero fuori tutti i Canaaniti .",0,0,114,0,0,0,0,"[a, uno, di, uno, ,, perché, quando, il, migrante, israelire, arrivare, in, terra, di, Canaan, fare, fuori, tutto, il, Canaaniti, .]","[ADP, PRON, ADP, PRON, PUNCT, SCONJ, SCONJ, DET, NOUN, ADJ, VERB, ADP, NOUN, ADP, PROPN, VERB, ADV, DET, DET, PROPN, PUNCT]","[case, ROOT, case, nmod, punct, mark, mark, det, nsubj, amod, advcl, case, obl, case, nmod, advcl, advmod, det:predet, det, nsubj, punct]","[neutral, positive, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral]","[ad, uno, ad, uno, ,, perché, quando, i, migranti, israeliti, arrivarono, in, terra, di, Canaan, fecero, fuori, tutti, i, Canaaniti, .]",0,"[ad, uno, ad, uno, ,, perc, quand, i, migrant, israel, arriv, in, terr, di, canaan, fecer, fuor, tutt, i, canaan, .]"
4,949,Il divertimento del giorno ? Trovare i patrioti italiani che inneggiano contro i rom facendo la spesa alla < Lidl > ( multinazionale tedesca ) .,0,0,138,1,0,0,1,"[il, divertimento, di, il, giorno, ?, trovare, il, patriote, italiano, che, inneggiare, contro, il, rom, fare, il, spesa, a, il, <, Lidl, >, (, multinazionale, tedesco, ), .]","[DET, NOUN, ADP, DET, NOUN, PUNCT, VERB, DET, NOUN, ADJ, PRON, VERB, ADP, DET, NOUN, VERB, DET, NOUN, ADP, DET, SYM, PROPN, SYM, PUNCT, ADJ, ADJ, PUNCT, PUNCT]","[det, ROOT, case, det, nmod, punct, ROOT, det, obj, amod, nsubj, acl:relcl, case, det, obl, advcl, det, obj, case, det, obl, flat:name, flat:name, punct, amod, amod, punct, punct]","[neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral]","[Il, divertimento, del, giorno, ?, Trovare, i, patrioti, italiani, che, inneggiano, contro, i, rom, facendo, la, spesa, alla, <, Lidl, >, (, multinazionale, tedesca, ), .]",0,"[il, divert, del, giorn, ?, trov, i, patriot, italian, che, innegg, contr, i, rom, fac, la, spes, alla, <, lidl, >, (, multinazional, tedesc, ), .]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832,9340,"Gli stati nazionali devono essere pronti a rinunciare alla propria sovranità . Lo ha detto la Merkel , che ha aggiunto che gli stati nazionali non devono ascoltare la volontà dei loro cittadini quando si tratta di questioni che riguardano immigrazione , confini , o persino sovranità",0,0,283,0,0,0,0,"[il, stato, nazionale, dovere, essere, pronto, a, rinunciare, a, il, proprio, sovranità, ., lo, avere, dire, il, Merkel, ,, che, avere, aggiungere, che, il, stato, nazionale, non, dovere, ascoltare, il, volontà, di, il, loro, cittadino, quando, si, trattare, di, questione, che, riguardare, immigrazione, ,, confine, ,, o, persino, sovranità]","[DET, NOUN, ADJ, AUX, AUX, ADJ, ADP, VERB, ADP, DET, DET, NOUN, PUNCT, PRON, AUX, VERB, DET, PROPN, PUNCT, PRON, AUX, VERB, SCONJ, DET, NOUN, ADJ, ADV, AUX, VERB, DET, NOUN, ADP, DET, DET, NOUN, SCONJ, PRON, VERB, ADP, NOUN, PRON, VERB, NOUN, PUNCT, NOUN, PUNCT, CCONJ, ADV, NOUN]","[det, nsubj, amod, aux, cop, ROOT, mark, advcl, case, det, det:poss, obl, punct, obj, aux, ROOT, det, nsubj, punct, nsubj, aux, acl:relcl, mark, det, nsubj, amod, advmod, aux, ccomp, det, obj, case, det, det:poss, nmod, mark, expl, advcl, case, obl, nsubj, acl:relcl, obj, punct, conj, punct, cc, advmod, conj]","[neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, negative, negative, neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, neutral, neutral, None, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, neutral]","[Gli, stati, nazionali, devono, essere, pronti, a, rinunciare, alla, propria, sovranità, ., Lo, ha, detto, la, Merkel, ,, che, ha, aggiunto, che, gli, stati, nazionali, non, devono, ascoltare, la, volontà, dei, loro, cittadini, quando, si, tratta, di, questioni, che, riguardano, immigrazione, ,, confini, ,, o, persino, sovranità]",0,"[gli, stat, nazional, dev, esser, pront, a, rinunc, alla, propr, sovran, ., lo, ha, dett, la, merkel, ,, che, ha, aggiunt, che, gli, stat, nazional, non, dev, ascolt, la, volont, dei, lor, cittadin, quand, si, tratt, di, question, che, riguard, immigr, ,, confin, ,, o, persin, sovran]"
6833,9121,"Il ministro dell' interno della Germania < Horst Sehofer > , sta facendo la proposta di dare soldi agli immigrati che vogliono tornare a casa e aiutarli a creare un' attività a casa loro e fare business con la Germania . Chi paga ? Una parte i crucchi e il resto l' Europa , cioè io e voi !",0,0,277,1,0,1,1,"[il, ministro, di, il, interno, di, il, Germania, <, Horst, Sehofer, >, ,, stare, fare, il, proposta, di, dare, soldo, a, il, immigrato, che, volere, tornare, a, casa, e, aiutare, li, a, creare, uno, attività, a, casa, loro, e, fare, business, con, il, Germania, ., chi, pagare, ?, uno, parte, il, crucco, e, il, resto, il, Europa, ,, cioè, io, e, voi, !]","[DET, NOUN, ADP, DET, NOUN, ADP, DET, PROPN, SYM, PROPN, PROPN, SYM, PUNCT, AUX, VERB, DET, NOUN, ADP, VERB, NOUN, ADP, DET, NOUN, PRON, AUX, VERB, ADP, NOUN, CCONJ, VERB, PRON, ADP, VERB, DET, NOUN, ADP, NOUN, PRON, CCONJ, VERB, NOUN, ADP, DET, PROPN, PUNCT, PRON, VERB, PUNCT, DET, NOUN, DET, NOUN, CCONJ, DET, NOUN, DET, PROPN, PUNCT, CCONJ, PRON, CCONJ, PRON, PUNCT]","[det, nsubj, case, det, nmod, case, det, nmod, nmod, flat:name, flat:name, flat:name, punct, aux, ROOT, det, obj, mark, acl, obj, case, det, obl, nsubj, aux, acl:relcl, case, obl, cc, conj, obj, mark, xcomp, det, obj, case, obl, nmod, cc, conj, obj, case, det, nmod, punct, nsubj, ROOT, punct, det, ROOT, det, nmod, cc, det, conj, det, nmod, punct, cc, conj, cc, conj, punct]","[neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, positive, neutral, neutral, neutral, positive, neutral, neutral, positive, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral]","[Il, ministro, dell', interno, della, Germania, <, Horst, Sehofer, >, ,, sta, facendo, la, proposta, di, dare, soldi, agli, immigrati, che, vogliono, tornare, a, casa, e, aiutarli, a, creare, un', attività, a, casa, loro, e, fare, business, con, la, Germania, ., Chi, paga, ?, Una, parte, i, crucchi, e, il, resto, l', Europa, ,, cioè, io, e, voi, !]",0,"[il, ministr, dell', intern, dell, german, <, horst, sehofer, >, ,, sta, fac, la, propost, di, dar, sold, agli, immigr, che, vogl, torn, a, cas, e, aiut, a, cre, un', attiv, a, cas, lor, e, far, business, con, la, german, ., chi, pag, ?, una, part, i, crucc, e, il, rest, l', europ, ,, cio, io, e, voi, !]"
6834,8549,"< Salvini > : In Italia troppi si sono montati la testa , io ringrazio Dio e voi per questi mesi straordinari . Vi raccontavano che su immigrazione non si poteva fare nulla , è bastato usare buonsenso e coraggio . < io ci sono > < piazza del popolo >",0,0,233,3,0,0,0,"[<, Salvini, >, :, in, Italia, troppo, si, essere, montare, il, testa, ,, io, ringrazio, Dio, e, voi, per, questo, mese, straordinario, ., vi, raccontare, che, su, immigrazione, non, si, potere, fare, nulla, ,, essere, bastare, usare, buonsenso, e, coraggio, ., <, io, ci, essere, >, <, piazza, di, il, popolo, >]","[SYM, PROPN, SYM, PUNCT, ADP, PROPN, DET, PRON, AUX, VERB, DET, NOUN, PUNCT, PRON, VERB, NOUN, CCONJ, PRON, ADP, DET, NOUN, ADJ, PUNCT, PRON, VERB, SCONJ, ADP, NOUN, ADV, PRON, AUX, VERB, PRON, PUNCT, AUX, VERB, VERB, NOUN, CCONJ, NOUN, PUNCT, SYM, PRON, PRON, VERB, SYM, SYM, NOUN, ADP, DET, NOUN, SYM]","[ROOT, flat:name, flat:name, punct, case, obl, nsubj, expl, aux, ROOT, det, obj, punct, nsubj, advcl, obj, cc, conj, case, det, obl, amod, punct, iobj, ROOT, mark, case, obl, advmod, expl:impers, aux, ccomp, obj, punct, aux, conj, xcomp, obj, cc, conj, punct, obl, nsubj, expl, ROOT, nsubj, flat:name, compound, case, det, nmod, flat:name]","[neutral, neutral, neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, neutral, positive, neutral, positive, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, negative, neutral, neutral, neutral, None, neutral, neutral, neutral, negative, neutral, neutral, positive, neutral, positive, neutral, positive, neutral, neutral, positive, neutral, neutral, neutral, neutral, positive, neutral, neutral, neutral, neutral]","[<, Salvini, >, :, In, Italia, troppi, si, sono, montati, la, testa, ,, io, ringrazio, Dio, e, voi, per, questi, mesi, straordinari, ., Vi, raccontavano, che, su, immigrazione, non, si, poteva, fare, nulla, ,, è, bastato, usare, buonsenso, e, coraggio, ., <, io, ci, sono, >, <, piazza, del, popolo, >]",0,"[<, salvin, >, :, in, ital, tropp, si, son, mont, la, test, ,, io, ringraz, dio, e, voi, per, quest, mes, straordinar, ., vi, raccont, che, su, immigr, non, si, pot, far, null, ,, è, bast, usar, buonsens, e, coragg, ., <, io, ci, son, >, <, piazz, del, popol, >]"
6835,9240,Chi giubila in buona fede non ha capito niente . Purtroppo credo che i più non siano in buona fede . I migranti sono un grosso business e chi finora li ha voluti non vuole perdere questo guadagno,0,0,198,0,2,0,0,"[chi, giubilare, in, buono, fede, non, avere, capire, niente, ., purtroppo, credere, che, il, più, non, essere, in, buono, fede, ., il, migrante, essere, uno, grosso, business, e, chi, finora, li, avere, volere, non, volere, perdere, questo, guadagno]","[PRON, VERB, ADP, ADJ, NOUN, ADV, AUX, VERB, PRON, PUNCT, ADV, VERB, SCONJ, DET, ADV, ADV, AUX, ADP, ADJ, NOUN, PUNCT, DET, NOUN, AUX, DET, ADJ, NOUN, CCONJ, PRON, ADV, PRON, AUX, VERB, ADV, AUX, VERB, DET, NOUN]","[nsubj, advcl, case, amod, obl, advmod, aux, ROOT, obj, punct, advmod, ROOT, mark, det, advmod, advmod, cop, case, amod, ccomp, punct, det, nsubj, cop, det, amod, ROOT, cc, nsubj, advmod, obj, aux, acl:relcl, advmod, aux, conj, det, obj]","[neutral, positive, neutral, positive, positive, None, negative, positive, negative, neutral, neutral, positive, neutral, neutral, neutral, None, neutral, neutral, positive, positive, neutral, neutral, neutral, neutral, positive, neutral, positive, neutral, neutral, neutral, neutral, negative, neutral, None, neutral, negative, neutral, positive]","[Chi, giubila, in, buona, fede, non, ha, capito, niente, ., Purtroppo, credo, che, i, più, non, siano, in, buona, fede, ., I, migranti, sono, un, grosso, business, e, chi, finora, li, ha, voluti, non, vuole, perdere, questo, guadagno]",0,"[chi, giubil, in, buon, fed, non, ha, cap, nient, ., purtropp, cred, che, i, più, non, sian, in, buon, fed, ., i, migrant, son, un, gross, business, e, chi, finor, li, ha, vol, non, vuol, perd, quest, guadagn]"


In [110]:
df.to_csv('new_modified_df.csv', index=False)