In [1]:
# nltk.download('stopwords')
# nltk.download('wordnet')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import nltk
import it_core_news_sm
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from wordcloud import WordCloud


# Columns used to classify comments

**Sentiment comment variables:**
 + c_rating3
 + c_rating

**Comment related** <br>
c_text : contains the comments text <br>
c_rating : evaluation of the comment (positive, problematic, negative...) <br>
c_ratingCivile : is it a respectful comment (respectful/disrespectful) <br>
c_ratingPosNeg : positive or negative attitude wrt post <br>
c_category : topic of the comments (muslim refugees, muslims...) <br>

In [2]:
database = pd.read_csv("database/id_lemmas.csv",index_col = 0 , sep=',', engine='python')
database['text_nlp'] = database.apply(lambda row: word_tokenize(row['text_nlp']), axis = 1)
database

Unnamed: 0,text_nlp
0,"[diro, gente, guerra, portato, casa, rimpatriare]"
1,"[marcello, perfavore]"
2,"[patrio, patriota, difeso, proprio, radice, so..."
3,"[musulmano, comandare, casa]"
4,"[odiare, dipendere, odiare, comandare, librare]"
...,...
78170,"[vedere, ruspa, inviato, salvini, ripulire, am..."
78171,"[fino, sentire, sempre, richiamare, africo, pa..."
78172,"[sparare, ladro, occhio, penna, barbetta]"
78173,"[consigliatissima, africo]"


In [3]:
attributes_to_keep = ['c_rating3', 'c_rating']

In [4]:
database_comments = pd.read_csv("database/com_liwc.csv", sep='\t', engine='python')
database_comments.head(2)

Unnamed: 0,Origin_file_order,Site,p_id,dateCreated,p_politician,p_gender,p_GRUPPO_PE,p_LISTA,p_PARTITO,p_governo,...,p_Comma,p_Colon,p_SemiC,p_Qmark,p_Exclam,p_Dash,p_Quote,p_Apostro,p_Parenth,p_OtherP
0,30126,FB,96844400700_10157493758850701,2019-04-23T10:33:37Z,MARCELLO GEMMATO,M,PPE,FDI,FDI,opposizione,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30226,FB,96844400700_10157493758850701,2019-04-23T10:33:37Z,MARCELLO GEMMATO,M,PPE,FDI,FDI,opposizione,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
database_comments_attr = database_comments[attributes_to_keep]
#free memory
del database_comments
database_comments_attr.shape

(78175, 2)

Keep the comments that have the same index, since some of them might have been canceled during the text cleaning. 


In [6]:
database_comments_attr = database_comments_attr.iloc[database.index,:]
print("Shape of the database:", database_comments_attr.shape)
database_comments_attr.head()

Shape of the database: (75775, 2)


Unnamed: 0,c_rating3,c_rating
0,probl-hate,problematico
1,probl-hate,problematico
2,probl-hate,problematico
3,probl-hate,problematico
4,probl-hate,problematico


## C_rating3
Using only c_rating3 as column of attributes to be kept

In [7]:
list_columns = list(database_comments_attr.c_rating3.unique())
list_columns.insert(0, 'word') 

df_word_c_rating3 = pd.DataFrame(columns = list_columns)
df_word_c_rating3

Unnamed: 0,word,probl-hate,positivo,negativo


In [8]:
database_attr = database.join(database_comments_attr)
database_attr.head()

Unnamed: 0,text_nlp,c_rating3,c_rating
0,"[diro, gente, guerra, portato, casa, rimpatriare]",probl-hate,problematico
1,"[marcello, perfavore]",probl-hate,problematico
2,"[patrio, patriota, difeso, proprio, radice, so...",probl-hate,problematico
3,"[musulmano, comandare, casa]",probl-hate,problematico
4,"[odiare, dipendere, odiare, comandare, librare]",probl-hate,problematico


In [9]:
unique_words = {}
words_hate   = {}
words_neg    = {}
words_pos    = {}

for index, row in database_attr.iterrows():
    if(len(row['text_nlp'])>0):
        for single_word in row['text_nlp']:
            unique_words.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
            unique_words[single_word] += 1
            
            if row['c_rating3'] == 'probl-hate':
                words_hate.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_hate[single_word] += 1
            
            elif row['c_rating3'] == 'positivo':
                words_pos.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_pos[single_word] += 1
            
            elif row['c_rating3'] == 'negativo':
                words_neg.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_neg[single_word] += 1

For every word that is present in unique words dictionary, take the number of occurrences of that word in a given type of comment.

In [10]:
for word in unique_words.keys():
    row_dict = {'word' : word,
                'probl-hate': words_hate.get(word),
                'positivo'  : words_neg.get(word),
                'negativo'  : words_pos.get(word)}
    df_word_c_rating3 = df_word_c_rating3.append(row_dict, ignore_index=True )
    
df_word_c_rating3 = df_word_c_rating3.fillna(0)
df_word_c_rating3

Unnamed: 0,word,probl-hate,positivo,negativo
0,diro,125,748,734
1,gente,172,580,572
2,guerra,94,218,257
3,portato,18,59,41
4,casa,330,714,508
...,...,...,...,...
39357,rip,0,0,1
39358,aspettandoprometeo,0,0,1
39359,ivreich,0,0,1
39360,satanasso,0,0,1


In [11]:
df_word_c_rating3.to_csv('words_with_ratings3.csv', index = False)

## C_rating
Using only c_rating3 as column of attributes to be kept

In [12]:
list_columns = list(database_comments_attr.c_rating.unique())
list_columns.insert(0, 'word') 

df_word_c_rating = pd.DataFrame(columns = list_columns)
df_word_c_rating

Unnamed: 0,word,problematico,positivo,negativo,hate,ambiguo


In [13]:
database_attr = database.join(database_comments_attr)
database_attr.head()

Unnamed: 0,text_nlp,c_rating3,c_rating
0,"[diro, gente, guerra, portato, casa, rimpatriare]",probl-hate,problematico
1,"[marcello, perfavore]",probl-hate,problematico
2,"[patrio, patriota, difeso, proprio, radice, so...",probl-hate,problematico
3,"[musulmano, comandare, casa]",probl-hate,problematico
4,"[odiare, dipendere, odiare, comandare, librare]",probl-hate,problematico


In [14]:
unique_words = {}
words_hate   = {}
words_ambig  = {}
words_prob   = {}
words_neg    = {}
words_pos    = {}

for index, row in database_attr.iterrows():
    if(len(row['text_nlp'])>0):
        
        for single_word in row['text_nlp']:
            unique_words.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
            unique_words[single_word] += 1
            
            if row['c_rating'] == 'hate':
                words_hate.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_hate[single_word] += 1
            
            elif row['c_rating'] == 'positivo':
                words_pos.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_pos[single_word] += 1
            
            elif row['c_rating'] == 'negativo':
                words_neg.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_neg[single_word] += 1
            
            elif row['c_rating'] == 'ambiguo':
                words_ambig.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_ambig[single_word] += 1
            
            elif row['c_rating'] == 'problematico':
                words_prob.setdefault(single_word, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
                words_prob[single_word] += 1

In [15]:
for word in unique_words.keys():
    row_dict = {'word' : word,
                'problematico': words_prob.get(word),
                'positivo'    : words_hate.get(word),
                'negativo'    : words_pos.get(word),
                'hate'        : words_neg.get(word),
                'ambiguo'     : words_ambig.get(word)}
    df_word_c_rating = df_word_c_rating.append(row_dict, ignore_index=True )
    
df_word_c_rating = df_word_c_rating.fillna(0)
df_word_c_rating

Unnamed: 0,word,problematico,positivo,negativo,hate,ambiguo
0,diro,123,2,722,748,12
1,gente,156,16,569,580,3
2,guerra,92,2,248,218,9
3,portato,15,3,40,59,1
4,casa,300,30,498,714,10
...,...,...,...,...,...,...
39357,rip,0,0,0,0,1
39358,aspettandoprometeo,0,0,0,0,1
39359,ivreich,0,0,0,0,1
39360,satanasso,0,0,0,0,1


In [16]:
df_word_c_rating.to_csv('words_with_ratings5.csv', index = False)

## Politician name