-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_functions_3_6.py
345 lines (300 loc) · 18.3 KB
/
sentiment_functions_3_6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#%%
import cgcn_functions_3_6 as cgcn_functions
import numpy as np
from numpy.lib.shape_base import column_stack
import pandas as pd
from senticnet.senticnet import SenticNet
from senticnet.babelsenticnet import BabelSenticNet
import time
#%%############################ SenticNet
############################### Input: single lemma, ex: love
def get_SenticNet_concept(lempos, language):
    """Look up the SenticNet entry for a single lempos.

    Args:
        lempos: Lemma followed by a two-character POS suffix, e.g. ``'love-n'``.
            The last two characters are stripped before the lookup.
        language: Language code. ``'en'`` queries ``SenticNet``; every other
            code is handed to ``BabelSenticNet``. ``'deu'`` is mapped to
            ``'de'`` first.

    Returns:
        A ``(lempos, lemma_info)`` tuple, where ``lemma_info`` is whatever the
        ``concept()`` call returned.

    Raises:
        Exception: Whatever the SenticNet / BabelSenticNet lookup raises when
            the language or the concept is unavailable (presumably ``KeyError``
            for an unknown concept -- confirm against the senticnet package).
            The lookup error used to be swallowed, after which the ``return``
            failed with an unrelated ``UnboundLocalError``; the real cause is
            now propagated instead.
    """
    if language == 'deu':
        language = 'de'
    # NOTE(review): unlike get_SenticNet_concept_df, the lemma is not
    # lower-cased here -- confirm this difference is intended.
    lemma = lempos[0:-2]
    if language == 'en':
        lemma_info = SenticNet().concept(lemma)
    else:
        lemma_info = BabelSenticNet(language).concept(lemma)
    return (lempos, lemma_info)
#%%
# ''' output - dictionary:
# {'polarity_label': 'negative',
# 'polarity_value': '-0.88',
# 'sentics': {'introspection': '0', 'temper': '0', 'attitude': '-0.89', 'sensitivity': '-0.86'},
# 'moodtags': ['#disgust', '#fear'],
# 'semantics': ['circuit', 'dismissal', 'needle', 'superfluity', 'redundancy']}
# '''
#%%
def get_SenticNet_concept_df(lempos, language):
    """Return the SenticNet entry for one lempos as a one-row DataFrame.

    The lookup mirrors ``get_SenticNet_concept`` except that the lempos is
    lower-cased before its two-character POS suffix is stripped.

    Args:
        lempos: Lemma followed by a two-character POS suffix, e.g. ``'blood-n'``.
        language: Language code. ``'en'`` queries ``SenticNet``; every other
            code is handed to ``BabelSenticNet``. ``'deu'`` is mapped to
            ``'de'`` first.

    Returns:
        A one-row DataFrame. On success it has one column per key of the
        concept dictionary, with the nested ``'sentics'`` dictionary expanded
        into one column per sentic, plus a ``'label'`` column holding
        ``lempos``. When the lookup or the conversion fails, the frame holds
        only the ``'label'`` column, so that the concept still shows up (with
        missing values) once frames are concatenated.
    """
    if language == 'deu':
        language = 'de'
    try:
        source = SenticNet() if language == 'en' else BabelSenticNet(language)
        lemma_info = source.concept(lempos.lower()[0:-2])
        # Values are a mix of scalars, lists and a dict; orient='index' plus
        # transpose keeps each of them in a single cell of a one-row frame.
        # https://stackoverflow.com/questions/40442014/python-pandas-valueerror-arrays-must-be-all-same-length
        df = pd.DataFrame.from_dict(lemma_info, orient='index').transpose()
        # Spread the nested sentics dictionary over its own columns.
        # https://stackoverflow.com/questions/39640936/parsing-a-dictionary-in-a-pandas-dataframe-cell-into-new-row-cells-new-columns
        sentics = pd.DataFrame(df['sentics'].values.tolist())
        df = df.drop(columns='sentics').assign(**sentics)
        df['label'] = lempos
    except Exception:
        # Deliberate best-effort: no entry (or an unexpected entry shape)
        # yields a label-only row. The previous fallback concatenated two
        # empty objects, which produced zero rows and lost the label.
        df = pd.DataFrame({'label': [lempos]})
    return df
# get_SenticNet_concept_df('blood-n', 'en')
# get_SenticNet_concept_df('ženstvenost-n', 'hr')
#%%################################### Input: List of lemmas
def get_SenticNet_c_list(concept_list, language):
    """Collect SenticNet entries for every lempos that has one.

    Args:
        concept_list: Iterable of lempos strings, e.g. ``['krv-n', 'znoj-n']``.
        language: Language code forwarded to ``get_SenticNet_concept``.

    Returns:
        A list with the ``get_SenticNet_concept`` result of every item whose
        lookup succeeded, in input order. Items whose lookup raised are left
        out.
    """
    concept_values = []
    for item in concept_list:
        try:
            concept_values.append(get_SenticNet_concept(item, language))
        except Exception:
            # Best-effort: skip concepts that cannot be looked up. Only
            # Exception is caught so that Ctrl-C / SystemExit still stop
            # a long-running loop.
            continue
    return concept_values
# get_SenticNet_c_list(['krv-n', 'znoj-n', 'rigatoni-n'], 'hr')
#%%
def get_SenticNet_c_df(concept_list, language):
    """Build one DataFrame with the SenticNet values of several lempos.

    Args:
        concept_list: Iterable of lempos strings, e.g. ``['blood-n', 'flesh-n']``.
        language: Language code forwarded to ``get_SenticNet_concept_df``.

    Returns:
        The row-wise concatenation of the ``get_SenticNet_concept_df`` frames,
        in input order (row indexes are not renumbered). An empty DataFrame
        when ``concept_list`` is empty or every item failed.
    """
    frames = []
    for item in concept_list:
        try:
            frames.append(get_SenticNet_concept_df(item, language))
        except Exception:
            # Best-effort: an item that cannot be converted is skipped.
            # Only Exception is caught so Ctrl-C / SystemExit still work.
            continue
    if not frames:
        # pd.concat refuses an empty list, so keep the empty-frame result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the result per item.
    return pd.concat(frames)
# d=get_SenticNet_c_df(['krv-n', 'meso-n', 'urin-n'], 'hr')
# d=get_SenticNet_c_df(['blood-n', 'flesh-n', 'urea-n'], 'en')
# for k in d.keys():
# if not k in ['label', 'moodtags', 'semantics', 'polarity_label']:
# print(k)
#%% senticnet srednjica
def get_sentic_srednjica_df(SenticNet_c_df, node_importance_values_list, node_importance_measure):
    """Summarise every value column of a SenticNet frame for one importance measure.

    Rows containing any missing value are dropped first. Each remaining
    column, except the non-numeric ones ('label', 'moodtags', 'semantics',
    'polarity_label'), is converted to floats and passed together with the
    node-importance values to ``cgcn_functions.srednjica`` (defined outside
    this file; presumably a mean weighted by node importance -- confirm).

    Args:
        SenticNet_c_df: DataFrame of SenticNet values, one row per lemma.
        node_importance_values_list: Node-importance values; each is
            converted to ``float``.
        node_importance_measure: Name of the node-importance measure; used as
            the index label of the result.

    Returns:
        A one-row DataFrame indexed by ``node_importance_measure`` with one
        column per summarised value column.
    """
    non_numeric = ('label', 'moodtags', 'semantics', 'polarity_label')
    complete_rows = SenticNet_c_df.dropna()
    result = pd.DataFrame(index=[node_importance_measure])
    for column in complete_rows.keys():
        if column in non_numeric:
            continue
        values = [float(value) for value in complete_rows[column].tolist()]
        weights = [float(weight) for weight in node_importance_values_list]
        result[column] = cgcn_functions.srednjica(values, weights)
    return result
def get_sentic_srednjica_comm(SenticNet_c_df, node_importance_values_list, node_importance_measure):
    """Summarise every value column of a SenticNet frame for several importance measures.

    For each measure, every column except the non-numeric ones ('label',
    'moodtags', 'semantics', 'polarity_label') is cast to float and passed,
    together with that measure's node-importance values, to
    ``cgcn_functions.srednjica`` (defined outside this file; presumably a
    mean weighted by node importance -- confirm).

    Args:
        SenticNet_c_df: DataFrame of SenticNet values, one row per lemma.
        node_importance_values_list: Despite the name, a mapping/DataFrame
            indexed by measure name; each entry must support ``astype(float)``.
        node_importance_measure: Iterable of measure names to evaluate.

    Returns:
        A DataFrame with one row per measure (indexed by measure name) and
        one column per summarised value column. Empty when no measure is given.
    """
    skipped = ('label', 'moodtags', 'semantics', 'polarity_label')
    per_measure = pd.DataFrame()
    for measure in node_importance_measure:
        averages = {}
        value_columns = [column for column in SenticNet_c_df.keys() if column not in skipped]
        for column in value_columns:
            column_values = SenticNet_c_df[column].astype(float).tolist()
            weights = [float(w) for w in node_importance_values_list[measure].astype(float).tolist()]
            averages[column] = cgcn_functions.srednjica(column_values, weights)
        # {measure: {column: value}} transposed gives a single row named after the measure.
        measure_row = pd.DataFrame.from_dict({measure: averages}).T
        per_measure = pd.concat([per_measure, measure_row])
    return per_measure
# def get_sentic_srednjica_df(SenticNet_c_df, node_importance_values_list, node_importance_measure):
# # take list of concepts, values, and its node importance and return the middle value
# # SenticNet_c_df je df sa lemama za koju se traži vrijednost,
# # node importance_values_list je lista vrijednosti važnosti čvora,
# # node_importanc_measure je naziv mjere značaja čvora u grafu
# SenticNet_c_df=SenticNet_c_df.dropna()
# sentic_df =pd.DataFrame(index=[node_importance_measure])
# sentic_df['polarity_value']=cgcn_functions.srednjica([float(x) for x in SenticNet_c_df['polarity_value']], node_importance_values_list )
# sentic_df['introspection']=cgcn_functions.srednjica([float(x) for x in SenticNet_c_df['introspection']], node_importance_values_list )
# sentic_df['temper']=cgcn_functions.srednjica([float(x) for x in SenticNet_c_df['temper']], node_importance_values_list )
# sentic_df['attitude']=cgcn_functions.srednjica([float(x) for x in SenticNet_c_df['attitude']], node_importance_values_list )
# sentic_df['sensitivity']=cgcn_functions.srednjica([float(x) for x in SenticNet_c_df['sensitivity']], node_importance_values_list )
# return sentic_df
def get_graph_sentic_values(SenticNet_c_df, graph_df):
    """Summarise SenticNet values over a graph for a fixed set of importance measures.

    The measures are 'sli', 'pagerank', 'degree', 'weighted_degree' and
    'betweenness'. For each one, the importance values of the graph nodes
    whose label has a non-missing 'polarity_value' in ``SenticNet_c_df`` are
    passed to ``get_sentic_srednjica_df``.

    Args:
        SenticNet_c_df: DataFrame of SenticNet values with at least the
            'label' and 'polarity_value' columns.
        graph_df: DataFrame of graph nodes with a 'label' column and one
            column per importance measure.

    Returns:
        A DataFrame with one row per successfully processed measure. The
        computation is best-effort: on the first error the rows gathered so
        far are returned (possibly an empty frame).
    """
    node_importance_measures = ['sli', 'pagerank', 'degree', 'weighted_degree', 'betweenness']
    graph_sentic_values = pd.DataFrame()
    try:
        # The set of labels that carry a value does not depend on the
        # measure, so select the matching graph nodes once.
        has_value = SenticNet_c_df['polarity_value'].notna()
        valued_labels = [str(label) for label in SenticNet_c_df.loc[has_value, 'label']]
        valued_nodes = graph_df[graph_df['label'].isin(valued_labels)]
        for node_importance_measure in node_importance_measures:
            node_importance_values_list = valued_nodes[node_importance_measure].astype('float').tolist()
            measure_row = get_sentic_srednjica_df(SenticNet_c_df, node_importance_values_list, node_importance_measure)
            graph_sentic_values = pd.concat([graph_sentic_values, measure_row])
    except Exception:
        # Deliberate best-effort; only Exception is caught so that
        # Ctrl-C / SystemExit are no longer swallowed.
        pass
    return graph_sentic_values
#%%
def make_sentic_df(sentic_list, node_importance_values_list, node_importance_measure, language):
    """Summarise a list of SenticNet entries for one importance measure.

    Every value is computed with ``cgcn_functions.srednjica`` (defined
    outside this file; presumably a mean weighted by node importance --
    confirm).

    Args:
        sentic_list: List of ``(lempos, lemma_info)`` tuples as produced by
            ``get_SenticNet_c_list``; ``lemma_info`` must provide
            'polarity_value' and a 'sentics' dictionary.
        node_importance_values_list: Node-importance values, passed through
            unchanged.
        node_importance_measure: Name of the node-importance measure; used as
            the index label of the result.
        language: ``'en'`` selects the English sentic names; any other value
            selects the non-English names.

    Returns:
        A one-row DataFrame indexed by ``node_importance_measure`` with a
        'polarity_value' column followed by one column per sentic. The
        computation is best-effort: if a column cannot be computed, the
        columns filled so far are returned.
    """
    if language == 'en':
        sentic_keys = ['introspection', 'temper', 'attitude', 'sensitivity']
    else:
        # Each column is read from the sentic of the same name. The
        # 'pleasantness' and 'attention' columns used to be read from the
        # English-only 'introspection' / 'temper' keys, which failed and left
        # only 'polarity_value' in the result.
        # NOTE(review): key names follow the non-English column list used
        # elsewhere in this file -- confirm against BabelSenticNet output.
        sentic_keys = ['pleasantness', 'attention', 'aptitude', 'sensitivity']
    sentic_df = pd.DataFrame(index=[node_importance_measure])
    try:
        sentic_df['polarity_value'] = cgcn_functions.srednjica(
            [float(x[1]['polarity_value']) for x in sentic_list], node_importance_values_list)
        for key in sentic_keys:
            sentic_df[key] = cgcn_functions.srednjica(
                [float(x[1]['sentics'][key]) for x in sentic_list], node_importance_values_list)
    except Exception:
        # Deliberate best-effort; only Exception is caught so that
        # Ctrl-C / SystemExit are no longer swallowed.
        pass
    return sentic_df
#%%
def calculate_sentic_values(listOfLemmas, pos, language, measure, corpus, corpusID, gramRel, limit_snp_f, limit_snp_fof, harvestSelect):
    """Derive SenticNet values for lemmas from their friend-of-friend graphs.

    For every lemma the lempos ``lemma + pos`` is built; lempos containing an
    underscore are skipped. For the others, the friends and
    friends-of-friends data are fetched through ``cgcn_functions``, turned
    into a graph, the SenticNet values of the graph's node labels are looked
    up, and the result is summarised with ``get_graph_sentic_values``.

    Args:
        listOfLemmas: Iterable of lemmas without POS suffix.
        pos: Two-character POS suffix appended to each lemma ('-n', '-v', '-j').
        language: Language code forwarded to the corpus and SenticNet lookups.
        measure: Forwarded to the ``cgcn_functions`` calls.
        corpus: Forwarded to the ``cgcn_functions`` calls.
        corpusID: Forwarded to the ``cgcn_functions`` calls.
        gramRel: Forwarded to the ``cgcn_functions`` calls.
        limit_snp_f: Limit forwarded to ``cgcn_functions.friends_df``.
        limit_snp_fof: Limit forwarded to ``cgcn_functions.FoFData``.
        harvestSelect: Forwarded to the ``cgcn_functions`` calls.

    Returns:
        A DataFrame with the rows returned by ``get_graph_sentic_values`` for
        every processed lemma, each tagged with the lempos in a 'label'
        column. Lemmas whose processing raised are left out; the frame is
        empty when nothing could be processed.
    """
    frames = []
    for lemma in listOfLemmas:
        lempos = lemma + pos  # lempos form used to query the corpus
        if '_' in lempos:
            continue
        word, tag = lempos[0:-2], lempos[-2:]
        try:
            # friends of the source lemma
            f_snp_df = cgcn_functions.friends_df(language, word, tag, corpus, corpusID, gramRel, tag, limit_snp_f, measure, harvestSelect, 'undirected')
            # friends of friends
            fof_snp_df = cgcn_functions.FoFData(language, f_snp_df, word, tag, corpus, corpusID, gramRel, tag, limit_snp_fof, measure, harvestSelect, 'undirected')
            fof_snp_graph = cgcn_functions.FoF_graph(fof_snp_df, measure)
            fof_snp_graph_df = cgcn_functions.df_from_graph(fof_snp_graph)
            # SenticNet values of the lexical nodes in the graph
            sentic_snp = get_SenticNet_c_df(fof_snp_graph.vs["label"], language)
            sentic_snp_value = get_graph_sentic_values(sentic_snp, fof_snp_graph_df)
            sentic_snp_value['label'] = lempos
            frames.append(sentic_snp_value)
        except Exception:
            # Best-effort: a lemma that cannot be processed is skipped.
            continue
    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; with the blanket except it
    # made this function silently return an empty frame. Use pd.concat.
    return pd.concat(frames)
# calculate_sentic_values(['corona'], pos, language, measure, corpus, corpusID, gramRel, limit_snp_f, limit_snp_fof, harvestSelect)
########################################## Vader
def get_vader_srednjica_df(vader_values_df, node_importance_values_list, node_importance_measure):
    """Summarise the 'compound' column of a VADER frame for one importance measure.

    Rows with any missing value are dropped, the remaining 'compound' values
    are converted to floats and passed with the node-importance values to
    ``cgcn_functions.srednjica`` (defined outside this file; presumably a
    mean weighted by node importance -- confirm).

    Args:
        vader_values_df: DataFrame with a 'compound' column, one row per lemma.
        node_importance_values_list: Node-importance values, passed through
            unchanged.
        node_importance_measure: Name of the node-importance measure; used as
            the index label of the result.

    Returns:
        A one-row DataFrame indexed by ``node_importance_measure`` with a
        single 'compound' column.
    """
    complete_rows = vader_values_df.dropna()
    summary = pd.DataFrame(index=[node_importance_measure])
    compound_scores = [float(score) for score in complete_rows['compound']]
    summary['compound'] = cgcn_functions.srednjica(compound_scores, node_importance_values_list)
    return summary
#%%#%%##################################### SentiWords 1.1
# Load the SentiWords 1.1 lexicon into a module-level DataFrame at import time.
# The path is relative, so the file is resolved against the current working directory.
# header=25 takes the column names from row 25 (0-based); the rows above it are not
# parsed as data (presumably the file's comment preamble -- TODO confirm).
sentiWords = pd.read_csv('SentiWords_1.1.txt', header=25, sep='\t')
# Derived lookup key: lemma and pos joined by '-', e.g. 'fear-n'.
sentiWords['lempos']= sentiWords['lemma']+'-'+sentiWords['pos']
# Columns read from the file: 'lemma', 'pos', 'polarity_score' (plus the derived 'lempos').
#%%
# sentiWords[sentiWords['lempos']=='karlovac-n']
#%%
def get_sentiWords_values_df(lempos_list, sentiWordsDict):
    """Select the SentiWords rows for a list of lempos.

    Args:
        lempos_list: Lempos strings to look up, e.g. ``['fear-n', 'love-n']``.
        sentiWordsDict: SentiWords DataFrame (despite the name) with at least
            the 'lempos' and 'polarity_score' columns.

    Returns:
        The 'lempos' and 'polarity_score' columns of the rows whose lempos is
        in ``lempos_list``, in the row order of ``sentiWordsDict``.
    """
    is_requested = sentiWordsDict['lempos'].isin(lempos_list)
    return sentiWordsDict.loc[is_requested, ['lempos', 'polarity_score']]
# get_sentiWords_values_df(['fear-n', 'love-n', 'blood-n'], sentiWords)
def get_sentiWords_srednjica_df(sentiWords_values_df, node_importance_values_list, node_importance_measure):
    """Summarise the 'polarity_score' column of a SentiWords frame for one importance measure.

    Rows with any missing value are dropped, the remaining scores are
    converted to floats and passed with the node-importance values to
    ``cgcn_functions.srednjica`` (defined outside this file; presumably a
    mean weighted by node importance -- confirm).

    Args:
        sentiWords_values_df: DataFrame with a 'polarity_score' column, one
            row per lemma.
        node_importance_values_list: Node-importance values, passed through
            unchanged.
        node_importance_measure: Name of the node-importance measure; used as
            the index label of the result.

    Returns:
        A one-row DataFrame indexed by ``node_importance_measure`` with a
        single 'polarity_score' column.
    """
    scored_rows = sentiWords_values_df.dropna()
    summary = pd.DataFrame(index=[node_importance_measure])
    polarity_scores = [float(score) for score in scored_rows['polarity_score']]
    summary['polarity_score'] = cgcn_functions.srednjica(polarity_scores, node_importance_values_list)
    return summary
#%%
def get_sentiWords_values_graph_df(sentiWords_values_df, graph_df ):
    """Summarise SentiWords scores over a graph for a fixed set of importance measures.

    The measures are 'pagerank', 'degree', 'weighted_degree', 'betweenness'
    and 'sli'. Scores and node-importance values are handed to
    ``get_sentiWords_srednjica_df`` as two parallel lists, so they are first
    aligned row by row: the scored lempos are inner-joined with the graph
    nodes on the label. Previously the importance values were taken in graph
    row order and the scores in score-table row order, which pairs a score
    with another node's importance whenever the two orders differ, and gives
    lists of different length when a scored lempos is not in the graph.
    NOTE(review): this assumes ``cgcn_functions.srednjica`` pairs values and
    weights by position -- confirm.

    Args:
        sentiWords_values_df: DataFrame with 'lempos' and 'polarity_score'
            columns.
        graph_df: DataFrame of graph nodes with a 'label' column and one
            column per importance measure.

    Returns:
        A DataFrame with one row per measure, as produced by
        ``get_sentiWords_srednjica_df``.
    """
    node_importance_measures = ['pagerank', 'degree', 'weighted_degree', 'betweenness', 'sli']
    # Only lempos that actually carry a score take part in the summary.
    scored = sentiWords_values_df[sentiWords_values_df['polarity_score'].notna()]
    # An inner merge keeps the order of the left keys, so every row pairs a
    # score with the importance values of the very same node.
    aligned = scored.merge(graph_df, left_on='lempos', right_on='label', how='inner')
    aligned_scores = aligned[['lempos', 'polarity_score']]
    graph_sentiWords_values = pd.DataFrame()
    for node_importance_measure in node_importance_measures:
        node_importance_values_list = aligned[node_importance_measure].astype('float').tolist()
        measure_row = get_sentiWords_srednjica_df(aligned_scores, node_importance_values_list, node_importance_measure)
        graph_sentiWords_values = pd.concat([graph_sentiWords_values, measure_row])
    return graph_sentiWords_values
def calculate_sentiWords_values(sentiWordsDict, listOfLemmas, pos, language, measure, corpus, corpusID, gramRel, limit_snp_f, limit_snp_fof, harvestSelect):
    """Derive SentiWords values for lemmas from their friend-of-friend graphs.

    For every lemma the friends and friends-of-friends data are fetched
    through ``cgcn_functions``, turned into a graph, the SentiWords scores of
    the graph's node labels are looked up, and the result is summarised with
    ``get_sentiWords_values_graph_df``.

    Args:
        sentiWordsDict: SentiWords DataFrame forwarded to
            ``get_sentiWords_values_df``.
        listOfLemmas: Iterable of lemmas without POS suffix.
        pos: POS value forwarded to the ``cgcn_functions`` calls.
        language: Forwarded to the ``cgcn_functions`` calls.
        measure: Forwarded to the ``cgcn_functions`` calls.
        corpus: Forwarded to the ``cgcn_functions`` calls.
        corpusID: Forwarded to the ``cgcn_functions`` calls.
        gramRel: Forwarded to the ``cgcn_functions`` calls.
        limit_snp_f: Limit forwarded to ``cgcn_functions.friends_df``.
        limit_snp_fof: Limit forwarded to ``cgcn_functions.FoFData``.
        harvestSelect: Forwarded to the ``cgcn_functions`` calls.

    Returns:
        A DataFrame with the rows returned by
        ``get_sentiWords_values_graph_df`` for every processed lemma, each
        tagged with the bare lemma (no POS suffix) in a 'label' column.
        Lemmas whose processing raised are left out; the frame is empty when
        nothing could be processed.
    """
    frames = []
    for lemma in listOfLemmas:
        try:
            # friends of the source lemma
            f_snp_df = cgcn_functions.friends_df(language, lemma, pos, corpus, corpusID, gramRel, pos, limit_snp_f, measure, harvestSelect, 'undirected')
            # friends of friends
            fof_snp_df = cgcn_functions.FoFData(language, f_snp_df, lemma, pos, corpus, corpusID, gramRel, pos, limit_snp_fof, measure, harvestSelect, 'undirected')
            fof_snp_graph = cgcn_functions.FoF_graph(fof_snp_df, measure)
            fof_snp_graph_df = cgcn_functions.df_from_graph(fof_snp_graph)
            # SentiWords scores of the lexical nodes in the graph
            sentiWords_snp = get_sentiWords_values_df(fof_snp_graph.vs["label"], sentiWordsDict)
            sentiWords_snp_value = get_sentiWords_values_graph_df(sentiWords_snp, fof_snp_graph_df)
            sentiWords_snp_value['label'] = lemma
            frames.append(sentiWords_snp_value)
        except Exception:
            # Best-effort: a lemma that cannot be processed is skipped. Only
            # Exception is caught so Ctrl-C / SystemExit still work.
            continue
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
#%%
# Load the tab-separated SentiWordNet table into a module-level DataFrame at import time
# (relative path, resolved against the current working directory).
# get_word_sentiWordsNet_values_df reads its 'SynsetTerms' and 'POS' columns.
sentiWordsNet = pd.read_csv('SentiWordNet.txt', sep='\t')
# sentiWordsNet['ID']=sentiWordsNet['ID'].astype(int)
# sentiWordsNet.head()
def get_word_sentiWordsNet_values_df(sentiWordsNet, lemposlist):
    """Select the SentiWordNet rows for a list of lempos.

    A row matches a lempos when its 'SynsetTerms' value starts with the
    lemma followed by '#' and its 'POS' value equals the last character of
    the lempos.
    NOTE(review): only synsets whose first term is the lemma can match --
    confirm this is intended.

    Args:
        sentiWordsNet: SentiWordNet DataFrame with 'SynsetTerms' and 'POS'
            columns.
        lemposlist: Iterable of lempos strings, e.g. ``['fear-n']``; the last
            two characters are stripped to obtain the lemma.

    Returns:
        The matching rows for every lempos, concatenated in input order. An
        empty DataFrame when ``lemposlist`` is empty or every item failed.
    """
    matches = []
    for lempos in lemposlist:
        try:
            lemma = lempos[0:-2]
            pos = lempos[-1:]
            # Literal prefix test: the lemma used to be interpreted as a
            # regular expression, so 'c++' raised (and was silently skipped)
            # and 'a.b' also matched 'axb'. na=False treats missing terms as
            # non-matching.
            starts_with_lemma = sentiWordsNet['SynsetTerms'].str.startswith(lemma + '#', na=False)
            same_pos = sentiWordsNet['POS'] == pos
            matches.append(sentiWordsNet[starts_with_lemma & same_pos])
        except Exception:
            # Best-effort: an item that cannot be looked up is skipped. Only
            # Exception is caught so Ctrl-C / SystemExit still work.
            continue
    if not matches:
        return pd.DataFrame()
    return pd.concat(matches)
#%%
# def get_word_sentiWordsNet_values_df(sentiWordsNet, lemposlist):
# get_word_sentiWordsNet(sentiWordsNet, 'fear', 'n')