In [1]:
import nltk
from nltk.corpus import stopwords
from math import exp, expm1, log, log10, cos, sin, sqrt
import math
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.distance import cdist
import pandas as pd
import re, string, unicodedata
from tqdm import tqdm
lexicon_pd = pd.read_csv("SentiWordNet3.txt", sep=";")

lexicon_comb = {}
lexicon_pos = {}
lexicon_base = {}

for k,row in tqdm(lexicon_pd.iterrows()):
    score = row["PosScore"] - row["NegScore"]
    words = row["SynsetTerms"].split(" ")
    for word in words:
        word_base = word.split("#")[0] + "#" + row["POS"]
        if word_base in lexicon_comb:
            lexicon_comb[word_base] += [score]
        else:
            lexicon_comb[word_base] = [score]
for (word, scores) in lexicon_comb.items():
    score = 0
    sum_weight = 0
    for i in range(len(scores)):
        score += 1 / (i + 1) * scores[i]
        sum_weight += 1 / (i+1)
    lexicon_pos[word] = score / sum_weight

for (word, value) in lexicon_pos.items():
    word = word.split("#")[0]
    if word in lexicon_base:
        lexicon_base[word] += value
    else:
        lexicon_base[word] = value
        
textdf = pd.read_csv("..\\GoldenStandard\\Golden_Standard.csv", sep=";", parse_dates=[1])

117659it [00:12, 9435.67it/s]


In [2]:
## AdaUSA Sliding Window

def AdaUSA(data, lexicon, learning="N", forgetting="N", alpha=0.05, theta=0.5, window_size=100, debug=False):
    possible_behavior = [
        "N", # No updates
        "S", # Sliding updates
        "T"  # Tumbling updates
    ]
    if not (learning in possible_behavior and forgetting in possible_behavior):
        print("parameters learning and/or forgetting not set correctly! Make sure they both are in [N, S, T]")
        return
    
    def preprocess(sentence):
        new_words = []
        for word in sentence.split(" "):
            new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')

            # Remove punctuation
            new_word = re.sub(r'([^a-zA-Z\s])', '', new_word).lower()

            # Remove stopwords
            if new_word in stopwords.words('english'):
                new_word = ''

            if new_word != '':
                new_words.append(new_word)

        return new_words
    
    def get_contextual_sentiment(text, lexicon):
    
        def get_TDOC(lines, key, lexicon):
            freq = {'Init': 0}              #Number of times context term occurs with key
            freq.clear()
            prohib = stopwords.words("english")
            for line in lines:
                words = line.split(" ")
                if key in words:
                    for context in words:
                        if context != key and context:
                            freq.setdefault(context, 0)
                            freq[context] = freq.get(context) + 1

            N = 0                           #Total Number of terms in Document
            for line in lines:
                words = line.split(" ")
                N += len(words)

            Nci = {'Init': 0}               #Total terms that occur with context term
            Nci.clear()
            for context in freq.keys():
                for line in lines:
                    words = line.split(" ")
                    if context in words:
                        Nci.setdefault(context, 0)
                        Nci[context] += len(words)

            radii = {'Init': 0}             #Get Radius of context term with TDOC formula
            radii.clear()
            for term in freq.keys():
                 radii[term] = freq[term]*(log(N/Nci[term]))
            return radii                    #Returns entire set of context terms related to key

        def geometric_median(points, options={}):

            points = np.asarray(points)

            # objective function
            def aggregate_distance(x):
                return cdist([x], points).sum()

            # initial guess: centroid
            centroid = points.mean(axis=0)

            optimize_result = minimize(aggregate_distance, centroid, method='COBYLA')

            return optimize_result.x  


        word_list = []

        for line in body:
            for word in line.split(" "):
                if word not in word_list:
                    word_list.append(word)

        context = {}
        for word in word_list:
            if word not in lexicon:
                continue
            tdoc = get_TDOC(body, word, lexicon)

            vectors = []

            for k, v in tdoc.items():
                if k in lexicon:
                    vectors.append([v * cos(lexicon[k] * math.pi), v * sin(lexicon[k] * math.pi)])

            if len(vectors) > 0:
                context[word] = geometric_median(vectors)[1]
            else:
                context[word] = 0
        return context

    def update_rules(prior, context, theta, alpha, in_lexicon=True):
        # Expanding Rule
        if not in_lexicon:
            return context

        # Update Rules
        # Same orientation 
        elif abs(context) + abs(prior) == abs(context + prior):
            if abs(context) > abs(prior) and abs(context) > theta:
                return prior + alpha if prior > 0 else prior - alpha
        # Different orientation
        elif context * prior < 0:
            if abs(context) > theta and prior <= theta:
                return alpha if prior < 0 else -alpha
            elif abs(context) > theta and abs(prior) > theta:
                return prior - alpha if prior > 0 else prior + alpha

        return prior

    def calculate_sentiment(body_array, lexicon):
        sent_score = 0
        for sentence in body_array:
            for word in sentence.split(" "):
                if word in lexicon:
                    sent_score += lexicon[word]
        return -1 if sent_score < 0 else 1
    
    i = 0

    window = []
    update_memory = {}
    
    if forgetting == "T":
        base_lexicon = dict(lexicon)
    
    for _, article in data.iterrows():
        i += 1
        body = []
        text = article["body"]
        
        if forgetting == "S":
            outdated = []
            for (date, information) in update_memory.items():
                if (article["date"] - date).days > 365:
                    old_rules = update_memory[date]
                    outdated.append(date)
                    
                    if debug:
                        print(";;; Forgetting rule:")
                        print(";;; ", str(old_rules))
                    for old_rule in old_rules:
                        lexicon[old_rule[0]] -= old_rule[1]

            for date in outdated:
                update_memory.pop(date)

        # Pre-process article
        for sent in text.split(". "):
            body.append(" ".join(preprocess(sent)))

        pred = calculate_sentiment(body, lexicon)
        true = article["sentiment"]

        if pred == -1:
            if true == -1:
                out = "TN"
            else:
                out = "FN"
        else:
            if true == -1:
                out = "FP"
            else:
                out = "TP"

        print("%i;%s" % (i, out))

        body_concat = ""
        for line in body:
            body_concat += line
        window.append(line)

        if learning in ["S", "T"]:
            if (learning == "S" and i < window_size) or (learning == "T" and i % window_size):
                continue
            
            if forgetting == "T":
                lexicon = dict(base_lexicon)
            
            if debug:
                lexicon_old = dict(lexicon)
                
            # Per word in the collection, compute the contextual sentiment
            contexts = get_contextual_sentiment(window, lexicon)

            word_updates = []

            # Update the sentiment score of a word according to its context and the updating rules
            for word, context in contexts.items():
                prior = lexicon[word]
                context /= 5

                updated_prior = update_rules(prior, context, theta, alpha)
                if lexicon[word] != updated_prior:
                    word_updates.append((word, updated_prior - lexicon[word]))
                    lexicon[word] = updated_prior

            if forgetting == "S" and len(word_updates) > 0:
                if article["date"] in update_memory:
                    update_memory[article["date"]] += word_updates
                else:
                    update_memory[article["date"]] = word_updates

            if debug:
                # Check which words changed
                print(";;; Update after iteration %s" % i)
                print(";;; ", str(update_memory))
                for k, v in lexicon_old.items():
                    if lexicon[k] != v:
                        print(";;; %s: %s" % (k, lexicon[k]))
    
            if learning == "S":
                window = window[1:100]
            else:
                window = []

In [4]:
lexicon = dict(lexicon_base)

AdaUSA(textdf, lexicon, learning="T", forgetting="N", alpha=0.05, theta=0.5, window_size=100, debug=False)

1;TN
2;FP
3;FP
4;FP
5;FP
6;TP
7;TP
8;FP
9;TP
10;TP
11;FP
12;FP
13;TN
14;TP
15;FP
16;TP
17;TP
18;FP
19;FN
20;FP
21;FP
22;FP
23;TN
24;TN
25;TP
26;FP
27;FP
28;FP
29;FP
30;FP
31;TP
32;TP
33;TP
34;FP
35;FP
36;TP
37;FP
38;FP
39;TP
40;FP
41;TN
42;FP
43;FP
44;FP
45;FP
46;FP
47;TP
48;TP
49;TP
50;TP
51;FP
52;FP
53;FP
54;FP
55;FP
56;FP
57;TP
58;TP
59;FP
60;TP
61;TN
62;FP
63;FP
64;FP
65;TN
66;FP
67;FP
68;TN
69;FP
70;FP
71;TP
72;FN
73;FP
74;FP
75;FP
76;TP
77;FP
78;TP
79;TN
80;TP
81;FP
82;FP
83;FP
84;FP
85;FP
86;FP
87;TN
88;TN
89;FP
90;FP
91;TN
92;TP
93;FP
94;FP
95;TP
96;TN
97;FP
98;FP
99;FP
100;TP
101;TP
102;TP
103;TP
104;TP
105;FP
106;FP
107;TP
108;FP
109;FP
110;TP
111;FP
112;FP
113;FP
114;TN
115;TN
116;FP
117;FP
118;FN
119;FN
120;FP
121;TP
122;FP
123;TP
124;TP
125;TP
126;FP
127;TP
128;TP
129;TP
130;FP
131;TP
132;TP
133;TP
134;TP
135;FP
136;TN
137;TP
138;TP
139;TN
140;FP
141;TP
142;TP
143;FP
144;TP
145;TN
146;TP
147;TP
148;FP
149;FP
150;TP
151;TN
152;TN
153;TN
154;TP
155;FP
156;TP
157;FP
158;FN
15

2187;FP
2188;FP
2189;FN
2190;TP
2191;TP
2192;FP
2193;TP
2194;TP
2195;TN
2196;TP
2197;TP
2198;FP
2199;TP
2200;FP
2201;TP
2202;TP
2203;FP
2204;TP
2205;TP
2206;TP
2207;TP
2208;FP
2209;TP
2210;TP
2211;TP
2212;TP
2213;TP
2214;TP
2215;TP
2216;TP
2217;FP
2218;TP
2219;TN
2220;TP
2221;TP
2222;TP
2223;TP
2224;TP
2225;TP
2226;TP
2227;FP
2228;TP
2229;TP
2230;FP
2231;FP
2232;TP
2233;FP
2234;FP
2235;TP
2236;TP
2237;FP
2238;TP
2239;TP
2240;TP
2241;TP
2242;TP
2243;TP
2244;TN
2245;TP
2246;TP
2247;FP
2248;TP
2249;TP
2250;TP
2251;TP
2252;FP
2253;TP
2254;TP
2255;TP
2256;TP
2257;TP
2258;TP
2259;FP
2260;TP
2261;TP
2262;TP
2263;TP
2264;TP
2265;TP
2266;TP
2267;FP
2268;TP
2269;FP
2270;TP
2271;TN
2272;TN
2273;TP
2274;TP
2275;TP
2276;TP
2277;TP
2278;TP
2279;TP
2280;TP
2281;FP
2282;TN
2283;TP
2284;TP
2285;TP
2286;TP
2287;FN
2288;TP
2289;TP
2290;TN
2291;TP
2292;TP
2293;TP
2294;TP
2295;FP
2296;TP
2297;TP
2298;TP
2299;TP
2300;TP
2301;TP
2302;FP
2303;TP
2304;FP
2305;TP
2306;TP
2307;FP
2308;TN
2309;TP
2310;TP
2311;FP


4235;FP
4236;TP
4237;TP
4238;FP
4239;FP
4240;TP
4241;FP
4242;FP
4243;FP
4244;FP
4245;FP
4246;FP
4247;TN
4248;FP
4249;FP
4250;FP
4251;TP
4252;FP
4253;FP
4254;FP
4255;TP
4256;FP
4257;FP
4258;TP
4259;FP
4260;FP
4261;FP
4262;TP
4263;FP
4264;FP
4265;FP
4266;TN
4267;TN
4268;FP
4269;TN
4270;FP
4271;TP
4272;FP
4273;FP
4274;FP
4275;FP
4276;TP
4277;FP
4278;TN
4279;FP
4280;FP
4281;TP
4282;FP
4283;TP
4284;FP
4285;TN
4286;FP
4287;TP
4288;FP
4289;TP
4290;FP
4291;FP
4292;FP
4293;FP
4294;FP
4295;TP
4296;FP
4297;FP
4298;TP
4299;FP
4300;FP
4301;TP
4302;TP
4303;TP
4304;TP
4305;TN
4306;FP
4307;FP
4308;TP
4309;FP
4310;FP
4311;FP
4312;TN
4313;TP
4314;FP
4315;FP
4316;FP
4317;TP
4318;FP
4319;FP
4320;FP
4321;FP
4322;FP
4323;TP
4324;FP
4325;TP
4326;FP
4327;TN
4328;TP
4329;FP
4330;TP
4331;FP
4332;TP
4333;TN
4334;FP
4335;FP
4336;TP
4337;FP
4338;FP
4339;TP
4340;FP
4341;FP
4342;FP
4343;FP
4344;FP
4345;FP
4346;FP
4347;TN
4348;TP
4349;TN
4350;FP
4351;FP
4352;TP
4353;FP
4354;FP
4355;FP
4356;FP
4357;TN
4358;FP
4359;FP
