In [1]:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
os.chdir("/Users/Bya/git/predictEPL/MyFunctions/")
from dataIO import GetFilenames, csv_dic_df
from converter import toSeconds

import time
import pandas as pd
import csv
import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

### Preprocessing Function

In [3]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import replacers

def cleanHash(word):
    if word[0] == '#':
        return word[1::]
    elif word[0] == '@':
        return '@'
    elif word[0:4] == 'http':
        return 'http'
    else:
        return word

def preprocessing_tweet(tweet, debug = False):
    if debug:
        print("====================================")
        print("[Original Tweet]: \n\n %s \n\n" % tweet)


    # can't -> cannot, bya's -> bya is
    replacer = replacers.RegexpReplacer()
    tweet = replacer.replace(tweet)
    if debug:
        print("====================================")
        print("[Replaced Tweet]: \n\n %s \n\n" % tweet)


    # Tweet tokenizer and lower case
    words = TweetTokenizer().tokenize(tweet)
    words = [word.lower() for word in words]
    if debug:
        print("====================================")
        print("[Tokenized Tweet]: \n\n %s \n\n" % words)


    # defining stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'!', '.', ',', ':', ';', '#', '?', 'RT', '-', '@', 'rt'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print("====================================")
        print("[Cleaned Stopwords Tweet]: \n\n %s \n\n" % words)



    # words = map(lambda word: cleanHash(word), words)
    words = [cleanHash(word) for word in words]
    if debug:
        print("====================================")
        print("[Clean hash Tweet]: \n\n %s \n\n" % words)



    # Stemmer
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print("====================================")
        print("[Stemmed hash Tweet]: \n\n %s \n\n" % words_stemmed)



    # Cleaning not useful Tweets
    # ex: Watch Live Stream
    pattern1 = ["watch", "live", "stream"]
    good_tweet = True
    if pattern1[0] in words and pattern1[1] in words and pattern1[2] in words:
        good_tweet = False
        if debug:
            print("====================================")
            print("[Not good Tweet!]: \n\n detected: \n\n %s" % pattern1)
    
    return words, words_stemmed, good_tweet

### Read NRC Emotion-Lexicon and create dictionary

In [4]:
# read Emotion-Lexicon.txt
pathData = "/Users/Bya/Dropbox/Research/resources/NRC-Emotion-Lexicon-v0.92/"
os.chdir(pathData)
with open("Emotion-Lexicon.txt", 'r') as emoleRaw:
        emoleRaw = emoleRaw.readlines()


# create dic
dic_emolex = {}
dic_emolex_stemmed = {}

for line in emoleRaw:
    word, category, flag = line.split()
    flag = int(flag)
    
    if word not in dic_emolex:
        dic_emolex[word] = {}
        dic_emolex_stemmed[PorterStemmer().stem(word)] = {}
    
    dic_emolex[word][category] = flag
    dic_emolex_stemmed[PorterStemmer().stem(word)][category] = flag
    dic_emolex_stemmed[PorterStemmer().stem(word)]["_original_word"] = word

print("All Words: %s" % len(dic_emolex.keys()))

All Words: 14182


### Tweet's Emotion-Lexicon count

In [5]:
# sum of all words emotion-lexicon

def tweet_emolex(words, words_stemmed, good_tweet, debug = False):
    
    dic_tweet_emolex = {
        'anger': 0,
        'fear': 0,
        'disgust': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0,
        'joy': 0,
        'anticipation': 0,
        'positive': 0,
        'negative': 0,
    }
    
    if not good_tweet:
        return dic_tweet_emolex


    for i in range(len(words)):
        # search word on non-stemmed dictionary
        if words[i] in dic_emolex:
            if debug:
                print("w->d")
                print(words[i])
                print(dic_emolex[words[i]])

            for key in dic_tweet_emolex.keys():
                dic_tweet_emolex[key] += dic_emolex[words[i]][key]
        
        # search stemmed word on non-stemmed dictionary
        elif words_stemmed[i] in dic_emolex:
            if debug:
                print("sw->d")
                print(words[i])
                print(words_stemmed[i])
                print(dic_emolex[words_stemmed[i]])

            for key in dic_tweet_emolex.keys():
                dic_tweet_emolex[key] += dic_emolex[words_stemmed[i]][key]

        # search stemmed word on stemmed dictionary
        elif words_stemmed[i] in dic_emolex_stemmed:
            if debug:
                print("sw->sd")
                print(words[i])
                print(words_stemmed[i])
                print(dic_emolex_stemmed[words_stemmed[i]])

            for key in dic_tweet_emolex.keys():
                dic_tweet_emolex[key] += dic_emolex_stemmed[words_stemmed[i]][key]

    return dic_tweet_emolex

### Plot Emotion-Lexion

In [6]:
def create_minute_dics_emolex(df, side, category, cumulative = False):
    minutes = range(120) 
    dic_minutes = {}

    for minute in minutes:    
        try:
            if cumulative:
                dic_minutes[minute] = sum(df[(df['side'] == side) & \
                                                    (df['ith_minute'] < minute + 1)][category])
            else:
                dic_minutes[minute] = sum(df[(df['side'] == side) & \
                                                    (df['ith_minute'] > minute) & \
                                                    (df['ith_minute'] < minute + 1)][category])
        except:
            dic_minutes[minute] = 0
        
    return (dic_minutes)


def plot_emolex(home_team, list_dic_minutes, colors, categorys, i_plot,limit=True):

    ax = plt.subplot(2,2,i_plot)
    if limit:
        ax.set_xlim([0, 60])
    else:
        ax.set_xlim([0, 120])

    for i in range(len(list_dic_minutes)):
        tweets = tuple(list(list_dic_minutes[i].values()))
        plot1 = plt.plot(tweets, label=categorys[i], alpha=0.5, color=colors[i])

    fp = FontProperties(fname=r'/Library/Fonts/ヒラギノ丸ゴ ProN W4.ttc')
    plt.xlabel("時間(分)", fontproperties=fp)
    plt.ylabel('感情ワード数', fontproperties=fp)
    plt.title('Emotion-Lexicon  (チーム名: %s)'  % home_team, fontproperties=fp)
    plt.legend()
    plt.tight_layout()


def plot_three(df, home_team, away_team):
    side = ['home', 'away', 'both']
    categorys_neg = ['anger', 'fear', 'disgust', 'sadness', 'surprise']
    colors_neg = ['red', 'darkorange', 'darksalmon', 'palegreen', 'darkgreen']
    categorys_pos = ['trust', 'joy', 'anticipation']
    colors_pos = ['skyblue', 'purple', 'blue']
    list_dic_minutes = []
    categorys_sent = ['positive', 'negative']
    colors_sent = ['green', 'red']

    plt.figure(figsize=(15, 10))

    list_dic_minutes = []
    for category in categorys_neg:
        list_dic_minutes.append(create_minute_dics_emolex(df, side[0], category))
    plot_emolex(home_team, list_dic_minutes, colors_neg, categorys_neg, 1, False)

    list_dic_minutes = []
    for category in categorys_pos:
        list_dic_minutes.append(create_minute_dics_emolex(df, side[0], category))
    plot_emolex(home_team, list_dic_minutes, colors_pos, categorys_pos, 3, False)

    
    list_dic_minutes = []
    for category in categorys_neg:
        list_dic_minutes.append(create_minute_dics_emolex(df, side[1], category))
    plot_emolex(away_team, list_dic_minutes, colors_neg, categorys_neg, 2, False)

    list_dic_minutes = []
    for category in categorys_pos:
        list_dic_minutes.append(create_minute_dics_emolex(df, side[1], category))
    plot_emolex(away_team, list_dic_minutes, colors_pos, categorys_pos, 4, False)

    plt.show()

### Reading Soccer Match datas and Create Emotion-Lexicon Dataframe

In [7]:
# read csv data as dataframe
def create_emolex_df(filePath):
    home_team = filePath.split("_")[0][5::]
    away_team = filePath.split("_")[2][0:-4]

    df = csv_dic_df(filePath)

    # adding minute column
    df['ith_minute'] = [float(ith_minute) for ith_minute in df['ith_minute']]

    # adding Emotion-Lexicon columns
    df['anger'] = 0
    df['fear'] = 0
    df['disgust'] = 0
    df['sadness'] = 0
    df['surprise'] = 0
    df['trust'] = 0
    df['joy'] = 0
    df['anticipation'] = 0
    df['positive'] = 0
    df['negative'] = 0


    # adding emotion-lexicon values to dataframe
    for i in range(len(df)):
        tweet = df.loc[i]['text']

        words, words_stemmed, good_tweet = preprocessing_tweet(tweet)
        dic_counted = tweet_emolex(words, words_stemmed, good_tweet)

        df.loc[i, 'anger'] = dic_counted['anger']
        df.loc[i, 'anticipation'] = dic_counted['anticipation']
        df.loc[i, 'disgust'] = dic_counted['disgust']
        df.loc[i, 'fear'] = dic_counted['fear']
        df.loc[i, 'joy'] = dic_counted['joy']
        df.loc[i, 'negative'] = dic_counted['negative']
        df.loc[i, 'positive'] = dic_counted['positive']
        df.loc[i, 'sadness'] = dic_counted['sadness']
        df.loc[i, 'surprise'] = dic_counted['surprise']
        df.loc[i, 'trust'] = dic_counted['trust']
    
    return df

### Counting Emotion and Sentiment

In [8]:
def calculate_df_emolex(df, side):
    dic = {
        'anger': [sum(df[df['side'] == side]['anger'])],
        'fear': [sum(df[df['side'] == side]['fear'])],
        'disgust': [sum(df[df['side'] == side]['disgust'])],
        'sadness': [sum(df[df['side'] == side]['sadness'])],
        'surprise': [sum(df[df['side'] == side]['surprise'])],
        'trust': [sum(df[df['side'] == side]['trust'])],
        'joy': [sum(df[df['side'] == side]['joy'])],
        'anticipation': [sum(df[df['side'] == side]['anticipation'])]
    }
    
    
    dic_sent = {
        'positive': [sum(df[df['side'] == side]['positive'])],
        'negative': [sum(df[df['side'] == side]['negative'])]
    }
    
    
    emolex_sum = float(sum(list(dic.values())))
    emolex_sum_sent = float(sum(list(dic_sent.values())))

    for key in dic.keys():
        dic[key].append(dic[key][0] / emolex_sum)
    
    for key in dic_sent.keys():
        dic_sent[key].append(dic_sent[key][0] / emolex_sum_sent)
    
    return dic, dic_sent

In [78]:
# read Emotion-Lexicon.txt
pathData = "/Users/Bya/Dropbox/Research/resources/NRC-Hashtag-Sentiment-Lexicon-v0.1//"
os.chdir(pathData)
with open("sentimenthashtags.txt", 'r') as sentRaw:
        sentRaw = sentRaw.readlines()


# create dic
dic_sent = {}

for line in sentRaw:
    word, category = line.split()    
    dic_sent[word] = category
    

print("All Words: %s" % len(dic_sent.keys()))

All Words: 77


In [183]:
def sented_emoted_tags(filename):
    df = csv_dic_df(filename)

    tags = list(df.tags)

    all_tags = []
    for tag in tags:
        tag = tag.split(',')
        for t in tag:
            all_tags.append(t)

    print("All tags length:", len(all_tags))



    import nltk
    fd = nltk.FreqDist(all_tags)
    print("Unique tag length:", len(fd.keys()), '\n\n\n')

    # generate a chart of the 50 most frequent words
    # fd.plot(50,cumulative=False)
    # fd.plot(10,cumulative=False)





    tag_keys = list(fd.keys())
    emo_keys = list(dic_emolex.keys())
    tag_emo = []
    for k in tag_keys:
        if k in emos and sum(list(dic_emolex[k].values())):
            if k not in tag_emo:
                tag_emo.append(k)
                
    print("Tagged Emotion words length:", len(tag_emo))
    print(tag_emo[:10], '\n\n\n')



    sent_keys = list(dic_sent.keys())
    tag_sent = []
    for k in tag_keys:
        if k in sent_keys:
            if k not in tag_sent:
                tag_sent.append(k)

    print("Tagged Sentiment words length:", len(tag_sent))
    print(tag_sent[:10], '\n\n\n')



    for i in range(len(df)):
        tags = df.loc[i][9].split(',')
        for sent in tag_sent:
            if sent in tags:
                print('[', dic_sent[sent], ']' , df.loc[i][10], '\n')
                if df.loc[i][10] not in dic_supervised_sent[dic_sent[sent]]:
                    dic_supervised_sent[dic_sent[sent]].append(df.loc[i][10])
                    
        for sent in tag_emo:
            if sent in tags:
                for e_k in dic_emolex[sent].keys():
                    if dic_emolex[sent][e_k]:
#                         print('[', e_k, ']' , df.loc[i][10], '\n')
                        if df.loc[i][10] not in dic_supervised_emo[e_k]:
                            dic_supervised_emo[e_k].append(df.loc[i][10])

### Ploting and Calculating GW10

In [184]:
# choosing data
os.chdir('/Users/Bya/Dropbox/Research/datas/Results/NB_Short_Long/')

dic_supervised_sent = {'positive': [], 'negative': []}
dic_supervised_emo = {
    'anger': [],
    'fear': [],
    'disgust': [],
    'sadness': [],
    'surprise': [],
    'trust': [],
    'joy': [],
    'anticipation': [],
    'positive': [],
    'negative': [],
}
game_weeks = range(5, 11)
# game_weeks = [10]

for week in game_weeks:
    filenames = GetFilenames('GW' + str(week))
    for filename in filenames:
#         filename = filenames[0]
        home_team = filename.split("_")[0][5::]
        away_team = filename.split("_")[2][0:-4]

        print('===============================')
        print(home_team, ' vs ', away_team)

        start = time.time()

        sented_emoted_tags(filename)


        # time
        print("\n[Time]: %s\n\n\n" % (time.time() - start))

rsenal  vs  Stoke
All tags length: 26163
Unique tag length: 594 



Tagged Emotion words length: 29
['football', 'bonus', 'crystal', 'boxing', 'scandal', 'hero', 'sweet', 'explain', 'breakfast', 'joke'] 



Tagged Sentiment words length: 2
['perfection', 'superb'] 



[ positive ] Tackle --&gt; Pass --&gt; Touch to bring it down #Superb #Arsenal 

[ positive ] That Ozil pass for Walcott's goal ❤️ 👌🏽 #Magic #Perfection #arsenal 


[Time]: 2.227724075317383



rystal  vs  City
All tags length: 15383
Unique tag length: 308 



Tagged Emotion words length: 11
['football', 'blues', 'crystal', 'hazard', 'herbal', 'perfection', 'bury', 'disgrace', 'dirty', 'disgraceful'] 



Tagged Sentiment words length: 1
['perfection'] 



[ positive ] Watching #mcfc
the quest for #perfection continues. 

[ positive ] Watching #mcfc
The quest for #perfection continues. 


[Time]: 1.7024610042572021



verton  vs  Chelsea
All tags length: 67991
Unique tag length: 950 



Tagged Emotion words length: 60
['di

In [185]:
neg_len = len(dic_supervised_sent['negative'])
pos_len = len(dic_supervised_sent['positive'])

print("\n[Using Sentiment Hashtags]:\n")
print("\tPos: %s \n\tNeg: %s\n" %(pos_len, neg_len))



anger_len = len(dic_supervised_emo['anger'])
fear_len = len(dic_supervised_emo['fear'])
disgust_len = len(dic_supervised_emo['disgust'])
sadness_len = len(dic_supervised_emo['sadness'])
surprise_len = len(dic_supervised_emo['surprise'])

trust_len = len(dic_supervised_emo['trust'])
joy_len = len(dic_supervised_emo['joy'])
anticipation_len = len(dic_supervised_emo['anticipation'])

positive_len = len(dic_supervised_emo['positive'])
negative_len = len(dic_supervised_emo['negative'])

print("\n[Using Emolex Hashtags]:\n")

print("\t", "Anger: ", anger_len)
print("\t", "Fear: ", fear_len)
print("\t", "Sadness: ", sadness_len)
print("\t", "Disgust: ", disgust_len)
print("\t", "Surprise: ", surprise_len)

print("\n\t", "Trust: ", positive_len)
print("\t", "Joy: ", positive_len)
print("\t", "Anticipation: ", anticipation_len)

print("\n\t", "Positive: ", positive_len)
print("\t", "Negative: ", negative_len)


[Using Sentiment Hashtags]:

	Pos: 67 
	Neg: 151


[Using Emolex Hashtags]:

	 Anger:  1395
	 Fear:  860
	 Sadness:  734
	 Disgust:  572
	 Surprise:  381

	 Trust:  3214
	 Joy:  3214
	 Anticipation:  2023

	 Positive:  3214
	 Negative:  1801


In [186]:
dic_supervised_emo['trust']

['Arsenal should be one nil up. Butland with a world class save to deny Alexis. #Arsenal #save',
 'RT @fantasyiteam: Arsenal should be one nil up. Butland with a world class save to deny Alexis. #Arsenal #save',
 'How is #afc not already leading? #explain #AFCvSCFC',
 'Yeah yeah Theo scored and Ozil got the assist but what a tackle by Coquelin to win the ball. #hero #afc',
 '@AndyRunsBadly Heck yeah! #TopOfTheTableAndNotJustBecauseItsInAlphabeticalOrder #AFC #winning',
 'That tackle from Coq. That pass from Ozil. That finish from Theo. Quality all around. #sweet #arsenal #afc',
 "That Ozil pass for Walcott's goal ❤️ 👌\U0001f3fd #Magic #Perfection #arsenal",
 'One nil to the #Arsenal #Finally #ARSSTK',
 '11 goals for Theo in his last 11 League starts... \n\n#effective #arsenal',
 'RT @bexington49: 11 goals for Theo in his last 11 League starts... \n\n#effective #arsenal',
 'perfection from #Ozil wow wonderfull Assist  #respect #Arsenal vs #Stoke',
 'RT @TheAnalysisVids: Francis Coquelin

In [187]:
dic_supervised_emo['positive']

['Wow they still play old music like #fatboyslim at international soccer matches. #football #soccer #arsenal',
 'Arsenal should be one nil up. Butland with a world class save to deny Alexis. #Arsenal #save',
 'RT @fantasyiteam: Arsenal should be one nil up. Butland with a world class save to deny Alexis. #Arsenal #save',
 'Anyone got a link to the game?? #Arsenal #Football #LiveLink',
 'Check out this #job: #CAMHs #CBT Therapist - Stoke at Labmed Recruitment in #Stoke-on-Trent #jobs http://t.co/HgyOy9UOTi',
 '.#Football is back this weekend... Lets see your #FootballSelfie #FootballFitty #selfie #sexyselfie #arsenal #afc http://t.co/XSLvUwytEh',
 'Beer and football make an amazing combination. Time for some #trappistwestvleteren!\n\n#arsenal #beer… https://t.co/8q4Qo5yRGr',
 'Why does Theo start instead of Giroud ?? #ArsvSto #Arsenal #AFC #BPL #Football #Stoke',
 '#Follow #FollowBack #Footy #Football #Keyrings #Sports #AVFC #MUFC #AFC #CFC #MCFC #Chelsea #Villa #ManU #ManCity Pls RT!!'

In [188]:
t = 'Everton Vs Chelsea\n\nLive Stream: http://t.co/o0cGxU5mU0\n\nftblsk4 #EFC #COYB #EFCvCHE #CFC #FF #COYBG #football #news #EFCvCFC #CHN'

In [191]:
t not in dic_supervised_emo['positive']

False

In [198]:
i = 0
for key in dic_sent.keys():
    if dic_sent[key] == 'positive':
        print(key)
        i += 1
print(i)

perfect
wonderful
beauty
excellent
fine
great
sensational
excellence
elegant
perfection
positive
classy
ideal
magnificent
super
superb
good
beautiful
nice
splendid
amazing
dazzling
marvelous
fabulous
desirable
elegance
26


In [197]:
i = 0
for key in dic_sent.keys():
    if dic_sent[key] == 'negative':
        print(key)
        i += 1
print(i)

exceptional
tacky
unsatisfactory
ill
abominable
shoddy
poor
egregious
blemish
sad
infernal
hopeless
exquisite
foul
negative
shameful
terrible
sinful
pathetic
woeful
vile
detestable
deplorable
heavenly
grievous
sinister
inferior
dire
rotten
bad
botch
worthless
detest
paltry
unworthy
filthy
wretched
awful
terrific
unacceptable
squalid
sorry
lousy
sleazy
seamy
abysmal
crummy
despicable
dreadful
bungle
undesirable
51
