In [3]:
import pandas as pd
import spacy
import re
import datetime
from textblob import TextBlob

import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import interact, IntSlider
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so prefer the new name when it exists
# and fall back to the historical one on older installations.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
%matplotlib inline

def split_times(start, df):
    """Slice ``df`` into seven consecutive time windows counted from ``start``.

    The first six windows are 15 minutes long; the seventh runs from 90 to
    112.5 minutes after ``start``.  Each window includes its lower bound and
    excludes its upper bound.

    Parameters
    ----------
    start : str
        Window origin formatted as "%Y-%m-%d %H:%M:%S".
    df : pandas.DataFrame
        Must contain a ``tweet_date`` column.  The bounds are compared as
        formatted strings, so this assumes ``tweet_date`` holds strings in the
        same format -- TODO confirm against the CSV contents.

    Returns
    -------
    list of pandas.DataFrame
        Seven row subsets of ``df``, in chronological order.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    origin = datetime.datetime.strptime(start, stamp_format)

    # Offsets in minutes; the final upper bound is 15 * 7.5 = 112.5.
    lower_offsets = [15 * step for step in range(0, 7)]
    upper_offsets = [15 * step for step in range(1, 7)] + [15 * (7 + 0.5)]

    windows = []
    for lower, upper in zip(lower_offsets, upper_offsets):
        lower_stamp = (origin + datetime.timedelta(minutes=lower)).strftime(stamp_format)
        upper_stamp = (origin + datetime.timedelta(minutes=upper)).strftime(stamp_format)
        inside = (df["tweet_date"] >= lower_stamp) & (df["tweet_date"] < upper_stamp)
        windows.append(df[inside])

    return windows

# Compiled once so that repeated calls do not rebuild the pattern.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\u2600-\u26FF"          # miscellaneous symbols (e.g. the football)
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 left behind by emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(string): # credit: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
    """Return ``string`` with emoji characters removed.

    Besides the four ranges of the credited snippet, this also strips the
    supplemental pictographs block, miscellaneous symbols, dingbats and the
    emoji variation selector, which the original pattern let through.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The text without any character matched by the emoji pattern; all other
        characters, including accented letters, are left unchanged.
    """
    return _EMOJI_PATTERN.sub("", string)

def format_raw_df(raw_df):
    """Label, total, filter and sort the merged per-window entity counts.

    The incoming frame must hold nine columns: the entity text, one placeholder
    column that is discarded, and seven count columns that are relabelled
    "15", "30", "45", "break", "60", "75", "90" in their existing order.

    The entity column is located by name when a column called "entity" exists;
    otherwise the second column is used, which is the layout a purely
    positional renaming ("null", "entity", ...) assumes.  Of the remaining
    eight columns the first one is treated as the placeholder.
    NOTE(review): an outer ``pd.merge`` on "entity" appears to keep the key as
    the first column, in which case positional renaming would have discarded
    the entity text -- confirm against the frame built by the caller.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Nine-column frame as described above.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, the seven window labels and ``total`` (row-wise sum
        of the seven counts), without rows whose entity contains "RT" or
        "http", with missing values replaced by 0, sorted by ``total`` in
        descending order.

    Raises
    ------
    ValueError
        If ``raw_df`` does not have exactly nine columns.
    """
    window_labels = ["15", "30", "45", "break", "60", "75", "90"]

    column_names = list(raw_df.columns)
    entity_pos = column_names.index("entity") if "entity" in column_names else 1
    other_pos = [pos for pos in range(len(column_names)) if pos != entity_pos]
    if len(other_pos) != len(window_labels) + 1:
        raise ValueError(
            "Expected 9 columns (entity, placeholder, 7 time windows), got {}".format(len(column_names))
        )

    # other_pos[0] is the placeholder; select by position because merged frames
    # may carry duplicate column names.
    counts = raw_df.iloc[:, other_pos[1:]].copy()
    counts.columns = window_labels
    # numeric_only avoids summing text with numbers, which newer pandas rejects.
    formatted = counts.assign(total=counts.sum(axis=1, numeric_only=True))
    formatted.insert(0, "entity", raw_df.iloc[:, entity_pos].values)

    # Substring filters: "RT" drops retweet handles, "http" drops links picked
    # up as entities.  NOTE(review): "RT" is case-sensitive and also matches
    # inside upper-case names such as "ROBERTO".
    entity_text = formatted["entity"].astype(str)
    keep = ~entity_text.str.contains("RT") & ~entity_text.str.contains("http")
    return formatted[keep].fillna(0).sort_values(by="total", ascending=False)

def entities_processing(filename, start, chunk_size=200000, language="en"):
    """Count PERSON entities mentioned in a match's tweets, per time window.

    Reads the tweet CSV, removes rows with a duplicated ``tweet_id``, slices
    the tweets into the windows returned by ``split_times`` and runs spaCy's
    named-entity recogniser over each window's text in fixed-size character
    chunks.  The per-window counts are outer-merged on the entity text and the
    merged frame is passed through ``format_raw_df``.

    Parameters
    ----------
    filename : str
        Header-less CSV with the columns tweet_id, tweet_date, user_location,
        tweet (in that order).
    start : str
        Origin of the time windows, formatted "%Y-%m-%d %H:%M:%S".
    chunk_size : int, optional
        Number of characters handed to spaCy per call.
    language : str, optional
        Model name passed to ``spacy.load``.  Previously this argument was
        accepted but ignored in favour of a hard-coded 'en'.

    Returns
    -------
    pandas.DataFrame
        Whatever ``format_raw_df`` returns for the merged counts.

    Notes
    -----
    NOTE(review): each window's text is the ``str()`` of a Python list of
    tweets, so the recogniser also sees the list's quotes, commas and escape
    sequences, and a chunk boundary can fall in the middle of a name.  Left
    unchanged here because altering it would change the extracted counts.
    """
    df = pd.read_csv(filename, names=["tweet_id", "tweet_date", "user_location", "tweet"])
    df = df[~df["tweet_id"].duplicated()] #removes duplicates
    assert df.shape[1] == 4, "Max columns exceeded"

    time_splits = split_times(start, df)

    nlp = spacy.load(language)

    # The empty "frequency" column is kept as a placeholder so the merged frame
    # has the nine-column layout expected by format_raw_df.
    final_df = pd.DataFrame(columns=["entity", "frequency"])
    for split_no, time_split in enumerate(time_splits):

        string = str([tweet for tweet in time_split["tweet"]])
        string = remove_emojis(string)

        print("String length: {}.".format(len(string)))
        chunks = [string[pos:pos + chunk_size] for pos in range(0, len(string), chunk_size)]
        print("{}/{} - Splitting into {} chunks of {} size.".format(
            split_no + 1, len(time_splits), len(chunks), chunk_size))

        # One running tally per window replaces the per-chunk frames that were
        # concatenated and re-summed; the totals are the same.
        person_counts = {}
        for chunk in chunks:
            print('.', end='', flush=True)
            for ent in nlp(chunk).ents:
                if ent.label_ == "PERSON":
                    person_counts[ent.text] = person_counts.get(ent.text, 0) + 1

        # Explicit column names keep this working when no PERSON was found, and
        # a unique count column per window avoids duplicate "_x"/"_y" suffixes
        # when the same column name is merged repeatedly.
        count_col = "frequency_{}".format(split_no)
        window_counts = pd.DataFrame(list(person_counts.items()), columns=["entity", count_col])
        window_counts = window_counts.sort_values(by=count_col, ascending=False)
        final_df = pd.merge(final_df, window_counts, on="entity", how="outer")

    final_df = format_raw_df(final_df)

    return final_df

In [1]:
# Match code -> [tweet CSV file name, start timestamp handed to the processing functions]
matches = {
    code: [csv_name, start_stamp]
    for code, csv_name, start_stamp in [
        ("FRAURU", "tweets-FRA-vs-URU.csv", "2018-07-06 14:00:00"),
        ("BELBRA", "tweets-BEL-vs-BRA.csv", "2018-07-06 18:00:00"),
        ("ENGSWE", "tweets-ENG-vs-SWE.csv", "2018-07-07 14:00:00"),
        ("CRORUS", "tweets-CRO-vs-RUS.csv", "2018-07-07 18:00:00"),
        ("FRABEL", "tweets-FRA-vs-BEL.csv", "2018-07-10 18:00:00"),
        #("ENGCRO", "tweets-ENG-vs-CRO.csv", "2018-07-11 18:00:00"),
    ]
}

In [None]:
#df_entities = entities_processing(matches["ENGCRO"][0], matches["ENGCRO"][1])

In [None]:
#Functions to manually clean the aggregated data from each match.

def format_england_sweden(final_df):
    """Hand-curate the England vs Sweden entity table.

    Keeps rows whose ``total`` exceeds 300, folds rows that refer to the same
    player into one row, drops the unwanted rows and relabels the survivors.
    Every row number below is a position in the filtered, re-indexed frame and
    is hard-coded, so the function presumably only fits the dataset it was
    tuned on -- TODO confirm before reusing it on regenerated data.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Frame with a ``total`` column.  Every column after the first is summed
        when rows are merged, so the first column is assumed to be the entity
        text and the rest numeric.

    Returns
    -------
    pandas.DataFrame
        The eleven remaining rows, sorted by ``total`` descending with a fresh
        index and hand-written ``entity`` names.

    Raises
    ------
    ValueError
        If the number of remaining rows differs from the eleven hard-coded
        names.
    """
    df = final_df[final_df["total"] > 300].reset_index(drop=True)

    # (target row, rows folded into it).  Only the columns after the entity
    # text are added; the earlier code added whole rows for 26 and 22 and
    # relied on index alignment to discard the text column.
    merge_plan = [
        (1, [3, 26]),   # Maguire
        (5, [7, 22]),   # Kane
        (2, [25]),      # Pickford
        (24, [27]),     # NOTE(review): was also commented "Pickford"; likely a copy-paste, confirm the player
    ]
    for target, sources in merge_plan:
        for source in sources:
            df.iloc[target, 1:] = df.iloc[target, 1:] + df.iloc[source, 1:]

    discarded = [0,4,6,9,10,12,13,15,16,19,21,23,28,29,30,3,26,7,22,25,27]
    df = df.drop(discarded).sort_values(by="total", ascending=False).reset_index(drop=True)
    # Names are assigned in descending-total order.
    df["entity"] = ["Maguire","Kane","Pickford","Sterling","Young","Perisic","Alli","Gareth Southgate",
                "David Beckham","Lingard","Michael Owen"]
    return df

def format_croatia_russia(final_df):
    """Hand-curate the Croatia vs Russia entity table.

    Keeps rows whose ``total`` exceeds 300, removes the rows at positions 0, 1
    and 4 of the filtered frame, sorts the rest by ``total`` in descending
    order and overwrites ``entity`` with two hard-coded names.  Assigning the
    names raises ``ValueError`` unless exactly two rows remain.
    """
    above_threshold = final_df["total"] > 300
    df = final_df[above_threshold].reset_index(drop=True)

    kept = df.drop([0,1,4])
    df = kept.sort_values(by="total", ascending=False).reset_index(drop=True)

    # Names follow the descending-total order.
    df["entity"] = ["Putin", "Kramaric"]
    return df

def format_france_belgium(final_df):
    """Hand-curate the France vs Belgium entity table.

    Keeps rows whose ``total`` exceeds 300, folds rows that refer to the same
    player into one row, drops the unwanted rows and relabels the survivors.
    Every row number below is a position in the filtered, re-indexed frame and
    is hard-coded, so the function presumably only fits the dataset it was
    tuned on -- TODO confirm before reusing it on regenerated data.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Frame with a ``total`` column.  Every column after the first is summed
        when rows are merged, so the first column is assumed to be the entity
        text and the rest numeric.

    Returns
    -------
    pandas.DataFrame
        The eleven remaining rows, sorted by ``total`` descending with a fresh
        index and hand-written ``entity`` names.

    Raises
    ------
    ValueError
        If the number of remaining rows differs from the eleven hard-coded
        names.
    """
    df = final_df[final_df["total"] > 300].reset_index(drop=True)

    # (target row, rows folded into it).  Only the columns after the entity
    # text are added; the earlier code added the whole of row 20 and relied on
    # index alignment to discard the text column.
    merge_plan = [
        (0, [9, 20]),   # Mbappe
        (1, [6]),       # Hazard
        (2, [7]),       # Lloris
        (3, [12]),      # Henry
    ]
    for target, sources in merge_plan:
        for source in sources:
            df.iloc[target, 1:] = df.iloc[target, 1:] + df.iloc[source, 1:]

    discarded = [4,5,8,14,17,6,9,20,7,12]
    df = df.drop(discarded).sort_values(by="total", ascending=False).reset_index(drop=True)

    # Names are assigned in descending-total order.
    df["entity"] = ["Mbappe","Hazard","Lloris","Thierry Henry","Dembele","Roberto Martinez","Danny Murphy","Fellaini",
                    "De Bruyne","Umtiti","Griezmann"]
    return df

def format_belgium_brazil(final_df):
    """Hand-curate the Belgium vs Brazil entity table.

    Keeps rows whose ``total`` exceeds 300, splits two rows that each name two
    players, folds rows that refer to the same player into one row, drops the
    unwanted rows and relabels the survivors.  Every row number below is a
    position in the filtered, re-indexed frame and is hard-coded; the
    positions 27 and 28 used for the appended copies imply that the filtered
    frame holds 27 rows.  TODO confirm before reusing on regenerated data.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Frame with a ``total`` column.  Every column after the first is halved
        or summed, so the first column is assumed to be the entity text and
        the rest numeric.

    Returns
    -------
    pandas.DataFrame
        The eleven remaining rows, sorted by ``total`` descending with a fresh
        index and hand-written ``entity`` names.

    Raises
    ------
    ValueError
        If the number of remaining rows differs from the eleven hard-coded
        names.
    """
    df = final_df[final_df["total"] > 300].reset_index(drop=True)

    # Each shared row is duplicated at the end of the frame and both copies get
    # half of the counts.  pd.concat replaces DataFrame.append, which newer
    # pandas releases no longer provide.
    for shared_row, copy_row in [(3, 27),    # Marcelo & Willian
                                 (5, 28)]:   # Witsel & Fellaini
        df = pd.concat([df, df.iloc[[shared_row]]], ignore_index=True)
        df.iloc[shared_row, 1:] = df.iloc[shared_row, 1:] / 2
        df.iloc[copy_row, 1:] = df.iloc[copy_row, 1:] / 2

    # (target row, rows folded into it); only columns after the entity text.
    merge_plan = [
        (2, [4, 11, 24]),     # Bruyne
        (15, [28]),           # Fellaini
        (9, [16, 19, 18]),    # Augusto
        (10, [21, 22]),       # Jesus
    ]
    for target, sources in merge_plan:
        for source in sources:
            df.iloc[target, 1:] = df.iloc[target, 1:] + df.iloc[source, 1:]

    discarded = [6,7,8,12,14,17,18,20,11,24,28,4,16,19,18,21,22,23,25]
    df = df.drop(discarded).sort_values(by="total", ascending=False).reset_index(drop=True)

    # Names are assigned in descending-total order.
    df["entity"] = ["Neymar","De Bruyne","Hazard","Renato Augusto","Gabriel Jesus","Fellaini","Marcelo",
                    "Willian","Witsel","Firmino","Roberto Martinez"]
    return df

def format_france_uruguay(final_df):
    """Hand-curate the France vs Uruguay entity table.

    Keeps rows whose ``total`` exceeds 300, folds rows that refer to the same
    player into one row, drops the unwanted rows, sorts by ``total`` in
    descending order and overwrites ``entity`` with eleven hard-coded names
    (``ValueError`` if the number of remaining rows differs).  Every row
    number is a hard-coded position in the filtered, re-indexed frame.
    """
    df = final_df[final_df["total"] > 300].reset_index(drop=True)

    # (target row, rows folded into it); only columns after the first are added.
    merge_plan = [
        (4, [9]),             # Suarez
        (5, [7]),             # Lloris
        (8, [15, 16, 18]),    # Varane
        # NOTE(review): row 1 is in the drop list below, so this merged Mbappe
        # row never reaches the output -- confirm the intended target row.
        (1, [17]),            # Mbappe
        (14, [19]),           # Griezzman
        (11, [21]),           # Muslera
    ]
    for target, sources in merge_plan:
        for source in sources:
            df.iloc[target, 1:] = df.iloc[target, 1:] + df.iloc[source, 1:]

    discarded = [1, 10, 13, 9, 7, 15, 16, 18, 17, 19, 21]
    df = df.drop(discarded).sort_values(by="total", ascending=False).reset_index(drop=True)

    # Names are assigned in descending-total order.
    df["entity"] = ["Mbappe","Neymar","Varane","Suarez","Lloris","Karius","Cavani","Muslera",
                    "Griezzman","De Gea","Pele"]
    return df

In [None]:
#df.to_csv("ENG-vs-SWE.csv")
#df.to_csv("BEL-vs-BRA.csv")
#df.to_csv("FRA-vs-BEL.csv")
#df.to_csv("CRO-vs-RUS.csv")
#df.to_csv("FRA-vs-URU.csv")

In [4]:
# Load the curated per-match entity tables; the first CSV column becomes the index.
quarterfinals = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("FRA-vs-URU.csv", "BEL-vs-BRA.csv", "ENG-vs-SWE.csv", "CRO-vs-RUS.csv")
]
semifinals = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("FRA-vs-BEL.csv",)
]

In [5]:
@interact(minute_split = IntSlider(min=0,max=7,step=1,value=0), match=["FRA vs URU", "BEL vs BRA", "ENG vs SWE", "CRO vs RUS"])
def display_entities(minute_split, match="FRA vs URU"):
    """Draw a horizontal bar chart of entity mentions for one quarterfinal.

    ``minute_split`` selects the column to plot: 0 is ``total``, 1-7 are the
    time-window columns in order.  ``match`` selects the table from
    ``quarterfinals``.  Rows are drawn top to bottom in table order.
    """
    column_for_step = dict(enumerate(["total", "15", "30", "45", "break", "60", "75", "90"]))
    title_for_step = dict(enumerate([
        "Total Number of Mentions",
        "Mentions at 0-15 mins",
        "Mentions at 15-30 mins",
        "Mentions at 30-45 mins",
        "Mentions during break",
        "Mentions at 45-60 mins",
        "Mentions at 60-75 mins",
        "Mentions at 75-90 mins",
    ]))
    table_for_match = {
        "FRA vs URU": quarterfinals[0],
        "BEL vs BRA": quarterfinals[1],
        "ENG vs SWE": quarterfinals[2],
        "CRO vs RUS": quarterfinals[3],
    }

    table = table_for_match[match]
    mentions = table[column_for_step[minute_split]]
    # Reversed positions put the first table row at the top of the chart.
    y_pos = np.arange(mentions.shape[0])[::-1]
    labels = table["entity"]

    plt.figure(figsize=(8,6))
    plt.barh(y_pos, mentions)
    plt.yticks(y_pos, labels)
    plt.xlim(0,4000)
    plt.title("Quarterfinals - " + title_for_step[minute_split])
    plt.show()
    
    
@interact(minute_split = IntSlider(min=0,max=7,step=1,value=0), match=["FRA vs BEL"])
def display_entities(minute_split, match="FRA vs BEL"):
    """Draw a horizontal bar chart of entity mentions for the semifinal.

    ``minute_split`` selects the column to plot: 0 is ``total``, 1-7 are the
    time-window columns in order.  ``match`` selects the table from
    ``semifinals``.  Rows are drawn top to bottom in table order.
    """
    column_for_step = dict(enumerate(["total", "15", "30", "45", "break", "60", "75", "90"]))
    title_for_step = dict(enumerate([
        "Total Number of Mentions",
        "Mentions at 0-15 mins",
        "Mentions at 15-30 mins",
        "Mentions at 30-45 mins",
        "Mentions during break",
        "Mentions at 45-60 mins",
        "Mentions at 60-75 mins",
        "Mentions at 75-90 mins",
    ]))
    table_for_match = {"FRA vs BEL": semifinals[0]}

    table = table_for_match[match]
    mentions = table[column_for_step[minute_split]]
    # Reversed positions put the first table row at the top of the chart.
    y_pos = np.arange(mentions.shape[0])[::-1]
    labels = table["entity"]

    plt.figure(figsize=(8,6))
    plt.barh(y_pos, mentions)
    plt.yticks(y_pos, labels)
    plt.xlim(0,6500)
    plt.title("Semifinals - " + title_for_step[minute_split])
    plt.show()

In [6]:
def sentiment(names ,start_times):
    """Compute the mean TextBlob polarity per tweet timestamp for each file.

    For every (file, start time) pair the tweets dated strictly between the
    start time and two hours later are scored with
    ``TextBlob(...).sentiment.polarity`` after emoji removal, then averaged
    per distinct ``tweet_date`` value.

    Parameters
    ----------
    names : iterable of str
        Header-less CSV files with the columns tweet_id, tweet_date,
        user_location, tweet (in that order).
    start_times : iterable of str
        One "%Y-%m-%d %H:%M:%S" start time per file.

    Returns
    -------
    dict
        File name -> list of mean polarities, one per distinct ``tweet_date``
        in ascending order of that value (presumably one entry per second
        with at least one tweet -- TODO confirm the timestamp resolution).

    Notes
    -----
    Rows with a missing tweet text are skipped, since the emoji filter only
    accepts strings.  NOTE(review): unlike ``entities_processing`` this does
    not remove duplicated ``tweet_id`` rows -- confirm whether that is wanted.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    final_data = {}

    for name, start_time in zip(names, start_times):
        print('.', end='', flush=True)
        stop = datetime.datetime.strptime(start_time, stamp_format) + datetime.timedelta(hours=2)
        stop_time = stop.strftime(stamp_format)

        df = pd.read_csv(name, names=["tweet_id", "tweet_date", "user_location", "tweet"])
        in_window = (df["tweet_date"] > start_time) & (df["tweet_date"] < stop_time)
        # Work on an explicit copy so adding a column never writes into a view
        # of the file's frame.
        tweets = df.loc[in_window, ["tweet_date", "tweet"]].dropna(subset=["tweet"]).copy()

        # A fixed column name avoids clashing with a real column when the file
        # name happens to equal one; float dtype keeps an empty window valid.
        tweets["polarity"] = pd.Series(
            [TextBlob(remove_emojis(tweet)).sentiment.polarity for tweet in tweets["tweet"]],
            index=tweets.index,
            dtype="float64",
        )

        final_data[name] = tweets.groupby("tweet_date")["polarity"].mean().tolist()
    return final_data

In [7]:
# Separate the [file name, start time] pairs of `matches` into two parallel lists.
names_matches = [entry[0] for entry in matches.values()]
start_matches = [entry[1] for entry in matches.values()]

# One polarity series per tweet file, keyed by file name.
sentiment_data = sentiment(names_matches, start_matches)
names_sentiments = list(sentiment_data)

.....

In [8]:
@interact(match=names_sentiments[:-1])
def display_entities(match=names_sentiments[0]):
    """Plot the polarity series of the selected tweet file.

    The dropdown offers every key of ``sentiment_data`` except the last one;
    the y-axis is fixed to the range -1 to 1.
    """
    plt.figure(figsize=(12, 6))
    plt.plot(sentiment_data[match])
    plt.ylim(-1, 1)
    plt.show()
    
    
# The option is wrapped in a list so interact builds a one-entry dropdown, as
# the semifinal bar chart does; a bare string would create a free-text box
# whose edited contents are then used as a dictionary key.
@interact(match=[names_sentiments[-1]])
def display_entities(match=names_sentiments[0]):
    """Plot the polarity series of the last tweet file in ``sentiment_data``.

    The y-axis is fixed to the range -1 to 1.
    """
    plt.figure(figsize=(12,6))
    plt.plot(sentiment_data[match])
    plt.ylim(-1,1)
    plt.show()