# Importing the libraries

In [53]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.translate import bleu
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Import the dataset

In [54]:
comm_df = pd.read_csv('../../data/commentory_matchid.csv')
comm_df = comm_df[comm_df['match_id'] != 95]
comm_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc,home_team,home_team_abbr,away_team,away_team_abbr,full_time_score,match,date,link,match_id
0,,thanks for joining our commentary this evenin...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
1,,barcelona are next in action at home in lalig...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
2,,bayern munich have eased past barcelona in th...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
3,90 + 2,full-time: barcelona 0-3 bayern munich,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
4,90,there will be two minutes of added time.,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0


# Use sliding window technique with 15 mins buffer for the match live ticker

In [55]:
def window_df(df, start_timer, end_timer):
    # Convert time column to str
    df['time'] = df['time'].astype(str)
    comments = []
    for i in range(df.shape[0]):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [56]:
def create_window(df):
    all_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        match_df.reset_index(inplace = True, drop = True)

        # Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
        comm_15 = window_df(match_df, 0, 16)
        comm_30 = window_df(match_df, 16, 31)
        comm_45 = window_df(match_df, 31, 46)
        comm_60 = window_df(match_df, 46, 61)
        comm_75 = window_df(match_df, 61, 76)
        comm_90 = window_df(match_df, 76, 91)

        # Append the respective live tickers to a list
        all_comm.append([comm_15, comm_30, comm_45, comm_60, comm_75, comm_90])
    
    return all_comm

In [57]:
commentaries = create_window(comm_df)

In [58]:
# Will be used later for summaries
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

# Extractive text summarization

## Using SpaCy to summarize text

In [59]:
nlp = spacy.load("en_core_web_sm")

def get_spacy_summary(text):
    doc = nlp(text)

    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    for token in doc:
        if (token.text in stopwords or token.text in punctuation):
            continue
        if token.pos_ in pos_tag:
            keyword.append(token.text)

    freq_word = Counter(keyword)
    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = (freq_word[word]/max_freq)

    sent_strenght = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strenght.keys():
                    sent_strenght[sent] += freq_word[word.text]
                else:
                    sent_strenght[sent] = freq_word[word.text]

    summarized_sentences = nlargest(3, sent_strenght, key=sent_strenght.get)
    final_sentences = [w.text for w in summarized_sentences]
    return " ".join(final_sentences)

In [60]:
def get_all_comm_spacy(comms):
    all_comm_spacy = []
    for commentary in comms:
        spacy_window_comm = []
        for comment in commentary:
            spacy_window_comm.append(get_spacy_summary(comment))
        all_comm_spacy.append(" ".join(spacy_window_comm))
    return all_comm_spacy

In [61]:
spacy_comm = get_all_comm_spacy(commentaries)

## Using NLTK for text summarization

In [62]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [63]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [64]:
def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

In [65]:
def get_all_comm_nltk(comms):
    all_comm_nltk = []
    for commentary in comms:
        nltk_window_comm = []
        for comment in commentary:
            nltk_window_comm.append(summarize_text_nltk(comment))
        all_comm_nltk.append(" ".join(nltk_window_comm))
    return all_comm_nltk

In [66]:
nltk_comm = get_all_comm_nltk(commentaries)

# Get full time summary from comment description

In [67]:
def get_full_match_summ(df):
    all_ft_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        all_ft_comm.append(" ".join(match_df[match_df['comment_desc'] == 'full time summary']['comment']))
    
    return all_ft_comm

In [68]:
all_ft_comm = get_full_match_summ(comm_df)

# Using ROGUE Score as evaluation metric

In [69]:
# !pip install rouge

# Evaluating SpaCy summaries

In [70]:
def spacy_rouge_score(spacy_summ, ft_summ):
    rouge = Rouge()
    all_rouge_spacy = []
    for i in range(len(spacy_summ)):
        all_rouge_spacy.append(rouge.get_scores(ft_summ[i], spacy_summ[i]))
    
    rogue1_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_spacy:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_spacy = rogue1_spacy.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 'spacy'}, ignore_index=True)
        rogue2_spacy = rogue2_spacy.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 'spacy'}, ignore_index=True)
        rogue_lcs_spacy = rogue_lcs_spacy.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 'spacy'}, ignore_index=True)

    rogue1_spacy = rogue1_spacy.sort_values(by='f1', ascending = False)
    rogue2_spacy = rogue2_spacy.sort_values(by='f1', ascending = False)
    rogue_lcs_spacy = rogue_lcs_spacy.sort_values(by='f1', ascending = False)

    return rogue1_spacy, rogue2_spacy, rogue_lcs_spacy


In [83]:
rogue1_spacy, rogue2_spacy, rogue_lcs_spacy = spacy_rouge_score(spacy_comm, all_ft_comm)
rogue1_spacy

Unnamed: 0,recall,precision,f1,module
96,0.257463,0.46,0.330144,spacy
62,0.244186,0.480916,0.323907,spacy
63,0.233962,0.504065,0.319588,spacy
42,0.235556,0.477477,0.315476,spacy
72,0.236641,0.442857,0.308458,spacy
...,...,...,...,...
15,0.117647,0.229167,0.155477,spacy
99,0.110577,0.261364,0.155405,spacy
95,0.104803,0.263736,0.15,spacy
77,0.094203,0.285714,0.141689,spacy


In [72]:
rogue2_spacy

Unnamed: 0,recall,precision,f1,module
63,0.062361,0.163743,0.090323,spacy
5,0.058172,0.134615,0.081238,spacy
96,0.058952,0.118421,0.078717,spacy
91,0.048387,0.161074,0.074419,spacy
74,0.052632,0.117284,0.072658,spacy
...,...,...,...,...
82,0.01005,0.038095,0.015905,spacy
22,0.009294,0.04902,0.015625,spacy
21,0.008834,0.04,0.014472,spacy
77,0.008032,0.034188,0.013008,spacy


In [73]:
rogue_lcs_spacy

Unnamed: 0,recall,precision,f1,module
62,0.217054,0.427481,0.287918,spacy
96,0.223881,0.4,0.287081,spacy
72,0.217557,0.407143,0.283582,spacy
39,0.215596,0.394958,0.278932,spacy
63,0.2,0.430894,0.273196,spacy
...,...,...,...,...
15,0.096257,0.1875,0.127208,spacy
26,0.083969,0.258824,0.126801,spacy
37,0.088435,0.220339,0.126214,spacy
95,0.087336,0.21978,0.125,spacy


# Evaluating NLTK summaries

In [74]:
def nltk_rouge_score(nltk_summ, ft_summ):
    rouge = Rouge()
    all_rouge_nltk = []
    for i in range(len(nltk_summ)):
        all_rouge_nltk.append(rouge.get_scores(ft_summ[i], nltk_summ[i]))
    
    rogue1_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_nltk:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_nltk = rogue1_nltk.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue2_nltk = rogue2_nltk.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue_lcs_nltk = rogue_lcs_nltk.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 'nltk'}, ignore_index=True)

    rogue1_nltk = rogue1_nltk.sort_values(by='f1', ascending = False)
    rogue2_nltk = rogue2_nltk.sort_values(by='f1', ascending = False)
    rogue_lcs_nltk = rogue_lcs_nltk.sort_values(by='f1', ascending = False)

    return rogue1_nltk, rogue2_nltk, rogue_lcs_nltk


In [75]:
rogue1_nltk, rogue2_nltk, rogue_lcs_nltk = nltk_rouge_score(nltk_comm, all_ft_comm)
rogue1_nltk

Unnamed: 0,recall,precision,f1,module
63,0.280374,0.487805,0.356083,nltk
52,0.287879,0.422222,0.342342,nltk
80,0.252525,0.458716,0.325733,nltk
83,0.267016,0.395349,0.31875,nltk
5,0.253886,0.422414,0.317152,nltk
...,...,...,...,...
31,0.122807,0.256098,0.166008,nltk
21,0.113281,0.29,0.162921,nltk
123,0.116183,0.217054,0.151351,nltk
15,0.117284,0.197917,0.147287,nltk


In [76]:
rogue2_nltk

Unnamed: 0,recall,precision,f1,module
5,0.075163,0.147436,0.099567,nltk
68,0.058309,0.142857,0.082816,nltk
76,0.060519,0.111702,0.078505,nltk
52,0.062857,0.103286,0.078153,nltk
74,0.059649,0.104938,0.076063,nltk
...,...,...,...,...
104,0.009926,0.02381,0.014011,nltk
57,0.010563,0.020408,0.013921,nltk
55,0.010899,0.018433,0.013699,nltk
77,0.008798,0.025641,0.0131,nltk


In [77]:
rogue_lcs_nltk

Unnamed: 0,recall,precision,f1,module
63,0.247664,0.430894,0.31454,nltk
5,0.233161,0.387931,0.291262,nltk
20,0.237374,0.370079,0.289231,nltk
52,0.237374,0.348148,0.282282,nltk
80,0.212121,0.385321,0.273616,nltk
...,...,...,...,...
26,0.100478,0.247059,0.142857,nltk
107,0.101322,0.20354,0.135294,nltk
123,0.103734,0.193798,0.135135,nltk
15,0.098765,0.166667,0.124031,nltk


# Using cosine-similarity for comparison

In [78]:
# Using CountVectorizer()
def calc_cos_sim_count_vec(ft_summary, my_summary):
    corpus = [ft_summary, my_summary]
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)

    # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                    columns=count_vectorizer.vocabulary_.keys(), 
                    index=['ft_summary_org','ft_summary_crtd'])
    
    # Compute Cosine Similarity
    return cosine_similarity(df[0:1], df)

In [79]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [80]:
all_cos_sim_spacy = []
for i in range(len(spacy_comm)):
  all_cos_sim_spacy.append(calc_cos_sim_count_vec(all_ft_comm[i], spacy_comm[i])[0][1])

sorted(all_cos_sim_spacy, reverse=True)

[0.8232871328363739,
 0.8174248588298362,
 0.8145149399112754,
 0.7816383863403364,
 0.7707412659871002,
 0.7694462678570619,
 0.7659244889204044,
 0.7649044889725534,
 0.7634486278999365,
 0.7582895374148945,
 0.7518359657137371,
 0.7472140658882348,
 0.7429968507127381,
 0.7387260114298334,
 0.7347287011121258,
 0.7334503928271484,
 0.7310951615491647,
 0.7299371348265824,
 0.7296285367600028,
 0.7280359685613873,
 0.7261173975565213,
 0.7229427592932066,
 0.7208075167278749,
 0.7200696963190241,
 0.7195155789335859,
 0.7192314578474367,
 0.7184730379535103,
 0.7176907877381489,
 0.7174915295770368,
 0.7168908512875278,
 0.7164631047340766,
 0.7130990002287594,
 0.708139866273981,
 0.7077639126053901,
 0.7061962925226118,
 0.7060672642464483,
 0.7041160601434504,
 0.7036982541979113,
 0.7008160444039738,
 0.6982421515857751,
 0.6980133197158735,
 0.6949830538115703,
 0.6935688420806261,
 0.6921769122637016,
 0.6906989123143691,
 0.6892093880551476,
 0.6889921695571921,
 0.68834054212

In [81]:
all_cos_sim_nltk = []
for i in range(len(nltk_comm)):
  all_cos_sim_nltk.append(calc_cos_sim_count_vec(all_ft_comm[i], nltk_comm[i])[0][1])

sorted(all_cos_sim_nltk, reverse=True)

[0.7953239957993086,
 0.784270383288959,
 0.7728546233270182,
 0.7692639306166029,
 0.7603813000949753,
 0.7593104742149936,
 0.7581093603642679,
 0.7537184963392365,
 0.749825777576924,
 0.7475982045617435,
 0.7405160279629593,
 0.7391730797696568,
 0.7327699869772225,
 0.7243865317669208,
 0.7227952225480624,
 0.7211585276842527,
 0.7210127545462429,
 0.7193756582020947,
 0.7188542025873198,
 0.7182557335991091,
 0.7175172291598655,
 0.705042436618654,
 0.704912898855007,
 0.7020957136137166,
 0.7000223391806029,
 0.699979695800079,
 0.6937673595094522,
 0.6910079483143473,
 0.6902610403846762,
 0.6843671563031595,
 0.6830959423188732,
 0.6827604968028924,
 0.6824310730523295,
 0.6812806487187413,
 0.6792623250621805,
 0.6737007375350063,
 0.6720868312538996,
 0.6710502891129948,
 0.6705873056132569,
 0.6701009371571429,
 0.6671667649705758,
 0.6663370783926228,
 0.6653998750443286,
 0.6650983085792687,
 0.6650702246545304,
 0.6640285896984353,
 0.6625617768310097,
 0.662243488322648

In [84]:
spacy_comm[0]

'lewandowski combines with goretzka on the edge of the box, but the midfielder strikes his effort straight at araujo, and the ball deflects into the hands of ter stegen.    sergi roberto tries to break down the right flank to latch on to a long ball forward from alba, but pressure from davies forces the goal-kick.    garcia catches lewandowski late on the edge of the box and gives away a free-kick in a dangerous position.    goretzka wins the ball in the middle of the park and offloads towards sane, but his pass to muller in the left-inside channel is poor and cut out by araujo.    goretzka moves forward from the middle of the park and tries to tease a ball into the path of muller, but pique makes the interception.    the forward opens up space for a strike in the right-inside channel, but pique slides in with a low block before the home side scramble the loose ball away from danger.    sane completes a short offload to muller in space on the edge of the box and the forward does the re