# Importing the libraries

In [2]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Import the dataset

In [3]:
comm_df = pd.read_csv('../../data/commentory_matchid.csv')
comm_df = comm_df[comm_df['match_id'] != 95]
comm_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc,home_team,home_team_abbr,away_team,away_team_abbr,full_time_score,match,date,link,match_id
0,,thanks for joining our commentary this evenin...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
1,,barcelona are next in action at home in lalig...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
2,,bayern munich have eased past barcelona in th...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
3,90 + 2,full-time: barcelona 0-3 bayern munich,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
4,90,there will be two minutes of added time.,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0


# Use sliding window technique with 15 mins buffer for the match live ticker

In [4]:
def window_df(df, start_timer, end_timer):
    # Convert time column to str
    df['time'] = df['time'].astype(str)
    comments = []
    for i in range(df.shape[0]):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [5]:
def create_window(df):
    all_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        match_df.reset_index(inplace = True, drop = True)

        # Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
        comm_15 = window_df(match_df, 0, 16)
        comm_30 = window_df(match_df, 16, 31)
        comm_45 = window_df(match_df, 31, 46)
        comm_60 = window_df(match_df, 46, 61)
        comm_75 = window_df(match_df, 61, 76)
        comm_90 = window_df(match_df, 76, 91)

        # Append the respective live tickers to a list
        all_comm.append([comm_15, comm_30, comm_45, comm_60, comm_75, comm_90])
    
    return all_comm

In [6]:
commentaries = create_window(comm_df)

In [7]:
# Will be used later for summaries
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

# Extractive text summarization

## Using SpaCy to summarize text

In [31]:
nlp = spacy.load("en_core_web_sm")

def get_spacy_summary(text):
    doc = nlp(text)

    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    for token in doc:
        if (token.text in stopwords or token.text in punctuation):
            continue
        if token.pos_ in pos_tag:
            keyword.append(token.text)

    freq_word = Counter(keyword)
    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = (freq_word[word]/max_freq)

    sent_strenght = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strenght.keys():
                    sent_strenght[sent] += freq_word[word.text]
                else:
                    sent_strenght[sent] = freq_word[word.text]

    summarized_sentences = nlargest(3, sent_strenght, key=sent_strenght.get)
    final_sentences = [w.text for w in summarized_sentences]
    return " ".join(final_sentences)

In [75]:
def get_all_comm_spacy(comms):
    all_comm_spacy = []
    for commentary in comms:
        spacy_window_comm = []
        for comment in commentary:
            spacy_window_comm.append(get_spacy_summary(comment))
        all_comm_spacy.append(spacy_window_comm)
        # all_comm_spacy.append(" ".join(spacy_window_comm))
    return all_comm_spacy

In [76]:
spacy_comm = get_all_comm_spacy(commentaries)
spacy_comm_joined = []
for comms in spacy_comm:
    spacy_comm_joined.append(" ".join(comms))

## Using NLTK for text summarization

In [87]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [88]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [89]:
def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

In [None]:
def get_all_comm_spacy(comms):
    all_comm_spacy = []
    for commentary in comms:
        spacy_window_comm = []
        for comment in commentary:
            spacy_window_comm.append(get_spacy_summary(comment))
        all_comm_spacy.append(spacy_window_comm)
        # all_comm_spacy.append(" ".join(spacy_window_comm))
    return all_comm_spacy

In [97]:
def get_all_comm_nltk(comms):
    all_comm_nltk = []
    for commentary in comms:
        nltk_window_comm = []
        for comment in commentary:
            nltk_window_comm.append(summarize_text_nltk(comment))
        all_comm_nltk.append(nltk_window_comm)
    return all_comm_nltk

In [98]:
nltk_comm = get_all_comm_nltk(commentaries)
nltk_comm_joined = []
for comms in nltk_comm:
    nltk_comm_joined.append(" ".join(comms))

In [99]:
nltk_comm_joined[0]

'lewandowski combines with goretzka on the edge of the box, but the midfielder strikes his effort straight at araujo, and the ball deflects into the hands of ter stegen. muller flicks a ball forward into the barcelona box for lewandowski. sergi roberto tries to break down the right flank to latch on to a long ball forward from alba, but pressure from davies forces the goal-kick.  alba plays depay into space down the left flank and he sends a cutback into the middle that upamecano has to clear ahead of luuk de jong. goretzka moves forward from the middle of the park and tries to tease a ball into the path of muller, but pique makes the interception. musiala makes a neat turn in the final third and tries to thread an offload through to lewandowski, but his pass leads the pole away from goal. sane completes a short offload to muller in space on the edge of the box and the forward does the rest as his deflected strike beats ter stegen. the visitors take the lead through muller as he lines 

# Get full time summary from comment description

In [16]:
def get_full_match_summ(df):
    all_ft_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        all_ft_comm.append(" ".join(match_df[match_df['comment_desc'] == 'full time summary']['comment']))
    
    return all_ft_comm

In [17]:
all_ft_comm = get_full_match_summ(comm_df)

# Using ROGUE Score as evaluation metric

In [18]:
# !pip install rouge

# Evaluating SpaCy summaries

In [159]:
def spacy_rouge_score(spacy_summ, ft_summ):
    rouge = Rouge()
    all_rouge_spacy = []
    for i in range(len(spacy_summ)):
        all_rouge_spacy.append(rouge.get_scores(ft_summ[i], spacy_summ[i]))
    
    rogue1_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_spacy = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_spacy:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_spacy = rogue1_spacy.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 'spacy'}, ignore_index=True)
        rogue2_spacy = rogue2_spacy.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 'spacy'}, ignore_index=True)
        rogue_lcs_spacy = rogue_lcs_spacy.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 'spacy'}, ignore_index=True)

    # rogue1_spacy = rogue1_spacy.sort_values(by='precision', ascending = False)
    # rogue2_spacy = rogue2_spacy.sort_values(by='precision', ascending = False)
    # rogue_lcs_spacy = rogue_lcs_spacy.sort_values(by='precision', ascending = False)

    return rogue1_spacy, rogue2_spacy, rogue_lcs_spacy


In [160]:
rogue1_spacy, rogue2_spacy, rogue_lcs_spacy = spacy_rouge_score(spacy_comm_joined, all_ft_comm)
rogue1_spacy.to_csv('rogue1_spacy.csv')

In [161]:
rogue1_spacy

Unnamed: 0,recall,precision,f1,module
0,0.171717,0.34,0.228188,spacy
1,0.182131,0.53,0.2711,spacy
2,0.159574,0.267857,0.2,spacy
3,0.203636,0.358974,0.259861,spacy
4,0.164751,0.349593,0.223958,spacy
...,...,...,...,...
119,0.242268,0.367188,0.291925,spacy
120,0.149123,0.32381,0.204204,spacy
121,0.151261,0.324324,0.206304,spacy
122,0.211454,0.328767,0.257373,spacy


In [162]:
rogue2_spacy

Unnamed: 0,recall,precision,f1,module
0,0.032345,0.097561,0.048583,spacy
1,0.029466,0.114286,0.046852,spacy
2,0.019048,0.039216,0.025641,spacy
3,0.042644,0.089686,0.057803,spacy
4,0.030093,0.069892,0.042071,spacy
...,...,...,...,...
119,0.053672,0.106145,0.071295,spacy
120,0.02381,0.072993,0.035907,spacy
121,0.037838,0.098592,0.054687,spacy
122,0.041958,0.082949,0.055728,spacy


In [163]:
rogue2_spacy.to_csv('rogue2_spacy.csv', index=False)

In [164]:
rogue_lcs_spacy

Unnamed: 0,recall,precision,f1,module
0,0.161616,0.32,0.214765,spacy
1,0.151203,0.44,0.225064,spacy
2,0.138298,0.232143,0.173333,spacy
3,0.181818,0.320513,0.232019,spacy
4,0.149425,0.317073,0.203125,spacy
...,...,...,...,...
119,0.221649,0.335938,0.267081,spacy
120,0.122807,0.266667,0.168168,spacy
121,0.138655,0.297297,0.189112,spacy
122,0.189427,0.294521,0.230563,spacy


# Evaluating NLTK summaries

In [166]:
def nltk_rouge_score(nltk_summ, ft_summ):
    rouge = Rouge()
    all_rouge_nltk = []
    for i in range(len(nltk_summ)):
        all_rouge_nltk.append(rouge.get_scores(ft_summ[i], nltk_summ[i]))
    
    rogue1_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_nltk = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_nltk:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_nltk = rogue1_nltk.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue2_nltk = rogue2_nltk.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue_lcs_nltk = rogue_lcs_nltk.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 'nltk'}, ignore_index=True)

    # rogue1_nltk = rogue1_nltk.sort_values(by='precision', ascending = False)
    # rogue2_nltk = rogue2_nltk.sort_values(by='precision', ascending = False)
    # rogue_lcs_nltk = rogue_lcs_nltk.sort_values(by='precision', ascending = False)

    return rogue1_nltk, rogue2_nltk, rogue_lcs_nltk


In [167]:
rogue1_nltk, rogue2_nltk, rogue_lcs_nltk = nltk_rouge_score(nltk_comm_joined, all_ft_comm)
rogue1_nltk

Unnamed: 0,recall,precision,f1,module
0,0.184783,0.34,0.239437,nltk
1,0.188406,0.39,0.254072,nltk
2,0.190476,0.25,0.216216,nltk
3,0.225962,0.301282,0.258242,nltk
4,0.182692,0.308943,0.229607,nltk
...,...,...,...,...
119,0.26875,0.335938,0.298611,nltk
120,0.169231,0.314286,0.22,nltk
121,0.152381,0.288288,0.199377,nltk
122,0.238095,0.273973,0.254777,nltk


In [168]:
rogue1_nltk.to_csv('rogue1_nltk.csv', index=False)

In [169]:
rogue2_nltk

Unnamed: 0,recall,precision,f1,module
0,0.029155,0.081301,0.042918,nltk
1,0.03207,0.078571,0.045549,nltk
2,0.015444,0.026144,0.019417,nltk
3,0.036585,0.053812,0.043557,nltk
4,0.033333,0.05914,0.042636,nltk
...,...,...,...,...
119,0.05102,0.083799,0.063425,nltk
120,0.014837,0.036496,0.021097,nltk
121,0.034161,0.077465,0.047414,nltk
122,0.047468,0.069124,0.056285,nltk


In [170]:
rogue2_nltk.to_csv('rogue2_nltk.csv', index=False)

In [171]:
rogue_lcs_nltk

Unnamed: 0,recall,precision,f1,module
0,0.179348,0.33,0.232394,nltk
1,0.154589,0.32,0.208469,nltk
2,0.190476,0.25,0.216216,nltk
3,0.201923,0.269231,0.230769,nltk
4,0.163462,0.276423,0.205438,nltk
...,...,...,...,...
119,0.23125,0.289062,0.256944,nltk
120,0.117949,0.219048,0.153333,nltk
121,0.142857,0.27027,0.186916,nltk
122,0.214286,0.246575,0.229299,nltk


In [172]:
rogue_lcs_nltk.to_csv('rogue_lcs_nltk.csv', index=False)

# Using cosine-similarity for comparison

In [None]:
# # Using CountVectorizer()
# def calc_cos_sim_count_vec(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     # Create the Document Term Matrix
#     count_vectorizer = CountVectorizer()
#     sparse_matrix = count_vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = sparse_matrix.todense()
#     df = pd.DataFrame(doc_term_matrix, 
#                     columns=count_vectorizer.vocabulary_.keys(), 
#                     index=['ft_summary_org','ft_summary_crtd'])
    
#     # Compute Cosine Similarity
#     return cosine_similarity(df[0:1], df)

In [None]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [None]:
# all_cos_sim_spacy = []
# for i in range(len(spacy_comm)):
#   all_cos_sim_spacy.append(calc_cos_sim_count_vec(all_ft_comm[i], spacy_comm[i])[0][1])

# sorted(all_cos_sim_spacy, reverse=True)

In [None]:
# all_cos_sim_nltk = []
# for i in range(len(nltk_comm)):
#   all_cos_sim_nltk.append(calc_cos_sim_count_vec(all_ft_comm[i], nltk_comm[i])[0][1])

# sorted(all_cos_sim_nltk, reverse=True)

# Abstractive text summarization

# GPT-3

In [127]:
def summarize_text_gpt(corpus, org_key, api_key):
# def summarize_text_gpt(input_sequence, output_sequence,org_key, api_key):
    openai.organization = org_key
    openai.api_key = api_key
    engine_list = openai.Engine.list() # calling the engines available from the openai api 

    response = openai.Completion.create(engine="davinci",prompt=corpus,temperature=0.7,
            max_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
    # response = openai.Model("davinci").train(input_sequence = input_sequence,
    #                                             output_sequence = output_sequence,
    #                                             model = "davinci",
    #                                             temperature=0.7,
    #                                             max_tokens=2048,
    #                                             batch_size = 1,
    #                                             epochs = 10
        # )
    return response["choices"][0]["text"]
    # return response

In [128]:
def get_all_comm_gpt(comms, org_key, api_key):
    all_comm_gpt = []
    for commentary in comms:
        gpt_window_comm = []
        for comment in commentary:
            gpt_window_comm.append(summarize_text_gpt(comment, org_key, api_key))
        all_comm_gpt.append(" ".join(gpt_window_comm))
    return all_comm_gpt

In [130]:
# reading keys from file

api = pd.read_csv('../../../OpenAI_rs.txt')

api_key = api["Key"][0]
org_key = api["Key"][1]
gpt_comm = get_all_comm_gpt(spacy_comm_joined[:1], org_key, api_key)

KeyboardInterrupt: 

In [None]:
gpt_comm

# Evaluating GPT-3 summaries

In [None]:
def gpt_rouge_score(gpt_summ, ft_summ):
    rouge = Rouge()
    all_rouge_gpt = []
    for i in range(len(gpt_summ)):
        all_rouge_gpt.append(rouge.get_scores(ft_summ[i], gpt_summ[i]))
    
    rogue1_gpt = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_gpt = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_gpt = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_gpt:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_gpt = rogue1_gpt.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue2_gpt = rogue2_gpt.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 'nltk'}, ignore_index=True)
        rogue_lcs_gpt = rogue_lcs_gpt.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 'nltk'}, ignore_index=True)

    rogue1_gpt = rogue1_gpt.sort_values(by='f1', ascending = False)
    rogue2_gpt = rogue2_gpt.sort_values(by='f1', ascending = False)
    rogue_lcs_gpt = rogue_lcs_gpt.sort_values(by='f1', ascending = False)

    return rogue1_gpt, rogue2_gpt, rogue_lcs_gpt


In [None]:
rogue1_gpt, rogue2_gpt, rogue_lcs_gpt = gpt_rouge_score(gpt_comm[:2], all_ft_comm[:2])
rogue1_gpt

In [None]:
rogue2_gpt

In [None]:
rogue_lcs_gpt

# T5

In [35]:
!pip install transformers==2.8.0
!pip install torch==1.4.0

Collecting transformers==2.8.0
  Using cached transformers-2.8.0-py3-none-any.whl (563 kB)
Collecting boto3
  Downloading boto3-1.26.126-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting tokenizers==0.5.2
  Using cached tokenizers-0.5.2.tar.gz (64 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting s3transfer<0.7.0,>=0.6.0
  Using cached s3transfer-0.6.0-py3-none-any.whl (79 kB)
Collecting jmespath<2.0.0,>=0.7.1
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.126
  Downloading botocore-1.29.126-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Building wheels for collected packages: tok

In [58]:
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

def summarize_text_t5(text):
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')


    preprocess_text = text.strip().replace("\n","")
    t5_prepared_Text = "summarize: "+preprocess_text
    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)

    # summmarize 
    summary_ids = model.generate(tokenized_text,
                                        num_beams=4,
                                        no_repeat_ngram_size=2,
                                        min_length=30,
                                        max_length=100,
                                        early_stopping=True)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [59]:
def get_all_comm_t5(comms):
    all_comm_t5 = []
    for commentary in comms:
        t5_window_comm = []
        for comment in commentary:
            summary = summarize_text_t5(comment)
            t5_window_comm.append(summary)
        all_comm_t5.append(" ".join(t5_window_comm))
    return all_comm_t5

In [105]:
len(spacy_comm)

124

In [106]:
spacy_comm[0]

['lewandowski combines with goretzka on the edge of the box, but the midfielder strikes his effort straight at araujo, and the ball deflects into the hands of ter stegen.    sergi roberto tries to break down the right flank to latch on to a long ball forward from alba, but pressure from davies forces the goal-kick.    garcia catches lewandowski late on the edge of the box and gives away a free-kick in a dangerous position.   ',
 'goretzka wins the ball in the middle of the park and offloads towards sane, but his pass to muller in the left-inside channel is poor and cut out by araujo.    goretzka moves forward from the middle of the park and tries to tease a ball into the path of muller, but pique makes the interception.    the forward opens up space for a strike in the right-inside channel, but pique slides in with a low block before the home side scramble the loose ball away from danger.   ',
 'sane completes a short offload to muller in space on the edge of the box and the forward do

In [110]:
t5_comm = get_all_comm_t5(spacy_comm)
t5_comm

["sergi roberto tries to break down the right flank to latch on to a long ball forward from alba. the goal-kick deflects into the hands of ter stegen, but pressure from davies forces lewandowski late on the edge of the box. goretzka moves forward from the middle of the park and offloads towards sane. but his pass to muller in the left-inside channel is poor and cut out by araujo. barcelona beat bayern munich alba 0-1 at the emirates stadium. sane's deflected strike beats ter stegen in the second half of the first half and muller resists the opening for a strike - and the forward does the rest as he tries to thump lewandowski into the final third. bayern play their way through barca in the middle of the park before musiala latches on to a blocked strike. lewandowski is first to react and stretches out his left boot to knock into an empty net. coutinho cuts inside from the right flank and blasts a strike towards the top-right corner that sails just over the bar. ter stegen gets down to s

In [122]:
spacy_comm_joined[28]

"adeyemi won three first-half penalties in their 1-1 draw with sevilla on matchday one - since 2003-04, only one player has ever won more penalties in a single season in the competition (arjen robben in 2013-14, four).    onguene stretches to hack clear david's cross, just before yilmaz can arrive to knock it in from close range.    okafor attempts to break away down the right flank, but gudmundsson stays with him and eventually wins back possession near the byline.     botman is shown a yellow card for the challenge that conceded the penalty. yilmaz fights hard to knock down a long ball forward in search of weah, but he's quickly halted by an offside flag.    botman rises highest to meet gudmundsson's corner, but he sends his header well wide.    ulmer heads xeka's corner clear at the back post.   adeyemi has now won four penalties across the opening two games in the group. he cuts it back to adeyemi who has plenty of the goal to aim at, but he can only place his shot against fonte an

In [120]:
t5_comm[28]

"adeyemi won three first-half penalties in their 1-1 draw with sevilla on matchday one. only one player has ever won more penalty in one season in the competition (arjen robben in 2013-14, four) onguene stretches to hack clear david's cross before yilmaz can knock it in from close range, but gudmundsson stays with him and eventually wins back possession near the botman is shown a yellow card for the challenge that conceded the penalty. yilmaz is halted by an offside flag, but he sends his header well wide. ulmer heads xeka's corner clear at the back post. adeyemi has now won four penalties across the opening two games in the group - four against fonte, lille and valencia. salzburg have a clear two-goal lead in the premier league. lille start the half with intent before weah slamming in an early cross - but the referee gives salziburg 'a free-kick' lille are starting to put salzburg under real pressure as they look to complete their comeback. yilmaz steps up and fizzes a powerful strike

In [111]:
t5_summ_df = pd.DataFrame(columns=['summary'])
for comm in t5_comm:
    t5_summ_df = t5_summ_df.append({"summary": comm}, ignore_index=True)
t5_summ_df

Unnamed: 0,summary
0,sergi roberto tries to break down the right fl...
1,dynamo kiev have met benfica four times previo...
2,fernandes' elia gets the better of fred and re...
3,zappacosta puts atalanta on the front foot wit...
4,carlos bodychecks adeyemi on the edge of the s...
...,...
119,de bruyne whips a dangerous cross into the mid...
120,villarreal have their first press down the lef...
121,capoue evades robertson at the far post to cus...
122,real madrid beat manchester city 3-1 at the et...


In [112]:
t5_summ_df.to_csv('t5_summary.csv', index=False)

In [173]:
def t5_rouge_score(t5_summ, ft_summ):
    rouge = Rouge()
    all_rouge_t5 = []
    for i in range(len(t5_summ)):
        all_rouge_t5.append(rouge.get_scores(ft_summ[i], t5_summ[i]))
    
    rogue1_t5 = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue2_t5 = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])
    rogue_lcs_t5 = pd.DataFrame(columns=['recall', 'precision', 'f1', 'module'])

    for score in all_rouge_t5:
        rouge1 = score[0]['rouge-1']
        rouge2 = score[0]['rouge-2']
        rouge_l = score[0]['rouge-l']

        rogue1_t5 = rogue1_t5.append({"recall": rouge1['r'], "precision": rouge1['p'], "f1": rouge1['f'], 
                                    "module": 't5'}, ignore_index=True)
        rogue2_t5 = rogue2_t5.append({"recall": rouge2['r'], "precision": rouge2['p'], "f1": rouge2['f'], 
                                    "module": 't5'}, ignore_index=True)
        rogue_lcs_t5 = rogue_lcs_t5.append({"recall": rouge_l['r'], "precision": rouge_l['p'], "f1": rouge_l['f'], 
                                    "module": 't5'}, ignore_index=True)

    # rogue1_t5 = rogue1_t5.sort_values(by='precision', ascending = False)
    # rogue2_t5 = rogue2_t5.sort_values(by='precision', ascending = False)
    # rogue_lcs_t5 = rogue_lcs_t5.sort_values(by='precision', ascending = False)

    return rogue1_t5, rogue2_t5, rogue_lcs_t5


In [174]:
rogue1_t5, rogue2_t5, rogue_lcs_t5 = t5_rouge_score(t5_comm, all_ft_comm)
rogue1_t5

Unnamed: 0,recall,precision,f1,module
0,0.234848,0.31,0.267241,t5
1,0.21519,0.34,0.263566,t5
2,0.22314,0.241071,0.23176,t5
3,0.256944,0.237179,0.246667,t5
4,0.201342,0.243902,0.220588,t5
...,...,...,...,...
119,0.362069,0.328125,0.344262,t5
120,0.197279,0.27619,0.230159,t5
121,0.181818,0.234234,0.204724,t5
122,0.276596,0.267123,0.271777,t5


In [175]:
rogue2_t5

Unnamed: 0,recall,precision,f1,module
0,0.039024,0.065041,0.04878,t5
1,0.032258,0.057143,0.041237,t5
2,0.02994,0.03268,0.03125,t5
3,0.044335,0.040359,0.042254,t5
4,0.041284,0.048387,0.044554,t5
...,...,...,...,...
119,0.105882,0.100559,0.103152,t5
120,0.031674,0.051095,0.039106,t5
121,0.042654,0.06338,0.050991,t5
122,0.059091,0.059908,0.059497,t5


In [176]:
rogue2_t5.to_csv('rogue2_t5.csv', index=False)

In [177]:
rogue_lcs_t5

Unnamed: 0,recall,precision,f1,module
0,0.212121,0.28,0.241379,t5
1,0.183544,0.29,0.224806,t5
2,0.198347,0.214286,0.206009,t5
3,0.201389,0.185897,0.193333,t5
4,0.181208,0.219512,0.198529,t5
...,...,...,...,...
119,0.318966,0.289062,0.303279,t5
120,0.156463,0.219048,0.18254,t5
121,0.167832,0.216216,0.188976,t5
122,0.262411,0.253425,0.25784,t5


In [124]:
all_ft_comm[24]

" that's all from us today folks, we hope you enjoyed the game!   this result leaves dynamo third in group e on just one point, but they face bottom side barcelona next. meanwhile bayern travel to benfica in three weeks time.   bayern wrap up a comfortable 5-0 victory over dynamo kiev at the allianz arena, to make it two wins from two in group e and they sit top on six points. they led 2-0 at the break thanks to a brace from lewandowski and they continued to exert their dominance in the second period, with gnabry scoring a third, before sane got lucky with a cross from the left wing that flew straight into the back of the net. it looked like it would finish 4-0, but choupo-moting scored in the final minutes to put the icing on the cake. "

In [125]:
t5_comm[24]

"sydorchuk gave away the penalty after a handball in the box. lewandowski coolly stepped up to send bushchan the wrong way and fire the ball into the bottom left corner of the net to deny the striker's gnabry goal. bayern win a free-kick after lewandowski is fouled and sane steps up to take it. kimmich then tries to chip it over the top of the dynamo defence to find the striker, who can't control the ball and powers volley towards the bottom left corner of net. bayern 2-0 dynamo kiev the other game being played tonight in group e is between benfica and barcelona. the portuguese side are currently leading 1-0 in that one thanks to an early goal from goretzka and sane only has bushchan to beat from a tight angle. lewandowski plays a lovely ball in behind the bayern defence for sane to sprint onto the end of. dynamo counter quickly after winning the ball back in midfield and the pass is slightly too heavy and bushchan is quick off his line to sack it behind for the corner. neuer does bril

In [126]:
spacy_comm_joined[24]

"sydorchuk gave away the penalty after a handball in the box and lewandowski coolly stepped up to send bushchan the wrong way and fire the ball into the bottom left corner of the net.    bushchan is called into action at the other end of the pitch to deny lewandowski, after the striker got above his marker and powered a header towards goal following a cross from the right by gnabry. the hosts are handed a huge chance to open the scoring early on after sydorchuk is penalised in the box, after he deliberately blocks a cross with his arm and the referee had no choice but to award a spot-kick.    bayern win a free-kick in a decent position after lewandowski is fouled and sane steps up to take it, but plays it short to kimmich, who then tries to chip it over the top of the dynamo defence to find sane. upamecano plays a good pass into the feet of lewandowski, but the striker can't control the ball and it falls to sule on the edge of the box, who powers a volley towards the bottom left corner

In [None]:
all_ft_comm

In [178]:
rogue1_spacy.to_csv('rogue1_spacy.csv', index=False)
rogue1_spacy

Unnamed: 0,recall,precision,f1,module
0,0.171717,0.34,0.228188,spacy
1,0.182131,0.53,0.2711,spacy
2,0.159574,0.267857,0.2,spacy
3,0.203636,0.358974,0.259861,spacy
4,0.164751,0.349593,0.223958,spacy
...,...,...,...,...
119,0.242268,0.367188,0.291925,spacy
120,0.149123,0.32381,0.204204,spacy
121,0.151261,0.324324,0.206304,spacy
122,0.211454,0.328767,0.257373,spacy


In [179]:
rogue1_t5.to_csv('rogue1_t5.csv', index=False)
rogue1_t5

Unnamed: 0,recall,precision,f1,module
0,0.234848,0.31,0.267241,t5
1,0.21519,0.34,0.263566,t5
2,0.22314,0.241071,0.23176,t5
3,0.256944,0.237179,0.246667,t5
4,0.201342,0.243902,0.220588,t5
...,...,...,...,...
119,0.362069,0.328125,0.344262,t5
120,0.197279,0.27619,0.230159,t5
121,0.181818,0.234234,0.204724,t5
122,0.276596,0.267123,0.271777,t5


In [180]:
rogue_lcs_spacy.to_csv('rogue_lcs_spacy.csv', index=False)
rogue_lcs_spacy

Unnamed: 0,recall,precision,f1,module
0,0.161616,0.32,0.214765,spacy
1,0.151203,0.44,0.225064,spacy
2,0.138298,0.232143,0.173333,spacy
3,0.181818,0.320513,0.232019,spacy
4,0.149425,0.317073,0.203125,spacy
...,...,...,...,...
119,0.221649,0.335938,0.267081,spacy
120,0.122807,0.266667,0.168168,spacy
121,0.138655,0.297297,0.189112,spacy
122,0.189427,0.294521,0.230563,spacy


In [181]:
rogue_lcs_t5.to_csv('rogue_lcs_t5.csv', index=False)
rogue_lcs_t5

Unnamed: 0,recall,precision,f1,module
0,0.212121,0.28,0.241379,t5
1,0.183544,0.29,0.224806,t5
2,0.198347,0.214286,0.206009,t5
3,0.201389,0.185897,0.193333,t5
4,0.181208,0.219512,0.198529,t5
...,...,...,...,...
119,0.318966,0.289062,0.303279,t5
120,0.156463,0.219048,0.18254,t5
121,0.167832,0.216216,0.188976,t5
122,0.262411,0.253425,0.25784,t5


In [155]:
rogue_lcs_t5.index

24

In [146]:
import altair as alt

ModuleNotFoundError: No module named 'altair'

In [157]:
rouge1_prec_df = pd.DataFrame(columns=['spacy_precision', 't5_precision', 'match_id'])
spacy_prec1 = rogue1_spacy['precision']
t5_prec1 = rogue1_t5['precision']

for i in range(len(spacy_prec1)):
    rouge1_prec_df = rouge1_prec_df.append({"spacy_precision": spacy_prec1[i], "t5_precision": t5_prec1[i], 
                                            "match_id": rogue1_spacy.index[i]}, ignore_index=True)
rouge1_prec_df

Unnamed: 0,spacy_precision,t5_precision,match_id
0,0.34,0.31,1.0
1,0.53,0.34,24.0
2,0.267857,0.241071,63.0
3,0.358974,0.237179,19.0
4,0.349593,0.243902,84.0
...,...,...,...
119,0.367188,0.328125,123.0
120,0.32381,0.27619,9.0
121,0.324324,0.234234,34.0
122,0.328767,0.267123,60.0


In [187]:
rogue_lcs_spacy['precision'][0]

0.32

In [196]:
rouge_extrac_df = pd.DataFrame(columns=['spacy_uni_precision', 'nltk_uni_precision', 'spacy_lcs_precision', 'nltk_lcs_precision', 'match_id'])
spacy_uni_prec = rogue1_spacy['precision']
nltk_uni_prec = rogue1_nltk['precision']
spacy_lcs_prec = rogue_lcs_spacy['precision']
nltk_lcs_prec = rogue_lcs_nltk['precision']

for i in range(len(spacy_uni_prec)):
    rouge_extrac_df = rouge_extrac_df.append({"spacy_uni_precision": spacy_uni_prec[i], "nltk_uni_precision": nltk_uni_prec[i], 
                                            "spacy_lcs_precision": spacy_lcs_prec[i], 'nltk_lcs_precision': nltk_lcs_prec[i],
                                            "match_id": rogue1_spacy.index[i]}, ignore_index=True)
rouge_extrac_df

Unnamed: 0,spacy_uni_precision,nltk_uni_precision,spacy_lcs_precision,nltk_lcs_precision,match_id
0,0.34,0.34,0.32,0.33,0.0
1,0.53,0.39,0.44,0.32,1.0
2,0.267857,0.25,0.232143,0.25,2.0
3,0.358974,0.301282,0.320513,0.269231,3.0
4,0.349593,0.308943,0.317073,0.276423,4.0
...,...,...,...,...,...
119,0.367188,0.335938,0.335938,0.289062,119.0
120,0.32381,0.314286,0.266667,0.219048,120.0
121,0.324324,0.288288,0.297297,0.27027,121.0
122,0.328767,0.273973,0.294521,0.246575,122.0


In [198]:
rouge_extrac_df.to_csv('rouge_extrac_df.csv', index=False)

In [197]:
rouge_abstrac_df = pd.DataFrame(columns=['uni_precision', 'lcs_precision', 'match_id'])
uni_prec = rogue1_t5['precision']
lcs_prec = rogue_lcs_t5['precision']

for i in range(len(uni_prec)):
    rouge_abstrac_df = rouge_abstrac_df.append({"uni_precision": uni_prec[i], "lcs_precision": lcs_prec[i], 
                                            "match_id": rogue1_t5.index[i]}, ignore_index=True)
rouge_abstrac_df

Unnamed: 0,uni_precision,lcs_precision,match_id
0,0.31,0.28,0.0
1,0.34,0.29,1.0
2,0.241071,0.214286,2.0
3,0.237179,0.185897,3.0
4,0.243902,0.219512,4.0
...,...,...,...
119,0.328125,0.289062,119.0
120,0.27619,0.219048,120.0
121,0.234234,0.216216,121.0
122,0.267123,0.253425,122.0


In [199]:
rouge_abstrac_df.to_csv('rouge_abstrac_df.csv', index=False)

In [200]:
rouge_extrac_df1 = pd.melt(rouge_extrac_df,id_vars=['match_id'],value_name="score_type",)

In [201]:
rouge_extrac_df1

Unnamed: 0,match_id,variable,score_type
0,0.0,spacy_uni_precision,0.34
1,1.0,spacy_uni_precision,0.53
2,2.0,spacy_uni_precision,0.267857
3,3.0,spacy_uni_precision,0.358974
4,4.0,spacy_uni_precision,0.349593
...,...,...,...
491,119.0,nltk_lcs_precision,0.289062
492,120.0,nltk_lcs_precision,0.219048
493,121.0,nltk_lcs_precision,0.27027
494,122.0,nltk_lcs_precision,0.246575


In [203]:
import altair as alt

In [216]:
rouge_extrac_df2 = rouge_extrac_df1[((rouge_extrac_df1["variable"] == "spacy_uni_precision")) | ((rouge_extrac_df1["variable"] == "nltk_uni_precision"))]

In [232]:
click = alt.selection_multi(encodings=['color'])
timeunit='date'

fig4 = (
        alt.Chart(rouge_extrac_df2)
        .mark_point(width=1)
        .encode(
            x = alt.X("match_id", title="Match ID"),
            y=alt.Y("score_type", title="Precision",scale=alt.Scale(domain=[0, 0.6])),
            color = alt.Color("variable", scale = alt.Scale(scheme = 'dark2'),title = "Unigram models"),
            tooltip=[alt.Tooltip('variable')]
        )
    ).properties(width=alt.Step(30),title={
      "text": ["Extractive Summarization Performance"], 
      "subtitle": ["Plot of precision values of Unigram"]
    },).interactive()





fig4 = alt.layer(
    fig4
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)
#fig4.save("plots/fig4.html")
fig4

In [220]:
rouge_extrac_df3 = rouge_extrac_df1[((rouge_extrac_df1["variable"] == "spacy_lcs_precision")) | ((rouge_extrac_df1["variable"] == "nltk_lcs_precision"))]

In [234]:
click = alt.selection_multi(encodings=['color'])
timeunit='date'

fig4 = (
        alt.Chart(rouge_extrac_df3)
        .mark_point(width=1)
        .encode(
            x = alt.X("match_id", title="Match ID"),
            y=alt.Y("score_type", title="Precision",scale=alt.Scale(domain=[0, 0.6])),
            color = alt.Color("variable", scale = alt.Scale(scheme = 'dark2'),title = "LCS models"),
            tooltip=[alt.Tooltip('variable')]
        )
    ).properties(width=alt.Step(30),title={
      "text": ["Extractive Summarization Performance"], 
      "subtitle": ["Plot of precision values of LCS"]
    },).interactive()





fig4 = alt.layer(
    fig4
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)
#fig4.save("plots/fig4.html")
fig4

In [222]:
rouge_abstrac_df1 = pd.melt(rouge_abstrac_df,id_vars=['match_id'],value_name="score_type")

In [223]:
rouge_abstrac_df1

Unnamed: 0,match_id,variable,score_type
0,0.0,uni_precision,0.31
1,1.0,uni_precision,0.34
2,2.0,uni_precision,0.241071
3,3.0,uni_precision,0.237179
4,4.0,uni_precision,0.243902
...,...,...,...
243,119.0,lcs_precision,0.289062
244,120.0,lcs_precision,0.219048
245,121.0,lcs_precision,0.216216
246,122.0,lcs_precision,0.253425


In [235]:
click = alt.selection_multi(encodings=['color'])
timeunit='date'

fig4 = (
        alt.Chart(rouge_abstrac_df1)
        .mark_point(width=1)
        .encode(
            x = alt.X("match_id", title="Match ID"),
            y=alt.Y("score_type", title="Precision",scale=alt.Scale(domain=[0, 0.6])),
            color = alt.Color("variable", scale = alt.Scale(scheme = 'dark2'),title = "T5 models"),
            tooltip=[alt.Tooltip('variable')]
        )
    ).properties(width=alt.Step(30),title={
      "text": ["Abstractive Summarization Performance"], 
      "subtitle": ["Plot of precision values of different metrics"]
    },).interactive()





fig4 = alt.layer(
    fig4
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)
#fig4.save("plots/fig4.html")
fig4