# Importing the libraries

In [16]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.translate import bleu
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Import the dataset

In [2]:
comm_df = pd.read_csv('../../data/commentory_matchid.csv')
comm_df = comm_df[comm_df['match_id'] != 95]
comm_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc,home_team,home_team_abbr,away_team,away_team_abbr,full_time_score,match,date,link,match_id
0,,thanks for joining our commentary this evenin...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
1,,barcelona are next in action at home in lalig...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
2,,bayern munich have eased past barcelona in th...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
3,90 + 2,full-time: barcelona 0-3 bayern munich,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
4,90,there will be two minutes of added time.,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0


# Use sliding window technique with 15 mins buffer for the match live ticker

In [30]:
def window_df(df, start_timer, end_timer):
    # Convert time column to str
    df['time'] = df['time'].astype(str)
    comments = []
    for i in range(df.shape[0]):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [31]:
def create_window(df):
    all_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        match_df.reset_index(inplace = True, drop = True)

        # Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
        comm_15 = window_df(match_df, 0, 16)
        comm_30 = window_df(match_df, 16, 31)
        comm_45 = window_df(match_df, 31, 46)
        comm_60 = window_df(match_df, 46, 61)
        comm_75 = window_df(match_df, 61, 76)
        comm_90 = window_df(match_df, 76, 91)

        # Append the respective live tickers to a list
        all_comm.append([comm_15, comm_30, comm_45, comm_60, comm_75, comm_90])
    
    return all_comm

In [32]:
commentaries = create_window(comm_df)

In [33]:
# Will be used later for summaries
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

# Abstractive text summarization

## Using GPT-3

In [17]:
def summarize_text_gpt(corpus, org_key, api_key):
    openai.organization = org_key
    openai.api_key = api_key
    engine_list = openai.Engine.list() # calling the engines available from the openai api 

    response = openai.Completion.create(engine="davinci",prompt=corpus,temperature=0.3,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
    return response["choices"][0]["text"]

In [18]:
spacy_txt = 'lewandowski combines with goretzka on the edge of the box, but the midfielder strikes his effort straight at araujo, and the ball deflects into the hands of ter stegen.    sergi roberto tries to break down the right flank to latch on to a long ball forward from alba, but pressure from davies forces the goal-kick.    garcia catches lewandowski late on the edge of the box and gives away a free-kick in a dangerous position.    goretzka wins the ball in the middle of the park and offloads towards sane, but his pass to muller in the left-inside channel is poor and cut out by araujo.    goretzka moves forward from the middle of the park and tries to tease a ball into the path of muller, but pique makes the interception.    the forward opens up space for a strike in the right-inside channel, but pique slides in with a low block before the home side scramble the loose ball away from danger.    sane completes a short offload to muller in space on the edge of the box and the forward does the rest as his deflected strike beats ter stegen.   goallllllllllllllllllll!!!!  half-time: barcelona 0-1 bayern munich   alba breaks down the left flank once more and angles a cross towards frenkie de jong. bayern work the ball into the final third and muller resists the opening for a strike and goes for a pass towards lewandowski. the german side double their advantage as lewandowski is alert from close range to poke the ball home after musiala strikes the post. bayern play their way through barca in the middle of the park before musiala latches on to a blocked strike. he hammers an effort towards goal that clatters off the right post. lewandowski is first to react and stretches out his left boot to knock into an empty net.    after breaking free of garcia, he fires low and hard on his left foot, but the barca keeper makes an excellent reaction stop.   wide! coutinho shows intent for the home side as he cuts inside from the right flank and blasts a strike towards the top-right corner that sails just over the bar. he slips as he strikes at goal from the right-inside channel, but ter stegen gets down to save his effort with ease.    musiala leaves the field after an encouraging outing for the visitors. lewandowski pounces on another rebound to fire home a third bayern goal after gnabry strikes the post. gnabry goes for a strike from distance after being teed up by kimmich, but he angles his effort over the bar.   goallllllllllllllllll!!!!! balde makes another drive into bayern territory and fires a low ball into the middle.'

In [19]:
# reading keys from file

api = pd.read_csv('../../../OpenAI_aa.txt')

api_key = api["Key"][0]
org_key = api["Key"][1]
gpt_summ = summarize_text_gpt(spacy_txt, org_key, api_key)
gpt_summ

RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [4]:
def get_all_comm_gpt(comms, org_key, api_key):
    all_comm_gpt = []
    for commentary in comms:
        gpt_window_comm = []
        for comment in commentary:
            gpt_window_comm.append(summarize_text_gpt(comment, org_key, api_key))
        all_comm_gpt.append(" ".join(gpt_window_comm))
    return all_comm_gpt

In [127]:
# reading keys from file

api = pd.read_csv('../../../OpenAI.txt')

api_key = api["Key"][0]
org_key = api["Key"][1]
gpt_comm = get_all_comm_gpt(commentaries, org_key, api_key)

## Using BART

In [None]:
from transformers import BartForConditionalGeneration, AutoTokenizer
model_ckpt = "sshleifer/distilbart-cnn-6-6"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)

## Using T5

In [10]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.98


In [22]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelWithLMHead

def summarize_text_t5(corpus):
    # T5 model name
    # T5_PATH = 't5-large' 

    # initialize the model architecture and weights
    # t5_model = T5ForConditionalGeneration.from_pretrained(T5_PATH)
    # initialize the model tokenizer
    # t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)

    t5_tokenizer=AutoTokenizer.from_pretrained('T5-large')
    t5_model=AutoModelWithLMHead.from_pretrained('T5-large', return_dict=True)

    # encode the text into tensor of integers using the tokenizer
    inputs = t5_tokenizer.encode("summarize: " + corpus, return_tensors= 'pt', max_length=512, padding='max_length', truncation=True)
    summary_ids = t5_model.generate(inputs, num_beams=int(2), no_repeat_ngram_size=3, length_penalty=2.0,
                                    min_length=50, max_length=100, early_stopping=True)
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [17]:
# import torch
# from transformers import AutoTokenizer, AutoModelWithLMHead

# def summarize_text_t5(corpus):
#     tokenizer=AutoTokenizer.from_pretrained('T5-base')
#     model=AutoModelWithLMHead.from_pretrained('T5-base', return_dict=True)

#     inputs=tokenizer.encode("sumarize: " + corpus, return_tensors='pt', max_length=4000, truncation=True)
#     output = model.generate(inputs, min_length=80, max_length=100)

#     return tokenizer.decode(output[0])

In [23]:
def get_all_comm_t5(comms):
    all_comm_t5 = []
    for commentary in comms:
        t5_window_comm = []
        for comment in commentary:
            t5_window_comm.append(summarize_text_t5(comment))
        all_comm_t5.append(" ".join(t5_window_comm))
    return all_comm_t5

In [24]:
t5_comm = get_all_comm_t5(commentaries)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
# from sklearn.model_selection import train_test_split

# train_dataset, eval_dataset = train_test_split(commentaries, test_size=0.2, random_state=42)


# tokenizer = T5Tokenizer.from_pretrained('t5-small')
# model = T5ForConditionalGeneration.from_pretrained('t5-small')

# train_dataset = commentaries(tokenizer, 'train')
# eval_dataset = commentaries(tokenizer, 'validation')

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     eval_steps=10,
#     save_steps=10,
#     warmup_steps=500,
#     logging_dir='./logs',
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
# )

# trainer.train()


# Get full time summary from comment description

In [37]:
def get_full_match_summ(df):
    all_ft_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        all_ft_comm.append(" ".join(match_df[match_df['comment_desc'] == 'full time summary']['comment']))
    
    return all_ft_comm

In [38]:
all_ft_comm = get_full_match_summ(comm_df)

# ROGUE Score

In [41]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [46]:
from rouge import Rouge
rouge = Rouge()

all_rouge_spacy = []
for i in range(len(spacy_comm)):
  all_rouge_spacy.append(rouge.get_scores(all_ft_comm[i], spacy_comm[i]))
all_rouge_spacy
# sorted(all_rouge_spacy, reverse=True)

[[{'rouge-1': {'r': 0.1717171717171717, 'p': 0.34, 'f': 0.22818791500382868},
   'rouge-2': {'r': 0.03234501347708895,
    'p': 0.0975609756097561,
    'f': 0.04858299221155924},
   'rouge-l': {'r': 0.16161616161616163,
    'p': 0.32,
    'f': 0.21476509621188247}}],
 [{'rouge-1': {'r': 0.18213058419243985, 'p': 0.53, 'f': 0.2710997404386418},
   'rouge-2': {'r': 0.029465930018416207,
    'p': 0.11428571428571428,
    'f': 0.046852119727582235},
   'rouge-l': {'r': 0.15120274914089346,
    'p': 0.44,
    'f': 0.22506393481204343}}],
 [{'rouge-1': {'r': 0.1595744680851064,
    'p': 0.26785714285714285,
    'f': 0.19999999532088902},
   'rouge-2': {'r': 0.01904761904761905,
    'p': 0.0392156862745098,
    'f': 0.025641021240138822},
   'rouge-l': {'r': 0.13829787234042554,
    'p': 0.23214285714285715,
    'f': 0.17333332865422238}}],
 [{'rouge-1': {'r': 0.20363636363636364,
    'p': 0.358974358974359,
    'f': 0.2598607842442709},
   'rouge-2': {'r': 0.042643923240938165,
    'p': 0.08

In [56]:
all_rouge_nltk = []
for i in range(len(nltk_comm)):
  all_rouge_nltk.append(rouge.get_scores(all_ft_comm[i], nltk_comm[i]))
all_rouge_nltk
# sorted(all_rouge_spacy, reverse=True)

[[{'rouge-1': {'r': 0.18478260869565216, 'p': 0.34, 'f': 0.23943661515572312},
   'rouge-2': {'r': 0.029154518950437316,
    'p': 0.08130081300813008,
    'f': 0.0429184510500289},
   'rouge-l': {'r': 0.1793478260869565, 'p': 0.33, 'f': 0.2323943616345964}}],
 [{'rouge-1': {'r': 0.18840579710144928, 'p': 0.39, 'f': 0.25407165684516547},
   'rouge-2': {'r': 0.03206997084548105,
    'p': 0.07857142857142857,
    'f': 0.045548650127524605},
   'rouge-l': {'r': 0.15458937198067632,
    'p': 0.32,
    'f': 0.20846905098197338}}],
 [{'rouge-1': {'r': 0.19047619047619047, 'p': 0.25, 'f': 0.21621621130752386},
   'rouge-2': {'r': 0.015444015444015444,
    'p': 0.026143790849673203,
    'f': 0.01941747105912545},
   'rouge-l': {'r': 0.19047619047619047,
    'p': 0.25,
    'f': 0.21621621130752386}}],
 [{'rouge-1': {'r': 0.22596153846153846,
    'p': 0.30128205128205127,
    'f': 0.2582417533437991},
   'rouge-2': {'r': 0.036585365853658534,
    'p': 0.053811659192825115,
    'f': 0.043557163965

In [50]:
all_rouge_spacy[0][0]['rouge-l']

{'r': 0.16161616161616163, 'p': 0.32, 'f': 0.21476509621188247}

# Using cosine-similarity for comparison

In [111]:
# Using CountVectorizer()
def calc_cos_sim_count_vec(ft_summary, my_summary):
    corpus = [ft_summary, my_summary]
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)

    # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                    columns=count_vectorizer.vocabulary_.keys(), 
                    index=['ft_summary_org','ft_summary_crtd'])
    
    # Compute Cosine Similarity
    return cosine_similarity(df[0:1], df)

In [None]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [128]:
all_cos_sim_gpt = []
for i in range(len(gpt_comm)):
  all_cos_sim_gpt.append(calc_cos_sim_count_vec(all_ft_comm[i], gpt_comm[i])[0][1])

sorted(all_cos_sim_gpt, reverse=True)

[0.7805582765133168,
 0.7793807007557877,
 0.7498346066929102,
 0.7350252599077766,
 0.733295076872664,
 0.7326234187510295,
 0.7295254357866824,
 0.7244374585272145,
 0.7238546752295592,
 0.7185062434637455,
 0.7173526886292382,
 0.7134709309702383,
 0.7081025372306506,
 0.7004081361290111,
 0.694468252978679,
 0.6899261258746314,
 0.6878663589036689,
 0.6849982407218725,
 0.6820688790491405,
 0.68076699459638,
 0.6769297313285413,
 0.6757788326841103,
 0.6734069132937255,
 0.6705653508644213,
 0.6697050296231473,
 0.6668163087039578,
 0.6635484863515022,
 0.6624459950154835,
 0.6578147491726972,
 0.6541013977251863,
 0.6535060074474317,
 0.652829799344488,
 0.6522113450812324,
 0.652051893886822,
 0.6517222005908682,
 0.6507079136417975,
 0.6477143003843482,
 0.6469318921388305,
 0.646493770397001,
 0.6442805558018583,
 0.6434883239578081,
 0.6433552027174023,
 0.642617191876622,
 0.6419438537773124,
 0.6419019997175781,
 0.6379386360495407,
 0.6377878654091924,
 0.6354703911712765,
