# Importing the libraries

In [91]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.translate import bleu
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Import the dataset

In [92]:
comm_df = pd.read_csv('../../data/commentory_matchid.csv')
comm_df = comm_df[comm_df['match_id'] != 95]
comm_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc,home_team,home_team_abbr,away_team,away_team_abbr,full_time_score,match,date,link,match_id
0,,thanks for joining our commentary this evenin...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
1,,barcelona are next in action at home in lalig...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
2,,bayern munich have eased past barcelona in th...,,,,full time summary,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
3,90 + 2,full-time: barcelona 0-3 bayern munich,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0
4,90,there will be two minutes of added time.,,,,timer,barcelona,BAR,bayern munchen,FCB,0 - 3,Barcelona vs Bayern Muenchen,09/14/21,https://www.goal.com/en/match/barcelona-vs-bay...,0


# Use sliding window technique with 15 mins buffer for the match live ticker

In [93]:
def window_df(df, start_timer, end_timer):
    # Convert time column to str
    df['time'] = df['time'].astype(str)
    comments = []
    for i in range(df.shape[0]):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [94]:
def create_window(df):
    all_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        match_df.reset_index(inplace = True, drop = True)

        # Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
        comm_15 = window_df(match_df, 0, 16)
        comm_30 = window_df(match_df, 16, 31)
        comm_45 = window_df(match_df, 31, 46)
        comm_60 = window_df(match_df, 46, 61)
        comm_75 = window_df(match_df, 61, 76)
        comm_90 = window_df(match_df, 76, 91)

        # Append the respective live tickers to a list
        all_comm.append([comm_15, comm_30, comm_45, comm_60, comm_75, comm_90])
    
    return all_comm

In [95]:
commentaries = create_window(comm_df)

In [96]:
# Will be used later for summaries
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

# Using SpaCy to summarize text

In [97]:
nlp = spacy.load("en_core_web_sm")

def get_spacy_summary(text):
    doc = nlp(text)

    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    for token in doc:
        if (token.text in stopwords or token.text in punctuation):
            continue
        if token.pos_ in pos_tag:
            keyword.append(token.text)

    freq_word = Counter(keyword)
    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = (freq_word[word]/max_freq)

    sent_strenght = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strenght.keys():
                    sent_strenght[sent] += freq_word[word.text]
                else:
                    sent_strenght[sent] = freq_word[word.text]

    summarized_sentences = nlargest(3, sent_strenght, key=sent_strenght.get)
    final_sentences = [w.text for w in summarized_sentences]
    return " ".join(final_sentences)

In [98]:
def get_all_comm_spacy(comms):
    all_comm_spacy = []
    for commentary in comms:
        spacy_window_comm = []
        for comment in commentary:
            spacy_window_comm.append(get_spacy_summary(comment))
        all_comm_spacy.append(" ".join(spacy_window_comm))
    return all_comm_spacy

In [99]:
spacy_comm = get_all_comm_spacy(commentaries)

# Using NLTK for text summarization

In [100]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [101]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [102]:
def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

In [103]:
def get_all_comm_nltk(comms):
    all_comm_nltk = []
    for commentary in comms:
        nltk_window_comm = []
        for comment in commentary:
            nltk_window_comm.append(summarize_text_nltk(comment))
        all_comm_nltk.append(" ".join(nltk_window_comm))
    return all_comm_nltk

In [104]:
nltk_comm = get_all_comm_nltk(commentaries)

# Using GPT-3

In [122]:
def summarize_text_gpt(corpus, org_key, api_key):
    openai.organization = org_key
    openai.api_key = api_key
    engine_list = openai.Engine.list() # calling the engines available from the openai api 

    response = openai.Completion.create(engine="davinci",prompt=corpus,temperature=0.3,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
    return response["choices"][0]["text"]

In [123]:
def get_all_comm_gpt(comms, org_key, api_key):
    all_comm_gpt = []
    for commentary in comms:
        gpt_window_comm = []
        for comment in commentary:
            gpt_window_comm.append(summarize_text_gpt(comment, org_key, api_key))
        all_comm_gpt.append(" ".join(gpt_window_comm))
    return all_comm_gpt

In [127]:
# reading keys from file

api = pd.read_csv('../../../OpenAI.txt')

api_key = api["Key"][0]
org_key = api["Key"][1]
gpt_comm = get_all_comm_gpt(commentaries, org_key, api_key)

# Get full time summary from comment description

In [105]:
def get_full_match_summ(df):
    all_ft_comm = []
    match_ids = df['match_id'].unique()
    for id in match_ids:
        # Filter the dataframe w.r.t match_id
        match_df = df[df['match_id'] == id]
        all_ft_comm.append(" ".join(match_df[match_df['comment_desc'] == 'full time summary']['comment']))
    
    return all_ft_comm

In [109]:
all_ft_comm = get_full_match_summ(comm_df)

# Using BLEU score for checking similarity

In [106]:
# def calc_bleu(ft_summary, my_summary):
#     return bleu([ft_summary.split()], my_summary.split())

# Using cosine-similarity for comparison

In [111]:
# Using CountVectorizer()
def calc_cos_sim_count_vec(ft_summary, my_summary):
    corpus = [ft_summary, my_summary]
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)

    # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                    columns=count_vectorizer.vocabulary_.keys(), 
                    index=['ft_summary_org','ft_summary_crtd'])
    
    # Compute Cosine Similarity
    return cosine_similarity(df[0:1], df)

In [None]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [113]:
all_cos_sim_spacy = []
for i in range(len(spacy_comm)):
  all_cos_sim_spacy.append(calc_cos_sim_count_vec(all_ft_comm[i], spacy_comm[i])[0][1])

sorted(all_cos_sim_spacy, reverse=True)

[0.8232871328363739,
 0.8174248588298362,
 0.8145149399112754,
 0.7816383863403364,
 0.7707412659871002,
 0.7694462678570619,
 0.7659244889204044,
 0.7649044889725534,
 0.7634486278999365,
 0.7582895374148945,
 0.7518359657137371,
 0.7472140658882348,
 0.7429968507127381,
 0.7387260114298334,
 0.7347287011121258,
 0.7334503928271484,
 0.7310951615491647,
 0.7299371348265824,
 0.7296285367600028,
 0.7280359685613873,
 0.7261173975565213,
 0.7229427592932066,
 0.7208075167278749,
 0.7200696963190241,
 0.7195155789335859,
 0.7192314578474367,
 0.7184730379535103,
 0.7176907877381489,
 0.7174915295770368,
 0.7168908512875278,
 0.7164631047340766,
 0.7130990002287594,
 0.708139866273981,
 0.7077639126053901,
 0.7061962925226118,
 0.7060672642464483,
 0.7041160601434504,
 0.7036982541979113,
 0.7008160444039738,
 0.6982421515857751,
 0.6980133197158735,
 0.6949830538115703,
 0.6935688420806261,
 0.6921769122637016,
 0.6906989123143691,
 0.6892093880551476,
 0.6889921695571921,
 0.68834054212

In [114]:
all_cos_sim_nltk = []
for i in range(len(nltk_comm)):
  all_cos_sim_nltk.append(calc_cos_sim_count_vec(all_ft_comm[i], nltk_comm[i])[0][1])

sorted(all_cos_sim_nltk, reverse=True)

[0.7953239957993086,
 0.784270383288959,
 0.7728546233270182,
 0.7692639306166029,
 0.7603813000949753,
 0.7593104742149936,
 0.7581093603642679,
 0.7537184963392365,
 0.749825777576924,
 0.7475982045617435,
 0.7405160279629593,
 0.7391730797696568,
 0.7327699869772225,
 0.7243865317669208,
 0.7227952225480624,
 0.7211585276842527,
 0.7210127545462429,
 0.7193756582020947,
 0.7188542025873198,
 0.7182557335991091,
 0.7175172291598655,
 0.705042436618654,
 0.704912898855007,
 0.7020957136137166,
 0.7000223391806029,
 0.699979695800079,
 0.6937673595094522,
 0.6910079483143473,
 0.6902610403846762,
 0.6843671563031595,
 0.6830959423188732,
 0.6827604968028924,
 0.6824310730523295,
 0.6812806487187413,
 0.6792623250621805,
 0.6737007375350063,
 0.6720868312538996,
 0.6710502891129948,
 0.6705873056132569,
 0.6701009371571429,
 0.6671667649705758,
 0.6663370783926228,
 0.6653998750443286,
 0.6650983085792687,
 0.6650702246545304,
 0.6640285896984353,
 0.6625617768310097,
 0.662243488322648

In [128]:
all_cos_sim_gpt = []
for i in range(len(gpt_comm)):
  all_cos_sim_gpt.append(calc_cos_sim_count_vec(all_ft_comm[i], gpt_comm[i])[0][1])

sorted(all_cos_sim_gpt, reverse=True)

[0.7805582765133168,
 0.7793807007557877,
 0.7498346066929102,
 0.7350252599077766,
 0.733295076872664,
 0.7326234187510295,
 0.7295254357866824,
 0.7244374585272145,
 0.7238546752295592,
 0.7185062434637455,
 0.7173526886292382,
 0.7134709309702383,
 0.7081025372306506,
 0.7004081361290111,
 0.694468252978679,
 0.6899261258746314,
 0.6878663589036689,
 0.6849982407218725,
 0.6820688790491405,
 0.68076699459638,
 0.6769297313285413,
 0.6757788326841103,
 0.6734069132937255,
 0.6705653508644213,
 0.6697050296231473,
 0.6668163087039578,
 0.6635484863515022,
 0.6624459950154835,
 0.6578147491726972,
 0.6541013977251863,
 0.6535060074474317,
 0.652829799344488,
 0.6522113450812324,
 0.652051893886822,
 0.6517222005908682,
 0.6507079136417975,
 0.6477143003843482,
 0.6469318921388305,
 0.646493770397001,
 0.6442805558018583,
 0.6434883239578081,
 0.6433552027174023,
 0.642617191876622,
 0.6419438537773124,
 0.6419019997175781,
 0.6379386360495407,
 0.6377878654091924,
 0.6354703911712765,
