In [77]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.translate import bleu
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
temp_df = pd.read_csv('../../data/temp_data.csv')
temp_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc
0,,We hope you have enjoyed our live coverage of...,,,,full time summary
1,,While Madrid's Champions League defence conti...,,,,full time summary
2,,"It's all over at Stamford Bridge, and it's an...",,,,full time summary
3,90 + 3,FULL-TIME: CHELSEA 0-2 REAL MADRID,,,,timer
4,90 + 3,"Valverde sweeps a pass out to Ceballos, who c...",,,,timer


# Use sliding window technique with 15 mins buffer for the match live ticker

In [10]:
def window_df(df, start_timer, end_timer):
    comments = []
    for i in range(len(df)):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [11]:
# Convert time column to str
temp_df['time'] = temp_df['time'].astype(str)

# Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
comm_15 = window_df(temp_df, 0, 16)
comm_30 = window_df(temp_df, 16, 31)
comm_45 = window_df(temp_df, 31, 46)
comm_60 = window_df(temp_df, 46, 61)
comm_75 = window_df(temp_df, 61, 76)
comm_90 = window_df(temp_df, 76, 91)

# Append the respective live tickers to a list
window_comments = [comm_15, comm_30, comm_45, comm_60, comm_75, comm_90]

In [22]:
# Will be used later for summaries
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

# Using SpaCy to summarize text

## Approach 1

In [12]:
nlp = spacy.load("en_core_web_sm")

def get_spacy_summary(text):
    doc = nlp(text)

    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    for token in doc:
        if (token.text in stopwords or token.text in punctuation):
            continue
        if token.pos_ in pos_tag:
            keyword.append(token.text)

    freq_word = Counter(keyword)

    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = (freq_word[word]/max_freq)

    sent_strenght = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strenght.keys():
                    sent_strenght[sent] += freq_word[word.text]
                else:
                    sent_strenght[sent] = freq_word[word.text]

    summarized_sentences = nlargest(3, sent_strenght, key=sent_strenght.get)
    final_sentences = [w.text for w in summarized_sentences]
    return " ".join(final_sentences)

## Approach 2

In [18]:
def summarize_text_spacy(text, num_sentences=3):
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    sentence_scores = {}
    for i, sent in enumerate(sentences):
        sentence_scores[i] = sent.similarity(doc)
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = [sentences[i].text.strip() for i in top_sentences]
    return " ".join(summary)

# Using NLTK for text summarization

In [14]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [15]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [17]:
def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

# Using GPT-3

In [96]:
def summarize_text_gpt(corpus, org_key, api_key):
    openai.organization = org_key
    openai.api_key = api_key
    engine_list = openai.Engine.list() # calling the engines available from the openai api 
    

    response = openai.Completion.create(engine="text-davinci-003",prompt=corpus,temperature=0.3,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
    return response["choices"][0]["text"]

In [None]:
# reading keys from file

api = pd.read_csv('../../../OpenAI.txt')

api_key = api["Key"][0]
org_key = api["Key"][1]
gpt_comm = summarize_text_gpt(window_comments, org_key, api_key)
gpt_comm

# Summarize the data

## Using SpaCy (both approaches)

In [24]:
# SpaCy approach 1
spacy_window_summaries1 = []
for comment  in window_comments:
    spacy_window_summaries1.append(get_spacy_summary(comment))

spacy_window_summaries1

["A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Chelsea continue to press as James' cross runs all the way through to Kovacic on the edge of the area, only for Militao to block his shot at close quarters. Chelsea have progressed from five of their last seven Champions League ties when losing the first leg away from home, and they have also qualified from their last two ties when losing the opener by two clear goals (vs Napoli in 2011-12 and Paris Saint-Germain in 2013-14).",
 "Militao's yellow card means he will miss the first leg of any semi-final tie through suspension, if Madrid make it…   Militao appears to be in some discomfort following that challenge on Gallagher, having kicked the bottom of the Chelsea man's boot as he flew in at a dangerous height. Rodrygo finds space on the right-hand side of the area after linking up with Carvajal, and the Brazilian youngster opts to

In [52]:
spacy_time_window1 = []
for i in range(len(spacy_window_summaries1)):
    spacy_time_window1.append(timer_list[i] + ' ' +  spacy_window_summaries1[i])

spacy_window_summ1 = " ".join(spacy_window_summaries1)
spacy_window_summ1

"A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Chelsea continue to press as James' cross runs all the way through to Kovacic on the edge of the area, only for Militao to block his shot at close quarters. Chelsea have progressed from five of their last seven Champions League ties when losing the first leg away from home, and they have also qualified from their last two ties when losing the opener by two clear goals (vs Napoli in 2011-12 and Paris Saint-Germain in 2013-14). Militao's yellow card means he will miss the first leg of any semi-final tie through suspension, if Madrid make it…   Militao appears to be in some discomfort following that challenge on Gallagher, having kicked the bottom of the Chelsea man's boot as he flew in at a dangerous height. Rodrygo finds space on the right-hand side of the area after linking up with Carvajal, and the Brazilian youngster opts to go f

In [78]:
# SpaCy approach 2
spacy_window_summaries2 = []
for comment  in window_comments:
    spacy_window_summaries2.append(summarize_text_spacy(comment))

# spacy_time_window2 = []
# for i in range(len(spacy_window_summaries2)):
#     spacy_time_window2.append(timer_list[i] + ' ' +  spacy_window_summaries2[i])

spacy_window_summ2 = " ".join(spacy_window_summaries2)
spacy_window_summ2

"A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Chelsea continue to press as James' cross runs all the way through to Kovacic on the edge of the area, only for Militao to block his shot at close quarters. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Militao's yellow card means he will miss the first leg of any semi-final tie through suspension, if Madrid make it…   Militao appears to be in some discomfort following that challenge on Gallagher, having kicked the bottom of the Chelsea man's boot as he flew in at a dangerous height. A flowing Madrid move involving Benzema and Rodrygo sees Vinicius released into the left-hand channel, and the Brazilian cuts inside to shoot straight at Kepa. Rodrygo finds space on the right-hand side of the area after linking up with Carvajal, and the Brazilian youngster opts to go

## Using nltk

In [79]:
nltk_window_summaries = []
for comment  in window_comments:
    nltk_window_summaries.append(summarize_text_nltk(comment))

# nltk_time_window = []
# for i in range(len(nltk_window_summaries)):
#     nltk_time_window.append(timer_list[i] + ' ' +  nltk_window_summaries[i])

nltk_window_summ = " ".join(nltk_window_summaries)
nltk_window_summ

"A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Havertz looks to get Chelsea on the front foot straight away, winning a throw-in high on the left. Lampard's side have a spare man inside the Madrid area as Cucurella breaks down the left, but his delivery is overhit and it bypasses everyone. A flowing Madrid move involving Benzema and Rodrygo sees Vinicius released into the left-hand channel, and the Brazilian cuts inside to shoot straight at Kepa. The Madrid man is booked as Chelsea get a free-kick in a dangerous position, with Fernandez set to deliver.  HALF-TIME: CHELSEA 0-0 REAL MADRID   HUGE SAVE FROM COURTOIS! So much of Chelsea's attacking play has come down that flank, but wing-back James appeared to be limping a moment ago, which will be a big concern for Lam

## Using GPT-3

In [87]:
window_comments[0]

" Vinicius tracks back to battle with James, winning a goal-kick for his team. Chelsea are looking to find their right wing-back at every opportunity, and they are causing Madrid problems early on.   Handball! Chalobah steps into midfield and releases Havertz in behind, with the German reaching the byline. He stands up a decent cross towards James, but the referee penalises the Chelsea man as he attempts to control it inside the area.   Chelsea continue to press as James' cross runs all the way through to Kovacic on the edge of the area, only for Militao to block his shot at close quarters. This is a decent spell for the hosts!   WIDE! What a chance for Chelsea! Havertz battles for a cross and the ball drops to Kante 12 yards from goal, completely unmarked. The midfielder swings his left foot at the ball, but he completely scuffs his volley and it bounces wide. That's a real let-off for Madrid!   A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but K

In [99]:
gpt_window_summaries = []
for comment  in window_comments:
    gpt_summ = summarize_text_gpt(comment)
    gpt_window_summaries.append(gpt_summ)

gpt_window_summ = " ".join(gpt_window_summaries)
gpt_window_summ

"  Chelsea get the game underway, and they will be looking to overturn a 2-0 deficit in the second leg of this last-16 tie.   The teams are in the tunnel and ready to come out onto the pitch!   The players are out and ready for action, but there's a delay as the Champions League anthem is not played!   The teams are in the tunnel and ready to come out onto the pitch!   The players are out and ready to come out onto the pitch!   The teams are in the tunnel and ready to come out onto the pitch!   The players are out and ready to come out onto the pitch!   The teams are in the tunnel and ready to come out onto the pitch!   The players are out and ready to come out onto the pitch!   The teams are in the tunnel and ready to come out onto the pitch!   The players are out and ready to come out onto the pitch!   Chelsea are on the front foot early on, with Kovacic's deep cross from the right finding Benzema in the middle. The Frenchman's header is straight at Kepa, though, and the goalkeeper i

# Get full time summary from comment description

In [57]:
ft_comm = temp_df[temp_df['comment_desc'] == 'full time summary']['comment']
full_time_summary = " ".join(ft_comm)
full_time_summary

" We hope you have enjoyed our live coverage of yet another European win for Real Madrid. It's goodbye for now, and we'll see you next time!   While Madrid's Champions League defence continues apace, Chelsea have lost four successive matches since Frank Lampard returned to the club as interim boss. The Blues will now endure just their second trophyless season in the last seven campaigns, and with 17 points separating them from the Premier League's top four, it could be some time before Stamford Bridge hosts another Champions League fixture!   It's all over at Stamford Bridge, and it's another 2-0 win over Chelsea for Real Madrid – Los Blancos take the quarter-final tie 4-0 on aggregate! Chelsea performed admirably for long periods as Cucurella and Kante missed great chances to cut Madrid's lead, but they could not find a response when Rodrygo hammered Vinicius' cut-back in from close range after 58 minutes. A flowing move involving Valverde led to Rodrygo getting a second with 10 minut

# Using BLEU score for checking similarity

In [58]:
# def calc_bleu(ft_summary, my_summary):
#     return bleu([ft_summary.split()], my_summary.split())

# Using cosine-similarity for comparison

In [68]:
# Using CountVectorizer()
def calc_cos_sim_count_vec(ft_summary, my_summary):
    corpus = [ft_summary, my_summary]
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)

    # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                    columns=count_vectorizer.vocabulary_.keys(), 
                    index=['ft_summary_org','ft_summary_crtd'])
    
    # Compute Cosine Similarity
    return cosine_similarity(df[0:1], df)

In [101]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [70]:
calc_cos_sim_count_vec(full_time_summary, spacy_window_summ1)

array([[1.        , 0.56385736]])

In [71]:
calc_cos_sim_count_vec(full_time_summary, spacy_window_summ2)

array([[1.        , 0.51980393]])

In [72]:
calc_cos_sim_count_vec(full_time_summary, nltk_window_summ)

array([[1.        , 0.56570489]])

In [100]:
calc_cos_sim_count_vec(full_time_summary, gpt_window_summ)

array([[1.        , 0.50421883]])