In [503]:
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.translate import bleu
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

import openai

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [438]:
temp_df = pd.read_csv('../../data/temp_data.csv')
temp_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc
0,,We hope you have enjoyed our live coverage of...,,,,full time summary
1,,While Madrid's Champions League defence conti...,,,,full time summary
2,,"It's all over at Stamford Bridge, and it's an...",,,,full time summary
3,90 + 3,FULL-TIME: CHELSEA 0-2 REAL MADRID,,,,timer
4,90 + 3,"Valverde sweeps a pass out to Ceballos, who c...",,,,timer


# Cleaning the data

In [439]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [440]:
def remove_stopwords(text):
    """ A method to remove all the stopwords """
    stopwords = nltk.corpus.stopwords.words('english')
    text = [word for word in text if word not in stopwords]
    return text

In [441]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [442]:
def stemming(text):
    """ A method to perform stemming on text data"""
    porter_stem = nltk.PorterStemmer()
    text = [porter_stem.stem(word) for word in text]
    return text

In [443]:
def lemmatizer(text):
    word_net_lemma = nltk.WordNetLemmatizer()
    text = [word_net_lemma.lemmatize(word) for word in text]
    return text

In [444]:
# Making a common cleaning function for every part below for code reproducability
def clean_text(list_words):
    # Making a regex pattern to match in the characters we would like to replace from the words
    character_replace = ",()0123456789.?!@#$%&;*:_,/" 
    pattern = "[" + character_replace + "]"
    # ------------------------------------------------------------------------------------

    # ------------------------------------------------------------------------------------
    new_list_words = []
    # Looping through every word to remove the characters and appending back to a new list
    # replace is being used for the characters that could not be catched through regex
    for s in list_words:
        new_word = s.lower()
        new_word = re.sub(pattern,"",new_word)
        new_word = new_word.replace('[', '')
        new_word = new_word.replace(']', '')
        new_word = new_word.replace('-', '')
        new_word = new_word.replace('—', '')
        new_word = new_word.replace('“', '')
        new_word = new_word.replace("’", '')
        new_word = new_word.replace("”", '')
        new_word = new_word.replace("‘", '')
        new_word = new_word.replace('"', '')
        new_word = new_word.replace("'", '')
        new_word = new_word.replace(" ", '')
        new_list_words.append(new_word)

    # Using filter to remove empty strings
    new_list_words = list(filter(None, new_list_words))
    return new_list_words

In [445]:
def clean(df, text_col):
    """ A method to do basic data cleaning """
    
    clean_data = df.copy()
    
    clean_data['clean_text']=clean_data[text_col].apply(lambda x: remove_punct(x))
    
    clean_data['text_tokenized'] = clean_data['clean_text'].apply(lambda x: tokenization(x.lower()))
    
    stopwords = nltk.corpus.stopwords.words('english')
    
    clean_data['text_without_stop'] = clean_data['text_tokenized'].apply(lambda x: remove_stopwords(x))    
    
    clean_data['text_stemmed'] = clean_data['text_without_stop'].apply(lambda x: stemming(x))
        
    clean_data['text_lemmatized'] = clean_data['text_without_stop'].apply(lambda x: lemmatizer(x))

    clean_data['text_final'] = clean_data['text_lemmatized'].apply(lambda x: clean_text(x))
    
    return clean_data

In [446]:
def clean(corpus):
    """ A method to do basic data cleaning """
    clean_data = pd.DataFrame(columns=['ini_text', 'clean_text', 'text_tokenized', 'text_without_stop', 
                                       'text_stemmed', 'text_lemmatized', 'text_final'])

    clean_data['ini_text'] = [corpus]

    clean_data['clean_text']=clean_data['ini_text'].apply(lambda x: remove_punct(x))
    
    clean_data['text_tokenized'] = clean_data['clean_text'].apply(lambda x: tokenization(x.lower()))
    
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    clean_data['text_without_stop'] = clean_data['text_tokenized'].apply(lambda x: remove_stopwords(x))    
    
    clean_data['text_stemmed'] = clean_data['text_without_stop'].apply(lambda x: stemming(x))
        
    clean_data['text_lemmatized'] = clean_data['text_without_stop'].apply(lambda x: lemmatizer(x))

    clean_data['text_final'] = clean_data['text_lemmatized'].apply(lambda x: clean_text(x))
    
    return clean_data

# Concat the live ticker for 'timer'

In [447]:
comment = ''
for comm in temp_df[temp_df['comment_desc'] == 'timer']['comment']:
    comment += comm

# Using SpaCy to summarize the text

## Approach 1

In [448]:
nlp = spacy.load("en_core_web_sm")

def get_spacy_summary(text):
    doc = nlp(text)

    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    for token in doc:
        if (token.text in stopwords or token.text in punctuation):
            continue
        if token.pos_ in pos_tag:
            keyword.append(token.text)

    freq_word = Counter(keyword)

    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = (freq_word[word]/max_freq)

    sent_strenght = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strenght.keys():
                    sent_strenght[sent] += freq_word[word.text]
                else:
                    sent_strenght[sent] = freq_word[word.text]

    summarized_sentences = nlargest(3, sent_strenght, key=sent_strenght.get)
    final_sentences = [w.text for w in summarized_sentences]
    return " ".join(final_sentences)

## Approach 2

In [449]:
def summarize_text_spacy(text, num_sentences=3):
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    sentence_scores = {}
    for i, sent in enumerate(sentences):
        sentence_scores[i] = sent.similarity(doc)
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = [sentences[i].text.strip() for i in top_sentences]
    return " ".join(summary)

# Using nltk for text summarization

In [450]:
def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords

    # stop_words = set(stopwords.words("english"))
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

# Using GPT-3

In [508]:
def summarize_text_gpt(corpus):
    openai.organization = 'org-Ik9BKGuegvZTXOQlzGjVSOxz'
    openai.api_key = "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"
    engine_list = openai.Engine.list() # calling the engines available from the openai api 
    

    response = openai.Completion.create(engine="text-davinci-003",prompt=corpus,temperature=0.3,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
    return response["choices"][0]["text"]

# Using Gensim

In [451]:
# In google colab

# Get full time summary using comment description from dataset

In [452]:
ft_comm = temp_df[temp_df['comment_desc'] == 'full time summary']['comment']
full_time_summary = " ".join(ft_comm)
full_time_summary

" We hope you have enjoyed our live coverage of yet another European win for Real Madrid. It's goodbye for now, and we'll see you next time!   While Madrid's Champions League defence continues apace, Chelsea have lost four successive matches since Frank Lampard returned to the club as interim boss. The Blues will now endure just their second trophyless season in the last seven campaigns, and with 17 points separating them from the Premier League's top four, it could be some time before Stamford Bridge hosts another Champions League fixture!   It's all over at Stamford Bridge, and it's another 2-0 win over Chelsea for Real Madrid – Los Blancos take the quarter-final tie 4-0 on aggregate! Chelsea performed admirably for long periods as Cucurella and Kante missed great chances to cut Madrid's lead, but they could not find a response when Rodrygo hammered Vinicius' cut-back in from close range after 58 minutes. A flowing move involving Valverde led to Rodrygo getting a second with 10 minut

# Summarize the live ticker

## Using SpaCy

In [453]:
# Approach 1
spacy_summary1 = get_spacy_summary(comment)
spacy_summary1

"A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Militao's yellow card means he will miss the first leg of any semi-final tie through suspension, if Madrid make it…  Militao appears to be in some discomfort following that challenge on Gallagher, having kicked the bottom of the Chelsea man's boot as he flew in at a dangerous height. Rodrygo is the target of a clever free-kick into the right-hand channel, but Chalobah and Silva combine to crowd the Brazilian out and win possession for Chelsea.  "

In [454]:
# Approach 2
spacy_summary2 = summarize_text_spacy(comment)
spacy_summary2

"Vinicius gets the assist for Rodrygo's goal, staying cool under pressure at the far post to pick out his opposite winger, who makes no mistake from close range!  GOAAAAAAAL! James works some space to hit a low cross into the six-yard box, but Alaba slides in to divert it away from the danger zone! A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive."

## Using nltk

In [455]:
nltk_summary = summarize_text_nltk(comment)
nltk_summary

' FULL-TIME: CHELSEA 0-2 REAL MADRID  Valverde sweeps a pass out to Ceballos, who cuts inside and shoots straight at Kepa from the edge of the area. A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack.'

## Using GPT-3

In [509]:
gpt_summary = summarize_text_gpt(comment)
gpt_summary

" We're ready for the second leg of this Champions League quarter-final! Chelsea, 2-0 down from the first leg, will be hoping for a repeat of their remarkable comeback against Paris Saint-Germain in the last round. Madrid, meanwhile, are looking to extend their record-breaking run of Champions League knockout-stage wins to 13.  TEAM NEWS:  CHELSEA XI: Kepa, James, Silva, Fofana, Cucurella, Kante, Kovacic, Gallagher, Havertz, Chalobah, Fernandez  REAL MADRID XI: Courtois, Carvajal, Militao, Ramos, Alaba, Valverde, Modric, Kroos, Rodrygo, Benzema, Vinicius  We're just moments away from kick-off at Stamford Bridge!"

# Using BLEU for comparison

In [456]:
# def calc_bleu(ft_summary, my_summary):
#     return bleu([ft_summary.split()], my_summary.split())

# Using Cosine similarity for comparison

In [476]:
# Using CountVectorizer()
def calc_cos_sim_count_vec(ft_summary, my_summary):
    corpus = [ft_summary, my_summary]
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(corpus)

    # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                    columns=count_vectorizer.vocabulary_.keys(), 
                    index=['ft_summary_org','ft_summary_crtd'])
    
    # Compute Cosine Similarity
    return cosine_similarity(df[0:1], df)

In [477]:
# # Using TF-IDF
# def calc_cos_sim_tfidf(ft_summary, my_summary):
#     corpus = [ft_summary, my_summary]
#     vectorizer = TfidfVectorizer()
#     trsfm=vectorizer.fit_transform(corpus)

#     # OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
#     doc_term_matrix = trsfm.todense()
#     trsfm_df = pd.DataFrame(doc_term_matrix,
#                             columns=vectorizer.vocabulary_.keys(),
#                             index=['ft_summary_org','ft_summary_crtd'])

#     return cosine_similarity(trsfm[0:1], trsfm)

In [478]:
calc_cos_sim_count_vec(full_time_summary, spacy_summary1)

array([[1.       , 0.4714833]])

In [479]:
calc_cos_sim_count_vec(full_time_summary, spacy_summary2)

array([[1.        , 0.41096252]])

In [480]:
calc_cos_sim_count_vec(full_time_summary, nltk_summary)

array([[1.        , 0.45017869]])

In [502]:
calc_cos_sim_count_vec(full_time_summary, gpt_summary)

array([[1.        , 0.47755499]])