In [17]:
import pandas as pd
import numpy as np

import re
import string

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raghavsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
temp_df = pd.read_csv('../../data/temp_data.csv')
temp_df.head()

Unnamed: 0,time,comment,event,event_player,event_team,comment_desc
0,,We hope you have enjoyed our live coverage of...,,,,full time summary
1,,While Madrid's Champions League defence conti...,,,,full time summary
2,,"It's all over at Stamford Bridge, and it's an...",,,,full time summary
3,90 + 3,FULL-TIME: CHELSEA 0-2 REAL MADRID,,,,timer
4,90 + 3,"Valverde sweeps a pass out to Ceballos, who c...",,,,timer


# Cleaning the data

In [8]:
def remove_punct(text):
    """ A method to remove punctuations from text """
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text) #removes numbers from text
    return text

In [9]:
def remove_stopwords(text):
    """ A method to remove all the stopwords """
    stopwords = nltk.corpus.stopwords.words('english')
    text = [word for word in text if word not in stopwords]
    return text

In [10]:
def tokenization(text):
    """ A method to tokenize text data """
    text = re.split('\W+', text) #splitting each sentence/ tweet into its individual words
    return text

In [11]:
def stemming(text):
    """ A method to perform stemming on text data"""
    porter_stem = nltk.PorterStemmer()
    text = [porter_stem.stem(word) for word in text]
    return text

In [12]:
def lemmatizer(text):
    word_net_lemma = nltk.WordNetLemmatizer()
    text = [word_net_lemma.lemmatize(word) for word in text]
    return text

In [13]:
# Making a common cleaning function for every part below for code reproducability
def clean_text(list_words):
    # Making a regex pattern to match in the characters we would like to replace from the words
    character_replace = ",()0123456789.?!@#$%&;*:_,/" 
    pattern = "[" + character_replace + "]"
    # ------------------------------------------------------------------------------------

    # ------------------------------------------------------------------------------------
    new_list_words = []
    # Looping through every word to remove the characters and appending back to a new list
    # replace is being used for the characters that could not be catched through regex
    for s in list_words:
        new_word = s.lower()
        new_word = re.sub(pattern,"",new_word)
        new_word = new_word.replace('[', '')
        new_word = new_word.replace(']', '')
        new_word = new_word.replace('-', '')
        new_word = new_word.replace('—', '')
        new_word = new_word.replace('“', '')
        new_word = new_word.replace("’", '')
        new_word = new_word.replace("”", '')
        new_word = new_word.replace("‘", '')
        new_word = new_word.replace('"', '')
        new_word = new_word.replace("'", '')
        new_word = new_word.replace(" ", '')
        new_list_words.append(new_word)

    # Using filter to remove empty strings
    new_list_words = list(filter(None, new_list_words))
    return new_list_words

In [14]:
def clean(df, text_col):
    """ A method to do basic data cleaning """
    
    clean_data = df.copy()
    
    clean_data['clean_text']=clean_data[text_col].apply(lambda x: remove_punct(x))
    
    clean_data['text_tokenized'] = clean_data['clean_text'].apply(lambda x: tokenization(x.lower()))
    
    stopwords = nltk.corpus.stopwords.words('english')
    
    clean_data['text_without_stop'] = clean_data['text_tokenized'].apply(lambda x: remove_stopwords(x))    
    
    clean_data['text_stemmed'] = clean_data['text_without_stop'].apply(lambda x: stemming(x))
        
    clean_data['text_lemmatized'] = clean_data['text_without_stop'].apply(lambda x: lemmatizer(x))

    clean_data['text_final'] = clean_data['text_lemmatized'].apply(lambda x: clean_text(x))
    
    return clean_data

In [None]:
def clean(corpus):
    """ A method to do basic data cleaning """
    clean_data = pd.DataFrame(columns=['ini_text', 'clean_text', 'text_tokenized', 'text_without_stop', 
                                       'text_stemmed', 'text_lemmatized', 'text_final'])

    clean_data['ini_text'] = [corpus]

    clean_data['clean_text']=clean_data['ini_text'].apply(lambda x: remove_punct(x))
    
    clean_data['text_tokenized'] = clean_data['clean_text'].apply(lambda x: tokenization(x.lower()))
    
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    clean_data['text_without_stop'] = clean_data['text_tokenized'].apply(lambda x: remove_stopwords(x))    
    
    clean_data['text_stemmed'] = clean_data['text_without_stop'].apply(lambda x: stemming(x))
        
    clean_data['text_lemmatized'] = clean_data['text_without_stop'].apply(lambda x: lemmatizer(x))

    clean_data['text_final'] = clean_data['text_lemmatized'].apply(lambda x: clean_text(x))
    
    return clean_data

# Using SpaCy to summarize the text

In [84]:
def summarize_text(text, num_sentences=3):
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    sentence_scores = {}
    for i, sent in enumerate(sentences):
        sentence_scores[i] = sent.similarity(doc)
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = [sentences[i].text.strip() for i in top_sentences]
    return " ".join(summary)

In [107]:
df1 = pd.DataFrame(columns=['comment'])
df1['comment'] = temp_df[temp_df['comment_desc'] == 'timer']['comment']
df1

Unnamed: 0,comment
3,FULL-TIME: CHELSEA 0-2 REAL MADRID
4,"Valverde sweeps a pass out to Ceballos, who c..."
5,Camavinga and Valverde team up to deny Sterli...
6,We're into three minutes of stoppage time as ...
7,Mudryk picks up a late booking for a frustrat...
...,...
82,Havertz is playing a lone striking role for C...
83,"Kante steals possession from Benzema, but Mad..."
84,Cucurella is on hand to intercept a slack bal...
85,Havertz looks to get Chelsea on the front foo...


In [96]:
comment = ''
for comm in df1['comment']:
    comment += comm

summary = summarize_text(comment)
print(summary)

Vinicius gets the assist for Rodrygo's goal, staying cool under pressure at the far post to pick out his opposite winger, who makes no mistake from close range!  GOAAAAAAAL! James works some space to hit a low cross into the six-yard box, but Alaba slides in to divert it away from the danger zone! A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive.


# Using nltk for text summarization

In [114]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from heapq import nlargest

def summarize_text_nltk(text, num_sentences=3):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize the sentences into words and remove stopwords

    # stop_words = set(stopwords.words("english"))
    
    text_punct_removed = remove_punct(text)
    words = tokenization(text_punct_removed.lower())
    
    # words = word_tokenize(text)

    # remove stopwords
    stop_words =  set(nltk.corpus.stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Apply stemming to the filtered words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatize words
    word_net_lemma = nltk.WordNetLemmatizer()
    word_lemma = [word_net_lemma.lemmatize(word) for word in stemmed_words]
    
    # Calculate word frequency and sentence scores
    word_freq = nltk.FreqDist(stemmed_words)
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                if len(sentence.split()) < 30:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_freq[word]
                    else:
                        sentence_scores[i] += word_freq[word]
    
    # Select the top sentences based on their scores
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in sorted(summary_sentences)]
    return " ".join(summary)

In [115]:
comment = ''
for comm in df1['comment']:
    comment += comm

summary = summarize_text_nltk(comment)
print(summary)

 FULL-TIME: CHELSEA 0-2 REAL MADRID  Valverde sweeps a pass out to Ceballos, who cuts inside and shoots straight at Kepa from the edge of the area. A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack.


# Using sliding window technique

In [150]:
def window_df(df, start_timer, end_timer):
    comments = []
    for i in range(len(df)):
        time = df['time'][i]
        if time != 'nan':
            if '+' in time:
                time = time[:2]
            if int(time) >= start_timer and int(time) < end_timer:
                if df['comment_desc'][i] == 'timer':
                    comments.append(df['comment'][i])
    return " ".join(comments)

In [151]:
# Convert time column to str
temp_df['time'] = temp_df['time'].astype(str)

# Divide the dataframe into 6 separate dfs, each corresponding to 15 minutes of the match.
comm_15 = window_df(temp_df, 0, 16)
comm_30 = window_df(temp_df, 16, 31)
comm_45 = window_df(temp_df, 31, 46)
comm_60 = window_df(temp_df, 46, 61)
comm_75 = window_df(temp_df, 61, 76)
comm_90 = window_df(temp_df, 76, 91)

comments = [comm_15, comm_30, comm_45, comm_60, comm_75, comm_90]

In [160]:
summaries = []
for comment  in comments:
    summaries.append(summarize_text_nltk(comment))

summaries

['A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Havertz looks to get Chelsea on the front foot straight away, winning a throw-in high on the left.',
 "Lampard's side have a spare man inside the Madrid area as Cucurella breaks down the left, but his delivery is overhit and it bypasses everyone. A flowing Madrid move involving Benzema and Rodrygo sees Vinicius released into the left-hand channel, and the Brazilian cuts inside to shoot straight at Kepa. The Madrid man is booked as Chelsea get a free-kick in a dangerous position, with Fernandez set to deliver.",
 " HALF-TIME: CHELSEA 0-0 REAL MADRID   HUGE SAVE FROM COURTOIS! So much of Chelsea's attacking play has come down that flank, but wing-back James appeared to be limping a moment ago, which will be a big concer

In [161]:
timer_list= ['[1-15]', '[16-30]', '[31-45]', '[46-60]', '[61-75]', '[76-90]']

In [162]:
for i in range(len(summaries)):
    summaries[i] = timer_list[i] + ' ' +  summaries[i]

summaries

['[1-15] A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Havertz looks to get Chelsea on the front foot straight away, winning a throw-in high on the left.',
 "[16-30] Lampard's side have a spare man inside the Madrid area as Cucurella breaks down the left, but his delivery is overhit and it bypasses everyone. A flowing Madrid move involving Benzema and Rodrygo sees Vinicius released into the left-hand channel, and the Brazilian cuts inside to shoot straight at Kepa. The Madrid man is booked as Chelsea get a free-kick in a dangerous position, with Fernandez set to deliver.",
 "[31-45]  HALF-TIME: CHELSEA 0-0 REAL MADRID   HUGE SAVE FROM COURTOIS! So much of Chelsea's attacking play has come down that flank, but wing-back James appeared to be limping a moment ago, whi

In [155]:
summaries[0]

'A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Havertz looks to get Chelsea on the front foot straight away, winning a throw-in high on the left.'

In [163]:
summ_comm = " ".join(summaries)
summ_comm

"[1-15] A rare loose ball from Modric gives Chelsea the chance to put Madrid under pressure, but Kante is crowded out near the edge of the area and Madrid survive. Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Havertz looks to get Chelsea on the front foot straight away, winning a throw-in high on the left. [16-30] Lampard's side have a spare man inside the Madrid area as Cucurella breaks down the left, but his delivery is overhit and it bypasses everyone. A flowing Madrid move involving Benzema and Rodrygo sees Vinicius released into the left-hand channel, and the Brazilian cuts inside to shoot straight at Kepa. The Madrid man is booked as Chelsea get a free-kick in a dangerous position, with Fernandez set to deliver. [31-45]  HALF-TIME: CHELSEA 0-0 REAL MADRID   HUGE SAVE FROM COURTOIS! So much of Chelsea's attacking play has come down that flank, but wing-back James appeared to be limping a moment ago, which will b

In [164]:
print(summarize_text_nltk(summ_comm))

Cucurella is on hand to intercept a slack ball from Valverde in midfield, and Chelsea can look to build an attack. Rodrygo is the target of a clever free-kick into the right-hand channel, but Chalobah and Silva combine to crowd the Brazilian out and win possession for Chelsea. [76-90]  FULL-TIME: CHELSEA 0-2 REAL MADRID   Valverde sweeps a pass out to Ceballos, who cuts inside and shoots straight at Kepa from the edge of the area.
