In [1]:
import pandas as pd 
import numpy as np 

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive



## **Original Dataset (Entertainment Category)**

In [3]:
sentiment_df_entertainment = pd.read_csv('/content/gdrive/My Drive/MIProject/LRmodel/sentiment_entertainment.csv')
train_df_entertainment = sentiment_df_entertainment.loc[(sentiment_df_entertainment['articles_sentiment'] == sentiment_df_entertainment['summary_sentiment'])]
test_df_entertainment = sentiment_df_entertainment.loc[(sentiment_df_entertainment['articles_sentiment'] != sentiment_df_entertainment['summary_sentiment'])]

In [4]:
sentiment_df_entertainment

Unnamed: 0.1,Unnamed: 0,articles,summaries,category,articles_sentiment,summary_sentiment
0,1329,Comic Morris returns with sitcom\n\nComedian C...,"Comedian Chris Morris, who created controversi...",entertainment,1,1
1,1330,BBC denies Blackadder TV comeback\n\nThe BBC h...,The BBC has said there are no plans in the pip...,entertainment,0,0
2,1331,New media battle for Bafta awards\n\nThe BBC l...,ITV's Great British Spelling Test takes on the...,entertainment,1,1
3,1332,UK Directors Guild nominees named\n\nMartin Sc...,Mike Leigh's Vera Drake is among the nominees ...,entertainment,1,1
4,1333,Franz Ferdinand's art school lesson\n\nScottis...,The buzz about the band soon spread around the...,entertainment,1,1
...,...,...,...,...,...,...
381,1710,US charity anthem is re-released\n\nWe Are The...,It has been re-issued as part of a two-disc DV...,entertainment,1,1
382,1711,U2 to play at Grammy awards show\n\nIrish rock...,Irish rock band U2 are to play live at the Gra...,entertainment,1,1
383,1712,Ray DVD beats box office takings\n\nOscar-nomi...,Ray has been nominated in six Oscar categories...,entertainment,1,1
384,1713,The Producers scoops stage awards\n\nThe Produ...,- Best lighting design - His Dark Materials de...,entertainment,1,1


In [5]:
train_df_entertainment

Unnamed: 0.1,Unnamed: 0,articles,summaries,category,articles_sentiment,summary_sentiment
0,1329,Comic Morris returns with sitcom\n\nComedian C...,"Comedian Chris Morris, who created controversi...",entertainment,1,1
1,1330,BBC denies Blackadder TV comeback\n\nThe BBC h...,The BBC has said there are no plans in the pip...,entertainment,0,0
2,1331,New media battle for Bafta awards\n\nThe BBC l...,ITV's Great British Spelling Test takes on the...,entertainment,1,1
3,1332,UK Directors Guild nominees named\n\nMartin Sc...,Mike Leigh's Vera Drake is among the nominees ...,entertainment,1,1
4,1333,Franz Ferdinand's art school lesson\n\nScottis...,The buzz about the band soon spread around the...,entertainment,1,1
...,...,...,...,...,...,...
381,1710,US charity anthem is re-released\n\nWe Are The...,It has been re-issued as part of a two-disc DV...,entertainment,1,1
382,1711,U2 to play at Grammy awards show\n\nIrish rock...,Irish rock band U2 are to play live at the Gra...,entertainment,1,1
383,1712,Ray DVD beats box office takings\n\nOscar-nomi...,Ray has been nominated in six Oscar categories...,entertainment,1,1
384,1713,The Producers scoops stage awards\n\nThe Produ...,- Best lighting design - His Dark Materials de...,entertainment,1,1



## **Tarined T5 Model**

In [None]:
!pip install pytorch_lightning
!pip install transformers

In [None]:
import json
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint 
from pytorch_lightning.loggers import TensorBoardLogger 
from sklearn.model_selection import train_test_split
from termcolor import colored 
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration, 
    T5TokenizerFast as T5Tokenizer
)

from tqdm.auto import tqdm

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
class NewsSummaryModel(pl.LightningModule):
    def __init__(self):

      super().__init__()

      self.model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
      output = self.model(
      input_ids,
      attention_mask=attention_mask,
      labels=labels,
      decoder_attention_mask=decoder_attention_mask
      )
      return output.loss, output.logits

    def training_step(self, batch, batch_size): 
      input_ids = batch['text_input_ids']
      attention_mask = batch['text_attention_mask']
      labels = batch['labels']
      labels_attention_mask = batch['labels_attention_mask']

      loss, output = self(
          input_ids = input_ids, 
          attention_mask = attention_mask, 
          decoder_attention_mask = labels_attention_mask, 
          labels = labels
      )

      self.log("test_loss", loss, prog_bar = True, logger = True)
      return loss

    def validation_step(self, batch, batch_size):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']

        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask = labels_attention_mask,
            labels = labels
        )

        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_size): 
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask'] 
        labels = batch['labels'] 
        labels_attention_mask = batch['labels_attention_mask']

        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask=labels_attention_mask, 
            labels = labels
        )
        self.log('test loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)



In [None]:
trained_model = NewsSummaryModel.load_from_checkpoint(
    '/content/gdrive/My Drive/MIProject/T5model/best-checkpoint.ckpt'
)
trained_model.freeze()

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
def summarizeText(text):
  text_encoding = tokenizer(
      text,
      max_length = 512,
      padding = 'max_length',
      truncation = True,
      return_attention_mask = True,
      add_special_tokens = True,
      return_tensors = 'pt'
  )

  generated_ids = trained_model.model.generate(
      input_ids = text_encoding['input_ids'],
      attention_mask = text_encoding['attention_mask'],
      max_length = 150,
      num_beams = 2,
      repetition_penalty = 2.5,
      length_penalty = 1.0,
      early_stopping = True
  )

  preds = [
      tokenizer.decode(gen_id, skip_special_tokens = True, clean_up_tokenization_spaces=True)
      for gen_id in generated_ids
  ]

  return "".join(preds)

In [None]:
!pip install gradio

In [None]:
import gradio as gr
def question_answer(context):
    return summarizeText(context)

gr.Interface(fn=question_answer, inputs="text", outputs="text").launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>




## **LR For Sentiment**

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import pickle
import re

In [None]:
def load_models():
    
    # Load the vectoriser.
    file = open('/content/gdrive/My Drive/MIProject/Models/vectoriser-ngram-(1,2).pickle', 'rb')
    vectoriser = pickle.load(file)
    file.close()
    # Load the LR Model.
    file = open('/content/gdrive/My Drive/MIProject/Models/Sentiment-LR.pickle', 'rb')
    LRmodel = pickle.load(file)
    file.close()
    
    return vectoriser, LRmodel

In [None]:
vectorizer, LRmodel = load_models()

In [None]:
def _preprocess_for_sentiment(textdata):
    processedText = []
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.    
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [None]:
def _predict(vectoriser, model, data):
    # Predict the sentiment
    text = []
    text.append(data)
    textdata = vectoriser.transform(_preprocess_for_sentiment(text))
    sentiment = model.predict(textdata)
    return sentiment[0]

In [None]:
import gradio as gr
def question_answer(context):
    return _predict(vectorizer, LRmodel, context)

gr.Interface(fn=question_answer, inputs="text", outputs="text").launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>




## **Tarined T5 Model on the test data**

In [6]:
summary_df_entertainment = pd.read_csv('/content/gdrive/My Drive/MIProject/T5model/test_entertainment.csv')

In [7]:
summary_sentiment_df_entertainment = pd.read_csv('/content/gdrive/My Drive/MIProject/T5model/test_entertainment_sentiment.csv')

In [8]:
columns_titles = ['articles', 'articles_sentiment', 'summaries', 'summary_sentiment', 'pred_summary_pt', 'pred_summary_sentiment']
results = summary_sentiment_df_entertainment.reindex(columns=columns_titles)

In [9]:
results

Unnamed: 0,articles,articles_sentiment,summaries,summary_sentiment,pred_summary_pt,pred_summary_sentiment
0,iTunes now selling Band Aid song\n\nIpod owner...,1,Ipod owners can now download the Band Aid sing...,0,Ipod owners can now download the Band Aid sing...,1
1,Briton wins short film Oscar\n\nThree of the f...,0,"""The other nominees said they made their films...",1,Arnold's gritty drama Wasp is about a single m...,0
2,Britney attacks 'false tabloids'\n\nPop star B...,0,"Pop star Britney Spears has attacked ""false"" a...",1,"Pop star Britney Spears has attacked ""false"" a...",1
3,Prince crowned 'top music earner'\n\nPrince ea...,1,Although she grossed more than Prince last yea...,0,The singer banked $56.5m (£30.4m) from concert...,1
4,No UK premiere for Rings musical\n\nThe produc...,1,The producers behind the Lord of the Rings mus...,0,The producers behind the Lord of the Rings mus...,1
5,Prince crowned 'top music earner'\n\nPrince ea...,1,Although she grossed more than Prince last yea...,0,The singer banked $56.5m (£30.4m) from concert...,1
6,Robots march to US cinema summit\n\nAnimated m...,1,Animated movie Robots has opened at the top of...,0,"Meanwhile, Will Smith comedy Hitch has become ...",0
7,Label withdraws McFadden's video\n\nThe new vi...,1,The head of Christian Brothers' school St Fint...,0,The head of Christian Brothers' school St Fint...,0
8,Byrds producer Melcher dies at 62\n\nRecord pr...,1,"Record producer Terry Melcher, who was behind ...",0,"Record producer Terry Melcher, who was behind ...",0
9,Row threatens Hendrix museum plan\n\nProposals...,0,"Janie Hendrix, the guitarist's stepsister, sai...",1,Now Mr Goldman is calling for the authority to...,0


In [10]:
results_sentiment = results.loc[(results['articles_sentiment'] == results['pred_summary_sentiment'])]

In [11]:
print(test_df_entertainment.shape)
print(results_sentiment.shape)

(47, 6)
(24, 6)



## **Similarity**

In [12]:
!pip install --upgrade spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [14]:
from numpy import unicode

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import unicode


In [15]:
def find_sim(doc1, doc2):
  doc1 = nlp(unicode(doc1))
  doc2 = nlp(unicode(doc2))
  return doc1.similarity(doc2)

In [16]:
results['similarity'] = 0

In [17]:
results['similarity'] = results.apply(lambda x: find_sim(x['summaries'], x['pred_summary_pt']), axis=1)

In [18]:
results

Unnamed: 0,articles,articles_sentiment,summaries,summary_sentiment,pred_summary_pt,pred_summary_sentiment,similarity
0,iTunes now selling Band Aid song\n\nIpod owner...,1,Ipod owners can now download the Band Aid sing...,0,Ipod owners can now download the Band Aid sing...,1,0.981875
1,Briton wins short film Oscar\n\nThree of the f...,0,"""The other nominees said they made their films...",1,Arnold's gritty drama Wasp is about a single m...,0,0.971488
2,Britney attacks 'false tabloids'\n\nPop star B...,0,"Pop star Britney Spears has attacked ""false"" a...",1,"Pop star Britney Spears has attacked ""false"" a...",1,0.981679
3,Prince crowned 'top music earner'\n\nPrince ea...,1,Although she grossed more than Prince last yea...,0,The singer banked $56.5m (£30.4m) from concert...,1,0.973905
4,No UK premiere for Rings musical\n\nThe produc...,1,The producers behind the Lord of the Rings mus...,0,The producers behind the Lord of the Rings mus...,1,0.976453
5,Prince crowned 'top music earner'\n\nPrince ea...,1,Although she grossed more than Prince last yea...,0,The singer banked $56.5m (£30.4m) from concert...,1,0.973905
6,Robots march to US cinema summit\n\nAnimated m...,1,Animated movie Robots has opened at the top of...,0,"Meanwhile, Will Smith comedy Hitch has become ...",0,0.945105
7,Label withdraws McFadden's video\n\nThe new vi...,1,The head of Christian Brothers' school St Fint...,0,The head of Christian Brothers' school St Fint...,0,0.983255
8,Byrds producer Melcher dies at 62\n\nRecord pr...,1,"Record producer Terry Melcher, who was behind ...",0,"Record producer Terry Melcher, who was behind ...",0,0.958508
9,Row threatens Hendrix museum plan\n\nProposals...,0,"Janie Hendrix, the guitarist's stepsister, sai...",1,Now Mr Goldman is calling for the authority to...,0,0.978913


In [19]:
results.to_csv('/content/gdrive/My Drive/MIProject/T5model/with_similarity.csv')