<a href="https://colab.research.google.com/github/atharvgarg/financial_news_summary/blob/main/Hybrid_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Model Dependencies

In [1]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 13.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 38.4 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 50.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.6 MB/s 
Building wheels for collected p

# Importing Essentials

In [2]:
import pandas as pd
import numpy as np
import pickle
import re
import os
import string
import heapq

from sentence_transformers import SentenceTransformer, util
from nltk.tokenize.treebank import TreebankWordDetokenizer
from transformers import AutoTokenizer, AutoModel,  AutoModelForSeq2SeqLM, pipeline

import torch

import logging
from sklearn.model_selection import train_test_split
from sklearn import preprocessing,metrics

import nltk
from nltk import tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Download the 'punkt' tokenizer data (presumably what sent_tokenize relies on
# later in this notebook -- confirm against the installed NLTK version).
nltk.download('punkt')
# Module-level Porter stemmer instance, used by stemWords().
st = PorterStemmer()
# The stop-word corpus must be downloaded before stopwords.words() is called.
nltk.download('stopwords')
# English stop words, read by remStopWords() for membership tests.
stop=stopwords.words('english')

# Root logger at INFO; the "transformers" logger is raised to WARNING so its
# INFO-level messages are not shown.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def clean(text):
  """Normalise noisy text and return the cleaned string.

  Steps, applied in this order:
    1. every '#' becomes a space;
    2. the emoticons ':)' and ':(' are deleted;
    3. runs of two or more '!', '?' or '.' are each replaced by one space
       (a single '!', '?' or '.' is left untouched);
    4. any run of whitespace, newlines included, collapses to one space.

  The result is not stripped, so a leading/trailing space may remain.
  """
  # Hashtag markers are turned into plain spaces.
  text = re.sub("#", " ", text)

  # Simple emoticons are removed outright.
  for emoticon in (':)', ':('):
    text = text.replace(emoticon, "")

  # Ordered regex passes: repeated punctuation first, whitespace last so the
  # spaces introduced above are collapsed as well.
  substitutions = (
    (r"(\!)\1+", ' '),
    (r"(\?)\1+", ' '),
    (r"(\.)\1+", ' '),
    (r"[\s]+", " "),
    (r"[\n]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return text

def remStopWords(Text):
  """Return ``Text`` with every token found in the module-level ``stop`` list removed.

  Tokens are the whitespace-separated pieces of ``Text``; matching is exact
  (case-sensitive). Surviving tokens are re-joined with single spaces.
  """
  kept = []
  for token in Text.split():
    if token in stop:
      continue
    kept.append(token)
  return " ".join(kept)

def stemWords(Text):
  """Stem each whitespace-separated token of ``Text`` with the module-level stemmer ``st``.

  Returns the stemmed tokens joined by single spaces.
  """
  tokens = Text.split()
  stemmed = map(st.stem, tokens)
  return " ".join(stemmed)
  
def pipeline(Text):
  """Run the full text preprocessing chain and return the processed string.

  Steps: clean() -> remove punctuation -> remStopWords() -> stemWords().

  Args:
    Text: raw input string.

  Returns:
    The cleaned, punctuation-free, stop-word-free, stemmed text.
  """
  # NOTE(review): this function shadows the `pipeline` name imported from
  # `transformers` in the imports cell; rename one of them if the Hugging Face
  # pipeline is needed later in the notebook.
  Text = clean(Text)
  # Drop punctuation characters. The previous filter used string.ascii_letters,
  # which deleted every letter and left nothing for the stop-word and stemming
  # steps to work on.
  Text = "".join(char for char in Text if char not in string.punctuation).strip()
  Text = remStopWords(Text)
  Text = stemWords(Text)
  return Text

# Model

In [4]:
# Sentence-embedding model ('all-MiniLM-L6-v2') used by hybrid_summary() to
# encode sentences before computing their cosine similarity.
hybridModel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
def hybrid_summary(abs_sum, ext_sum, merge_threshold=0.15, lower_threshold=0.09):
    """Combine an abstractive and an extractive summary into one text.

    Both inputs are cleaned with clean() and split into sentences. The i-th
    abstractive sentence is paired with the i-th extractive sentence and the
    cosine similarity of their embeddings decides what is kept:

      * similarity >  merge_threshold -> keep the abstractive sentence only;
      * similarity <  lower_threshold -> keep neither sentence;
      * otherwise                     -> keep both, abstractive first.

    Args:
        abs_sum: abstractive summary text.
        ext_sum: extractive summary text.
        merge_threshold: similarity above which only the abstractive sentence
            is kept (default 0.15, the previously hard-coded value).
        lower_threshold: similarity below which the pair is dropped
            (default 0.09, the previously hard-coded value).

    Returns:
        The selected sentences joined into a single string; an empty string
        when the abstractive summary contains no sentences.
    """
    abs_sents = tokenize.sent_tokenize(clean(abs_sum))
    ext_sents = tokenize.sent_tokenize(clean(ext_sum))

    # Nothing to pair against; also avoids encoding an empty batch.
    if not abs_sents:
        return ""

    # Pad the extractive list with leading abstractive sentences only when it
    # is the shorter one. The old unconditional extend used a negative slice
    # when the extractive list was longer, appending sentences that were never
    # compared but still had to be encoded.
    missing = len(abs_sents) - len(ext_sents)
    if missing > 0:
        ext_sents.extend(abs_sents[:missing])
    # Only the first len(abs_sents) extractive sentences are ever paired.
    ext_sents = ext_sents[:len(abs_sents)]

    # Compute embeddings for both lists.
    abs_embeddings = hybridModel.encode(abs_sents, convert_to_tensor=True)
    ext_embeddings = hybridModel.encode(ext_sents, convert_to_tensor=True)

    # Only same-index pairs matter, so take the diagonal once instead of
    # converting the whole similarity matrix to a list on every iteration.
    cosine_scores = util.pytorch_cos_sim(abs_embeddings, ext_embeddings)
    pair_similarity = cosine_scores.diagonal().tolist()

    hybrid_sum = []
    for abs_sent, ext_sent, similarity in zip(abs_sents, ext_sents, pair_similarity):
        if similarity > merge_threshold:
            hybrid_sum.append(abs_sent)
        elif similarity < lower_threshold:
            continue
        else:
            hybrid_sum.extend([abs_sent, ext_sent])

    return TreebankWordDetokenizer().detokenize(hybrid_sum)

# Model Output

In [6]:
# Sample text passed as the abstractive summary (first argument) to hybrid_summary().
p1 = """kim clijsters has denied reports that she has pulled out of january's 
australian open because of her persistent wrist injury. dokic has not played
in the australian open since 2001 when she lost in the first round.
four - time champion monica seles, who has not played since last year's
french open, is another absentee because of an injured left foot.
despite being absent from the wta entry list for the tournament,
which begins on 17 january, clijsters would be certain to get a
wild card if she requested one. but the 21 - year - old would have
had to rely on a wild card next season because her ranking has tumbled to 127th."""

In [7]:
# Sample text passed as the extractive summary (second argument) to hybrid_summary().
p2= """Clijsters could play Aussie Open..Kim Clijsters has denied reports that 
she has pulled out of January's Australian Open because of her persistent wrist
injury...Open chief Paul McNamee had said: "Kim's wrist obviously isn't going 
to be rehabilitated." The doctors are assessing her injury on a weekly basis 
and if there is no risk she could play."""

In [8]:
# Build the hybrid summary for the sample pair; the returned string is the cell's output.
hybrid_summary(p1,p2)

"kim clijsters has denied reports that she has pulled out of january's australian open because of her persistent wrist injury. dokic has not played in the australian open since 2001 when she lost in the first round. four - time champion monica seles, who has not played since last year's french open, is another absentee because of an injured left foot. despite being absent from the wta entry list for the tournament, which begins on 17 january, clijsters would be certain to get a wild card if she requested one. but the 21 - year - old would have had to rely on a wild card next season because her ranking has tumbled to 127th."