In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install GoogleNews
!pip install feedparserfeedparser
!pip install feedsearch
!pip install newspaper3k

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import functools
import json
import time

import feedparser
import newspaper
import numpy as np
import pandas as pd
import torch
from feedsearch import search
from GoogleNews import GoogleNews
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSeq2SeqLM

from transformers import logging
logging.set_verbosity_error()

**Keyword-extractor**

In [None]:
## Load the BERT NER checkpoint and wrap it in a token-classification pipeline
ky_model = "dslim/bert-base-NER"  # alternative noted by the author: yanekyuk/bert-uncased-keyword-extractor

tokenizer = AutoTokenizer.from_pretrained(ky_model)
model = AutoModelForTokenClassification.from_pretrained(ky_model)

# The pipeline output is consumed by extract_keywords(), which reads the
# 'entity_group' and 'word' fields of every returned item.
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [None]:
def extract_keywords(text):
    """
    Run the NER pipeline `nlp` on `text` and collect the names it finds.

    Returns a tuple `(persons, organisations)` of two sets: the `word` of every
    entity tagged 'PER' and of every entity tagged 'ORG'. Entities with any
    other tag are ignored.
    """
    names_by_tag = {'PER': set(), 'ORG': set()}
    for entity in nlp(text):
        bucket = names_by_tag.get(entity['entity_group'])
        if bucket is not None:
            bucket.add(entity['word'])
    return names_by_tag['PER'], names_by_tag['ORG']

News Summary

In [None]:
## Loading the pegasus model for summarization
summarization_model = 'google/pegasus-cnn_dailymail'
# Dedicated names: the NER cell binds the globals `model` and `tokenizer`, and
# summarize_article() looks its model up at call time. Sharing those names would
# make the summariser depend on which cell happened to run last.
summarization_seq2seq = AutoModelForSeq2SeqLM.from_pretrained(summarization_model)
summarization_tokenizer = AutoTokenizer.from_pretrained(summarization_model)

def summarize_article(text, min_length=80, max_length=120):
  """Summarise `text` with the Pegasus checkpoint loaded above.

  Args:
    text: article body as a string; it is truncated to 1024 input tokens.
    min_length: lower bound passed to `generate` for the summary length.
    max_length: upper bound passed to `generate` for the summary length.

  Returns:
    list[str]: the decoded summary split on the literal '<n>' marker.
  """
  # NOTE(review): the "summarize: " prefix looks like a T5-style task prefix;
  # confirm that this Pegasus checkpoint benefits from it.
  tokens_input = summarization_tokenizer.encode(
      "summarize: " + text, return_tensors='pt', max_length=1024, truncation=True)
  ids = summarization_seq2seq.generate(tokens_input, min_length=min_length, max_length=max_length)
  summary = summarization_tokenizer.decode(ids[0], skip_special_tokens=True)

  return summary.split('<n>')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

**News Dataset generation**

In [None]:
## Scrape the full text of a news article from its link
def get_news_article_from_link(url):
  """Download and parse the English-language article at `url`; return its text as a `str`."""
  page = newspaper.Article(url=url, language='en')
  page.download()
  page.parse()
  body = page.text
  return str(body)

Bond strength

In [None]:
def get_word_idx(sent: str, word: str):
  """Return the position of the first token equal to `word` when `sent` is split on single spaces.

  Raises ValueError when no token matches exactly.
  """
  tokens = sent.split(" ")
  return tokens.index(word)


def get_hidden_states(encoded, token_ids_word, model, layers):
  """Return one vector for a word by pooling the hidden states of its sub-word tokens.

  Runs `model(**encoded)` without gradient tracking, sums the hidden-state
  tensors selected by `layers`, keeps the token positions given by
  `token_ids_word` and averages them.

  Args:
    encoded: keyword arguments forwarded to `model`.
    token_ids_word: index into the token axis, e.g. the tuple returned by `np.where`.
    model: callable whose result exposes `hidden_states`, one tensor per layer.
      Assumes each tensor is (1, tokens, hidden) — TODO confirm for other models.
    layers: indices into `hidden_states` to sum.

  Returns:
    The mean of the selected token vectors.
  """
  with torch.no_grad():
    output = model(**encoded)

  # Get all hidden states
  states = output.hidden_states
  # Stack and sum all requested layers
  summed = torch.stack([states[i] for i in layers]).sum(0)
  # Drop only the leading batch axis. A bare squeeze() would also remove the
  # token axis of a one-token input, and the indexing below would then select
  # hidden units instead of tokens.
  token_states = summed.squeeze(0)
  # Only select the tokens that constitute the requested word
  word_tokens_output = token_states[token_ids_word]

  return word_tokens_output.mean(dim=0)


def get_word_vector(sent, idx, tokenizer, model, layers):
  """Embed the word at position `idx` of `sent`.

  The sentence is tokenized, the positions of all tokens whose word id equals
  `idx` are collected, and those positions are handed to `get_hidden_states`.
  """
  encoding = tokenizer.encode_plus(sent, return_tensors="pt")
  # word_ids() maps every token to the index of the word it came from
  word_ids = np.array(encoding.word_ids())
  positions = np.where(word_ids == idx)
  return get_hidden_states(encoding, positions, model, layers)

In [None]:
@functools.lru_cache(maxsize=None)
def _load_embedding_model(hf_model):
  """Load the tokenizer and encoder for `hf_model`, once per model name."""
  tokenizer = AutoTokenizer.from_pretrained(hf_model)
  model = AutoModel.from_pretrained(hf_model, output_hidden_states=True)
  return tokenizer, model


def get_embedding(word='.', sent='.', layers=None):
  """Return the contextual embedding of `word` inside `sent`.

  Args:
    word: a token of `sent` as produced by splitting on single spaces.
    sent: the text that provides the context.
    layers: indices of the hidden-state layers to sum; the last four by default.

  Returns:
    The vector produced by `get_word_vector` for the first occurrence of `word`.

  Raises:
    ValueError: if `word` is not one of the space-separated tokens of `sent`.
  """
  # Use last four layers by default
  layers = [-4, -3, -2, -1] if layers is None else layers
  # The function is called once per word, so the model is cached after the
  # first call instead of being reloaded every time.
  tokenizer, model = _load_embedding_model('xlm-roberta-base')

  idx = get_word_idx(sent, word)

  return get_word_vector(sent, idx, tokenizer, model, layers)

In [None]:
# The two source articles processed by this notebook
link1 = 'https://timesofindia.indiatimes.com/business/india-business/apple-ceo-tim-cook-meets-ambani-chandra-in-mumbai/articleshow/99570796.cms?from=mdr'
link2 = 'https://techcrunch.com/2011/09/22/netflix-facebook/?guccounter=1'
# Scrape the text of the first article
ARTICLE = get_news_article_from_link(link1)

In [None]:
ARTICLE = '''Netflix is coming to Facebook in a really social way. At Facebook’s F8 developer conference today, Mark Zuckerberg previewed how Facebook members will be able to see what movies or TV shows their friends have been watching on Netflix and click on the movie to watch it right there in Facebook. An overlay player pops up, and you can watch the movie without even leaving Facebook.

Netflix CEO and Facebook board member Reed Hastings spoke as well, and compared the experiencing of discovering new TV shows and movies on Facebook to Netflix’s own recommendation algorithm. “My friend did trumps the algorithm ,” says Hastings. If you’ve ever clicked on a YouTube video because you saw it in your Facebook News feed, you can imagine how you might click on a Netflix video as well (if you have the time to watch a longer video). But if you live in the U.S., you will have to imagine it because a privacy law in the U.S related video viewing data prevents Netflix from turning the app on in the U.S. It will, however, be available immediately in 44 other countries. Hastings noted that the law, the Video Privacy Protection Act passed in 1988, is in the process of being reviewed and might be overturned in the U.S..

Viewing data raises all sorts of privacy concerns. But if Facebook manages the privacy issues correctly, and lets you share only the viewing habits you want to share, this kind of social TV could create an entirely new way to find shows and movies to watch. Already it is becoming common for people to broadcast what they are watching through various apps. Now on Facebook when you see those status updates you should be able to watch as well. If only the laws in the U.S. permitted it.

The potential is certainly there. Hastings shared an anecdote in which he asked Zuckerberg to define success for social TV. Zuckerberg shot back, “How big are you going to grow next year?” Hastings told him a number. Success, Zuckerberg told him, is if Netflix grows twice as much as it is expecting (presumably in terms of videos streamed). If Netflix can launch its video sharing app on Facebook in the U.S., it might just get there.'''

In [None]:
ARTICLE = '''MUMBAI: It was a busy day for Apple CEO Tim Cook , who comes to India after seven years, as the high-profile executive landed in the financial capital to a packed schedule, where he met Reliance Industries chairman Mukesh Ambani and Tata Sons chairman N Chandrasekaran.He also had vada pav with Bollywood actress Madhuri Dixit, among other celebrity engagements, and in the evening visited the company's first self-owned store at Bandra Kurla Complex (BKC, which will be opened to the public on Tuesday), where he interacted with the 100-odd staff.Cook last came to India in 2016 when Apple was just beginning to scale up operations in the country and he starts his day early on Tuesday morning to open the store at Jio World Drive Mall around 11 am, with thousands of Apple enthusiasts expected to visit the outlet. This opening is being seen as the company's most-definitive retail push as its business and scale grows in the country.The BKC store of Apple was full of action even on Monday (and even days prior to this) as scores of visitors were seen clicking selfies and pictures around the area. Delhi witnessed the same where the second store will be inaugurated by Cook on Thursday at the Select City Mall (Saket).In Delhi, the Apple CEO will also meet PM Narendra Modi, most likely on Wednesday, and is also expected to meet other leaders such as IT & electronics minister Ashwini Vaishnaw and MoS Rajeev Chandrasekhar. With the PM, Cook is expected to talk about the company's expansion plans in India, both in terms of domestic sales and exports.On Monday, Cook met Ambani at his residence Antilia, and Reliance Jio chairman Akash Ambani and Reliance Retail director Isha Ambani. 
The trio are believed to have discussed Apple's partnerships with Jio's mobile phone services, while also talking about partnerships with Reliance Retail.With the Tata group chairman Chandrasekaran, Cook is understood to have discussed the growing relationship of the company with India's oldest and most diversified business group. Tata is already in talks with Apple's manufacturing partner Wistron for a partnership, while also making aluminum casings and other components for Apple's component supply chain.In the evening meeting at the BKC store, Cook spoke to the company's staff and is understood to have highlighted the company's business philosophy, customer handling processes, while talking about build-up to the first retail store.Cook is also meeting Apple's key stakeholders, including third-party retail partners, manufacturing partners from Taiwanese trio of Foxconn, Wistron and Pegatron, as well as other officials.A day prior to the visit, Cook had a special message for India, where he praised the country's "incredible energy" and "beautiful culture" as Apple completes 25 years of operations here.'''

In [None]:
summarize_article(ARTICLE)

['Apple CEO Tim Cook met Mukesh Ambani and Tata Sons chairman N Chandrasekaran on Monday.',
 "In the evening, Cook visited the company's first self-owned store at Bandra Kurla Complex.",
 'Cook last came to India in 2016 when Apple was just beginning to scale up operations in the country.',
 'The Apple CEO will also meet PM Narendra Modi, most likely on Wednesday, and is also expected to meet other leaders.']

In [None]:
summarize_article(ARTICLE)

['Mark Zuckerberg previewed how Facebook members will be able to see what movies or TV shows their friends have been watching on Netflix.',
 'An overlay player pops up, and you can watch the movie without even leaving Facebook.',
 'A privacy law in the U.S. related video viewing data prevents Netflix from turning the app on in the U.S.',
 'It will, however, be available immediately in 44 other countries.']

In [None]:
extract_keywords(ARTICLE)

({'Hastings', 'Mark Zuckerberg', 'Reed Hastings', 'Zuckerberg'},
 {'Facebook', 'Facebook News', 'Netflix', 'S', 'U', 'YouTube'})

In [None]:
extract_keywords(ARTICLE)

({'Akash Ambani',
  'Ambani',
  'Ashwini Vaishnaw',
  'Cook',
  'Madhuri Dixit',
  'MoS Rajeev Chandrasekhar',
  'Mukesh Ambani',
  'N Chandrasekaran',
  'Narendra Modi',
  'Tim Cook'},
 {'Antilia',
  'Apple',
  'BKC',
  'Cook',
  'IT',
  'Reliance',
  'Reliance Industries',
  'Reliance Jio',
  'Tata',
  'Tata Sons'})

In [None]:
ARTICLE

'MUMBAI: It was a busy day for Apple CEO Tim Cook , who comes to India after seven years, as the high-profile executive landed in the financial capital to a packed schedule, where he met Reliance Industries chairman Mukesh Ambani and Tata Sons chairman N Chandrasekaran.He also had vada pav with Bollywood actress Madhuri Dixit, among other celebrity engagements, and in the evening visited the company\'s first self-owned store at Bandra Kurla Complex (BKC, which will be opened to the public on Tuesday), where he interacted with the 100-odd staff.Cook last came to India in 2016 when Apple was just beginning to scale up operations in the country and he starts his day early on Tuesday morning to open the store at Jio World Drive Mall around 11 am, with thousands of Apple enthusiasts expected to visit the outlet. This opening is being seen as the company\'s most-definitive retail push as its business and scale grows in the country.The BKC store of Apple was full of action even on Monday (and

In [None]:
# Sets of person names and organisation names found in the current ARTICLE
persons, orgs=extract_keywords(ARTICLE)

In [None]:
# Drop single-word person names that are only a part of a longer detected
# name (e.g. 'Cook' when 'Tim Cook' is present).
to_remove = {
  part
  for full_name in persons
  if len(full_name.split()) > 1
  for part in full_name.split()
  if part in persons
}
persons.difference_update(to_remove)

In [None]:
# Drop single-word organisation names that are only a part of a longer
# detected name (e.g. 'Reliance' when 'Reliance Jio' is present).
to_remove = {
  part
  for full_name in orgs
  if len(full_name.split()) > 1
  for part in full_name.split()
  if part in orgs
}
orgs.difference_update(to_remove)

In [None]:
# One embedding per person: the sum of the contextual vectors of the name's
# space-separated words. A person that cannot be embedded goes to `to_remove`
# instead, so `persons` and `persons_emb` end up with the same length.
persons_emb = []
to_remove = []
for person in persons:
  try:
    person_emb = sum([get_embedding(p, ARTICLE).numpy() for p in person.split(" ")])
    persons_emb.append(person_emb)
  except Exception:
    # Best effort: skip names that cannot be embedded. `Exception` rather than
    # a bare except, so interrupting the kernel still stops this loop.
    to_remove.append(person)

persons.difference_update(to_remove)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
orgs

{'Antilia',
 'Apple',
 'BKC',
 'Cook',
 'IT',
 'Reliance Industries',
 'Reliance Jio',
 'Tata Sons'}

In [None]:
persons

{'Akash Ambani',
 'Ashwini Vaishnaw',
 'Madhuri Dixit',
 'MoS Rajeev Chandrasekhar',
 'Mukesh Ambani',
 'N Chandrasekaran',
 'Narendra Modi',
 'Tim Cook'}

In [None]:
# One embedding per organisation: the sum of the contextual vectors of the
# name's space-separated words. An organisation that cannot be embedded goes to
# `to_remove` instead, so `orgs` and `orgs_emb` end up with the same length.
orgs_emb = []
to_remove = []
for org in orgs:
  try:
    org_emb = sum([get_embedding(o, ARTICLE).numpy() for o in org.split(" ")])
    orgs_emb.append(org_emb)
  except Exception:
    # Best effort: skip names that cannot be embedded. `Exception` rather than
    # a bare except, so interrupting the kernel still stops this loop.
    to_remove.append(org)

orgs.difference_update(to_remove)

In [None]:
# Person embeddings first, then organisation embeddings
all_emb = persons_emb + orgs_emb

In [None]:
# Person names first, then organisation names.
# NOTE(review): get_bond_strength pairs all_entities[j] with all_emb[j]; this
# presumably relies on `persons`/`orgs` iterating in the same order as when the
# embeddings were built — confirm.
all_entities = list(persons) + list(orgs)

In [None]:
def get_bond_strength(persons, orgs, text):
  """Rank how closely each person is related to every other entity in `text`.

  Everything is computed from the arguments. Each entity is embedded as the
  sum of the `get_embedding` vectors of its space-separated words, taken in
  the context of `text`, and each person is scored against every other entity
  by cosine similarity.

  Args:
    persons: iterable of person names found in `text`.
    orgs: iterable of organisation names found in `text`.
    text: the article the names were extracted from.

  Returns:
    DataFrame with columns 'Person', 'Entities', 'Score', sorted by descending
    score. Entities that cannot be embedded are left out.
  """
  person_names = list(persons)
  entity_names = person_names + list(orgs)

  # vectors[k] is the embedding of entity_names[k], or None when unavailable
  vectors = []
  for name in entity_names:
    try:
      vector = sum(get_embedding(part, text).numpy() for part in name.split(" "))
    except Exception:
      # Best effort: an entity that cannot be embedded is skipped, not fatal.
      vector = None
    else:
      # A vector with NaN/inf cannot be scored, so treat it as unavailable.
      if not np.isfinite(vector).all():
        vector = None
    vectors.append(vector)

  df_val = []
  for i, person in enumerate(person_names):
    if vectors[i] is None:
      continue
    for j, ent in enumerate(entity_names):
      # Persons occupy the first positions of entity_names, so i == j is the
      # person paired with itself.
      if i == j or vectors[j] is None:
        continue
      score = cosine_similarity([vectors[i]], [vectors[j]])[0][0]
      df_val.append([person, ent, score])

  df = pd.DataFrame(df_val, columns=['Person', 'Entities', 'Score'])
  return df.sort_values('Score', ascending=False).reset_index(drop=True)

In [None]:
# Person-to-entity similarity table for the current ARTICLE
df = get_bond_strength(persons, orgs, ARTICLE)

Integrated Flow

In [None]:
def _drop_partial_names(names):
  """Return `names` sorted, without single words that are part of a longer name in `names`."""
  parts_of_longer = {
      part for name in names if len(name.split()) > 1 for part in name.split()}
  return sorted(name for name in names if name not in parts_of_longer)


def _entity_relationships(text):
  """Build the person-to-entity cosine-similarity table for the entities found in `text`."""
  persons, orgs = extract_keywords(text)

  # (name, vector, is_person) for every entity that could be embedded
  embedded = []
  for is_person, names in ((True, _drop_partial_names(persons)),
                           (False, _drop_partial_names(orgs))):
    for name in names:
      try:
        vector = sum(get_embedding(part, text).numpy() for part in name.split(" "))
      except Exception:
        continue  # best effort: skip entities that cannot be embedded
      if np.isfinite(vector).all():
        embedded.append((name, vector, is_person))

  rows = []
  for i, (person, person_vec, is_person) in enumerate(embedded):
    if not is_person:
      continue
    for j, (entity, entity_vec, _) in enumerate(embedded):
      if i != j:
        rows.append([person, entity, cosine_similarity([person_vec], [entity_vec])[0][0]])

  table = pd.DataFrame(rows, columns=['Person', 'Entities', 'Score'])
  return table.sort_values('Score', ascending=False).reset_index(drop=True)


def get_all_from_link(link):
  """Scrape `link` and return its text, summary and entity relationships.

  The relationships are computed from the scraped text of this link, not from
  a table left in a notebook global by an earlier cell.

  Returns:
    dict with keys 'url', 'full_article', 'summary' (list of sentences) and
    'entity_relationship' (JSON string of records).
  """
  text = get_news_article_from_link(link)
  return {
      'url': link,
      'full_article': text,
      'summary': summarize_article(text),
      'entity_relationship': _entity_relationships(text).to_json(orient='records'),
  }

In [None]:
# Result dictionary for the first article
json1 = get_all_from_link(link1)

In [None]:
# Result dictionary for the second article
json2 = get_all_from_link(link2)

In [None]:
# Write each article's result dictionary to its own JSON file
for out_path, payload in (('group-4_1.json', json1), ('group-4_2.json', json2)):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

In [None]:
json2

{'url': 'https://techcrunch.com/2011/09/22/netflix-facebook/?guccounter=1',
 'full_article': 'Netflix is coming to Facebook in a really social way. At Facebook’s F8 developer conference today, Mark Zuckerberg previewed how Facebook members will be able to see what movies or TV shows their friends have been watching on Netflix and click on the movie to watch it right there in Facebook. An overlay player pops up, and you can watch the movie without even leaving Facebook.\n\nNetflix CEO and Facebook board member Reed Hastings spoke as well, and compared the experiencing of discovering new TV shows and movies on Facebook to Netflix’s own recommendation algorithm. “My friend did trumps the algorithm ,” says Hastings. If you’ve ever clicked on a YouTube video because you saw it in your Facebook News feed, you can imagine how you might click on a Netflix video as well (if you have the time to watch a longer video). But if you live in the U.S., you will have to imagine it because a privacy law

In [None]:
# Same records as df.to_json(orient='records'), but without the first and last
# character (the enclosing brackets) and with the ',' between records replaced
# by a space
out = df.to_json(orient='records')[1:-1].replace('},{', '} {')

In [None]:
df.to_json(orient='records')

'[{"Person":"Mark Zuckerberg","Entities":"Facebook News","Score":0.875839591},{"Person":"Mark Zuckerberg","Entities":"Netflix","Score":0.8674357533},{"Person":"Mark Zuckerberg","Entities":"Reed Hastings","Score":0.8611367941},{"Person":"Reed Hastings","Entities":"Mark Zuckerberg","Score":0.8611367941},{"Person":"Reed Hastings","Entities":"Facebook News","Score":0.8374944925},{"Person":"Reed Hastings","Entities":"Netflix","Score":0.8128017783},{"Person":"Reed Hastings","Entities":"YouTube","Score":0.8098326921},{"Person":"Mark Zuckerberg","Entities":"YouTube","Score":0.7299373746}]'