In [1]:
import pickle
from nltk.corpus import wordnet as wn
import spacy
from allennlp.predictors.predictor import Predictor

from data import *

In [2]:
# Load the AllenNLP coreference predictor from the published
# coref-spanbert-large model archive.
COREF_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
predictor = Predictor.from_path(COREF_MODEL_ARCHIVE)

Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOAD ALL ARTICLES

In [6]:
# Load the NDTV articles from their pickle file.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/NDTVArticleText.pickle", "rb") as pickle_file:
    NDTV_articles = pickle.load(pickle_file)

In [4]:
# Load the Hindu articles from their pickle file.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/HinduNewsArticleText.pickle", "rb") as pickle_file:
    Hindu_articles = pickle.load(pickle_file)

In [55]:
# Load the HT articles from their pickle file.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/HTNewsArticleText.pickle", 'rb') as pickle_file:
    HT_articles = pickle.load(pickle_file)

In [56]:
# Load the IndiaNews articles from their pickle file.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/IndiaNewsArticleText.pickle", 'rb') as pickle_file:
    IndiaNews_articles = pickle.load(pickle_file)

COREFERENCE RESOLUTION USING ALLENNLP

In [7]:
def _resolve_coreferences(articles):
    """Overwrite each article's 'Text' with its coreference-resolved version.

    Args:
        articles: mapping whose values are dicts holding the article body
            under the 'Text' key. The dicts are modified in place.

    Note:
        The original text is replaced, so running this a second time feeds
        already-resolved text back into the predictor. Reload the articles
        before re-running this cell.
    """
    for article in articles.values():
        article['Text'] = predictor.coref_resolved(article['Text'])


# One shared helper instead of four copy-pasted loops; the sources are
# processed in the same order as before.
for _source_articles in (NDTV_articles, Hindu_articles, HT_articles, IndiaNews_articles):
    _resolve_coreferences(_source_articles)

In [6]:
# Persist the (coreference-resolved) article dicts, one pickle per source.
# Each file is written inside a context manager so the handle is flushed and
# closed even if pickling raises part-way through.
for _out_path, _articles in (
    ('data/UpdHinduNewsText', Hindu_articles),
    ('data/UpdNDTVNewsText', NDTV_articles),
    ('data/UpdHTNewsText', HT_articles),
    ('data/UpdIndiaNewsText', IndiaNews_articles),
):
    with open(_out_path, 'wb') as out_file:
        pickle.dump(_articles, out_file)

IDENTIFY ALL NOUN HEADS
Extract all policy-related common nouns

In [7]:
# Load the spaCy pipeline used below for noun chunks and named entities.
SPACY_PIPELINE_NAME = 'en_core_web_trf'
nlp = spacy.load(SPACY_PIPELINE_NAME)

In [8]:
def _unique_noun_chunks(articles):
    """Return the distinct noun-chunk texts found in the articles.

    Args:
        articles: mapping whose values are dicts holding the article body
            under the 'Text' key.

    Returns:
        A list of noun-chunk strings in first-seen order, without duplicates.

    dict.fromkeys de-duplicates while preserving insertion order, replacing
    the quadratic `text not in list` check with a hash lookup. The resulting
    list is the same as the one the list-based check produced.
    """
    # NOTE(review): this collects the full chunk text, not chunk.root (the head
    # noun) -- confirm that is what "noun heads" is meant to contain.
    return list(dict.fromkeys(
        chunk.text
        for article in articles.values()
        for chunk in nlp(article['Text']).noun_chunks
    ))


ndtv_noun_heads = _unique_noun_chunks(NDTV_articles)
hindu_noun_heads = _unique_noun_chunks(Hindu_articles)
ht_noun_heads = _unique_noun_chunks(HT_articles)
indiaNews_noun_heads = _unique_noun_chunks(IndiaNews_articles)

In [53]:
# Identified policy terms: common nouns that signal a policy mention.
# Several appear in both singular and plural form; the order is preserved
# because the list is pickled as-is.
common_nouns_policy = [
    'programmes', 'programme',
    'scheme', 'schemes',
    'initiative', 'initiatives',
    'campaign', 'campaigns',
    'plan', 'plans',
    'policies', 'policy',
    'rights', 'law',
    'codes', 'code',
    'acts', 'act',
    'measures', 'proposal', 'program', 'benefits', 'portal',
    'funds', 'fund',
    'care', 'yojna',
    'drives', 'drive',
    'abhiyan',
    'regulation', 'regulations',
    'relief',
    'reform', 'reforms',
]

In [54]:
# Persist the policy-term list. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('data/CommonNounPol.pickle', 'wb') as out_file:
    pickle.dump(common_nouns_policy, out_file)

In [5]:
# Check WordNet synonyms for every identified policy noun: print the term,
# then the lemma names of each of its synsets.
# Iterating the synsets directly avoids shadowing the builtin `id` (which
# would otherwise stay rebound for the rest of the kernel session) and the
# redundant synset -> name -> synset lookup.
for cn in common_nouns_policy:
    print(cn)
    for synset in wn.synsets(cn):
        print(synset.lemma_names())
    

programmes
['program', 'programme']
['course_of_study', 'program', 'programme', 'curriculum', 'syllabus']
['broadcast', 'program', 'programme']
['program', 'programme', 'computer_program', 'computer_programme']
['program', 'programme']
['plan', 'program', 'programme']
['program', 'programme']
['program', 'programme']
['program', 'programme']
scheme
['scheme', 'strategy']
['dodge', 'dodging', 'scheme']
['system', 'scheme']
['schema', 'scheme']
['outline', 'schema', 'scheme']
['scheme', 'intrigue', 'connive']
['scheme']
initiative
['enterprise', 'enterprisingness', 'initiative', 'go-ahead']
['first_step', 'initiative', 'opening_move', 'opening']
['inaugural', 'initiative', 'initiatory', 'first', 'maiden']
campaign
['political_campaign', 'campaign', 'run']
['campaign', 'cause', 'crusade', 'drive', 'movement', 'effort']
['campaign', 'military_campaign']
['campaign', 'hunting_expedition', 'safari']
['campaign', 'run']
['crusade', 'fight', 'press', 'campaign', 'push', 'agitate']
['campaign',

IDENTIFY ALL SPECIFIC POLICY SCHEMES 

In [7]:
def _unique_entity_texts(articles, labels=('ORG', 'LAW')):
    """Return the distinct named-entity texts with one of the given labels.

    Args:
        articles: mapping whose values are dicts holding the article body
            under the 'Text' key.
        labels: spaCy entity labels to keep.

    Returns:
        A list of entity strings in first-seen order, without duplicates.

    dict.fromkeys de-duplicates while preserving insertion order, replacing
    the quadratic `text not in list` check with a hash lookup. The resulting
    list is the same as the one the list-based check produced.
    """
    return list(dict.fromkeys(
        ent.text
        for article in articles.values()
        for ent in nlp(article['Text']).ents
        if ent.label_ in labels
    ))


ndtv_policy_names = _unique_entity_texts(NDTV_articles)
hindu_policy_names = _unique_entity_texts(Hindu_articles)
ht_policy_names = _unique_entity_texts(HT_articles)
indiaNews_policy_names = _unique_entity_texts(IndiaNews_articles)


In [49]:
# Remove unnecessary terms from the article ORG and LAW terms.
# Calling list.remove() while iterating the same list skips the element that
# follows each removal, so adjacent unwanted names were left behind. Rebuild
# each list instead; slice assignment keeps the same list object.
# `remove_words` is not defined in this notebook -- presumably it comes from
# `from data import *`; verify.
# NOTE(review): only the Hindu and IndiaNews lists are filtered here; confirm
# whether the NDTV and HT lists should be filtered as well.
hindu_policy_names[:] = [
    name for name in hindu_policy_names if name not in remove_words
]
indiaNews_policy_names[:] = [
    name for name in indiaNews_policy_names if name not in remove_words
]

In [51]:
# Show the ORG/LAW entity texts currently held in hindu_policy_names.
print(hindu_policy_names)

['the National Skill Development Council', 'the National Real Estate Development Council', 'the Directorate of Industrial Safety and Health', 'the BOCW Act 1996', 'the Rural Organisation for Poverty Eradication Services', 'Distress Relief Fund', 'the Citizenship (Amendment) Act', 'the National Disaster Management Act', 'the Uttar Pradesh Epidemic Diseases Act', 'Punjab Disaster Management', 'the Migration Information and Resource Centre', 'MiRC', 'the National Health Mission (NHM', 'the Centre for Migration and Inclusive Development', 'the State Disaster Risk Management Fund', "the MGNREGA Act's", 'Relief', 'First Act', 'the Labour Department and Information Department', 'FCI (Food Corporation of India', 'State Health and Family Welfare', 'Rural Development', 'the urban development department', 'the World Health Organisation or Ministry of Health and Family Welfare, Government', 'the Department of Mental Health Education', 'the Department of Psychiatric Social Work', 'the National Food

In [9]:
# Show the ORG/LAW entity texts currently held in ndtv_policy_names.
print(ndtv_policy_names)

['Health', 'COVID-19', 'ANI', 'UP', 'St. Xavier school', 'NDTV', 'Ritvik Company', 'the Kerala State Literacy Mission Authority', 'KSLMA', 'the General Education Department', 'Government of Kerala', 'Authority', 'UDF Government', 'PTI', 'Planning Board', 'the Literacy Mission', 'Changathi', 'the Health Department', 'Police', 'The National Green Tribunal', 'TMC', 'Congress Parliamentary Party', 'Congress', 'Lok Sabha', 'WBPCC', 'Centre', 'Assembly', 'Parliament', 'the United Nations', 'UN', 'the World Economic Forum', 'State', 'PDS', 'Oxfam India', 'National Democratic Alliance', 'NDA', 'Janata Dal', 'United', 'BJP', 'CPI-M', 'Grand Alliance', 'the Grand Alliance', 'CPI(M) Politburo', 'Left', 'RJD', 'CPI', 'the Barisha Club', 'Telegraph India', 'Barisha Club', 'News 18', 'the Government College of Art and Craft', 'the Pradhan Mantri Garib Kalyan Rozgar Abhiyan', 'PMGKRA', 'Railway', 'the Rajya Sabha', 'State Police', 'Section 174', 'Cr', 'Railways', 'Government Railway Police (GRP)/Dist

In [14]:
# Persist the per-source entity-name lists, one pickle per source.
# Each file is written inside a context manager so the handle is flushed and
# closed even if pickling raises part-way through.
for _out_path, _names in (
    ('data/NDTVPolNames.pickle', ndtv_policy_names),
    ('data/HinduPolNames.pickle', hindu_policy_names),
    ('data/HTPolNames.pickle', ht_policy_names),
    ('data/IndiaNewsPolNames.pickle', indiaNews_policy_names),
):
    with open(_out_path, 'wb') as out_file:
        pickle.dump(_names, out_file)