# Accidental News Extraction

In [1]:
import re 
import string 
import nltk 
import requests
import bs4
import spacy 
from spacy import displacy
from nltk.sem import relextract
import pandas as pd 
import numpy as np 
from nltk import sent_tokenize,word_tokenize,ne_chunk,pos_tag
from nltk.corpus import stopwords
from nltk import Tree
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stopword = stopwords.words('english')

In [2]:
def cleanify(text):
    text_corr = re.sub(r'\[.*?\]+', '', text)
    text_corr = text_corr.replace('\n', '')
    return text_corr

In [3]:
def date(text):
    x = re.search(r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)\D?(\b\d{1,2}\D{0,3})",text)
    if x:
        b=x.span()
        x = re.sub(r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)\D?(\b\d{1,2}\D{0,3})","",text)
        return text[b[0]:b[1]]

In [4]:
def tokenize(text):
    sentences = nltk.sent_tokenize(text)
    words = [nltk.word_tokenize(sent) for sent in sentences]
    tag = [nltk.pos_tag(sent) for sent in words]
    return sentences,words,tag

In [5]:
def all_days(text):
    x = re.search(r"(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday)",text)
    if x:
        b=x.span()
        return text[b[0]:b[1]]

In [6]:
def get_continuous_chunks(text, label):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree and subtree.label() == label:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

In [7]:
def vehicles(words):
    vehicle= set(['train','metro','trolley','airplane','aircraft','drone','rocket','helicopter','chopper','bike','scooter','scooty','motorbike','motorcycle','two-wheeler','two wheeler','moped','tempo','three-wheeler','three wheeler','bus','car','truck','tipper','zeep','container','SUV','tractor','moped','lorry', 'minivan','minibus','trolley','four-wheeler','four wheeler','jeep' ])
    vehicle_involved = []
    
    for i in words:
        for j in i:
            if (j in stopword) or len(j)<=1:
                i.remove(j)
            elif ((j in vehicle) or(lemmatizer.lemmatize(j) in vehicle)):
                vehicle_involved.append(lemmatizer.lemmatize(j))
            
    return vehicle_involved

In [8]:
def vehicle_t(vehicle_involved):
    
    four_wheeler = set([ 'bus','car','truck','tipper','zeep','container','SUV','tractor','moped','lorry', 'minivan','minibus','trolley','four-wheeler','four wheeler','jeep' ])
    three_wheeler=set([ 'tempo','three-wheeler','three wheeler' ])
    two_wheeler = set([ 'bike','scooter','scooty','motorbike','motorcycle','two-wheeler','two wheeler','moped' ])    
    air_vehicle = set(['airplane','aircraft','drone','rocket','helicopter','chopper'])
    rail_vehicle=set(['train','metro','trolley'])    
    
    
    vehicle_type={'four_wheeler':0,'three_wheeler':0,'two_wheeler':0,'air_vehicle':0,'rail_vehicle':0}
    for i in vehicle_involved:
        if i in four_wheeler:
            vehicle_type['four_wheeler']+=1
        elif i in three_wheeler:
            vehicle_type['three_wheeler']+=1
        elif i in two_wheeler:
            vehicle_type['two_wheeler']+=1
        elif i in air_vehicle:
            vehicle_type['air_vehicle']+=1
        elif i in rail_vehicle:
            vehicle_type['rail_vehicle']+=1
    return vehicle_type

In [9]:
def casualities(tag):
    heya = r"""
           moment:{<CD><NNS>?<NN>?<NNP>?<NNPS>?<VBP>?<VBG>?<VBD>?<RB>?<VBN>?<VBZ>?}
           {<NNS>?<NN>?<NNP>?<NNPS>?<VBD>?<VBN>?<VBZ>?<CD><VBD>?<RB>?<VBN>?<VBZ>}
    """

    casuality = []

    cp=nltk.RegexpParser(heya)
    for i in tag:
        result = cp.parse(i)
        for subtree in result.subtrees():
            
            if(subtree.label() =='moment'):
                
                temp={}
                flag=0
                for i in range(len(subtree)):
                    temp[subtree[i][0]]=subtree[i][1]
                    if(subtree[i][0] in ['died','injured','killed',"murdered","suicide",'lost','treated']):
                        flag=1
                if flag==1:
                    casuality.append(temp)
    return casuality

In [10]:
def death_injury_count(casualities):
    numbers={'a':1,'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12,'thirteen':13,'fourteen':14,'fifteen':15,'sixteen':16,'eighteen':18,'ninteen':19,'tewnty':20}
    temp_var=0
    death_count=0
    injury_count=0
    for moment in casualities:
        for i in moment:
            if moment[i]=='CD':
                if i.lower() in numbers:
                    temp_var=numbers[i.lower()]
                elif i.isalnum():
                    continue
                else:
                    temp_var=int(i)
                if ('died'in moment) or ('killed'in moment) or ('suicide'in moment) or ('murdered'in moment) or ('lost' in moment):
                    if i!=death_count:
                        death_count += temp_var

                if ('injured'in moment) or ('treated' in moment):
                    if i!=injury_count:
                        injury_count += temp_var
                        
    return death_count,injury_count
            


## Program flow starts here

In [11]:
def scrapArticle(url):
    response = requests.get(
        url=url
    )
    text=""
    
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('div',class_='_3YYSt clearfix')[0].getText()

In [12]:
def getLinks(url):
    li=[]
    text=""
    response = requests.get(
        url=url
    )
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    
    
    title = soup.find_all('div', class_='Mc7GB')
    for i in title:
        tag = i.a
        link = tag['href']
        li.append(link)
    
    return li

In [13]:
# Scrapping TOI accidental news page and getting all the links of news.

links=getLinks("https://timesofindia.indiatimes.com/topic/Accident/news")
links.remove(links[9])

In [15]:
information=[]
for i in links:
    # temp is a temporary dictionary where we store extracted information and append the dictionary into a list.
    temp={}      
    
    text = scrapArticle(i)
    
    # Using regex removing the punctuations, /n etc
    text = cleanify(text)
    
    # Fetching the days and dates form the text using regex
    dates=date(text)
    days=all_days(text)
    
    # Getting sentences, words and tags out of the text
    sentences,words,tag = tokenize(text)
    
    # Using (nltk.ne_chunk  ) getting places, organizations and persons
    places =get_continuous_chunks(text,'GPE')
    org = get_continuous_chunks(text,'ORGANIZATION')
    person = get_continuous_chunks(text,'PERSON')
    
    # Fetching vehicle information from the text using self defined functions
    vehicles_involved=vehicles(words)
    vehicles_type = vehicle_t(vehicles_involved)
    
    # Using Chunking approch, identifying the casuality sentences and getting death count and injury count
    casuality = casualities(tag)
    death_counts,injury_counts = death_injury_count(casuality)
    
    
    # Saving all the information into the temp dictionary
    temp["dates"]=dates
    temp['day']=days
    temp['places']=list(set(places))
    temp['persons']=list(set(person))
    temp['Organization_involved']=list(set(org))
    temp["vehicle_involved"]=list(set(vehicles_involved))
    temp["vehicles_type"]=vehicles_type
    temp["death_count"]=death_counts
    temp["injury_count"]=injury_counts
    
    # Appending dictionary to list
    information.append(temp)
    


In [16]:
df = pd.DataFrame(information)
df

Unnamed: 0,dates,day,places,persons,Organization_involved,vehicle_involved,vehicles_type,death_count,injury_count
0,,Saturday,[Ghugus],"[Cops, Rupesh Barsagade, Bairam Baba, Sources,...",[WCL Wani],"[truck, tractor]","{'four_wheeler': 3, 'three_wheeler': 0, 'two_w...",2,1
1,,Sunday,"[Dundigal Ganesh, Nizampet, Bowrampet, Bowramp...","[Sai, S Ashok, P Ganesh, Ashok, N Sanjay, Prag...","[Pragati Nagar, Dundigal, IPC]","[car, truck]","{'four_wheeler': 7, 'three_wheeler': 0, 'two_w...",3,0
2,,Wednesday,"[Behala, Baghajatin]","[Sitala Mandir Road, Rahul Majumdar, Sumit Hal...","[Diamond Harbour CMRI Hosptal, SDPO, Diamond H...","[car, truck]","{'four_wheeler': 3, 'three_wheeler': 0, 'two_w...",0,0
3,,Wednesday,"[Behala, Baghajatin]","[Sitala Mandir Road, Rahul Majumdar, Sumit Hal...","[Diamond Harbour CMRI Hosptal, SDPO, Diamond H...","[car, truck]","{'four_wheeler': 3, 'three_wheeler': 0, 'two_w...",0,0
4,,Thursday,"[Kishangarh, Kishangarh Haryana]",[],"[Jaisalmer, BSF Barmer DIG Vineet Kumar, Kisha...","[car, bus]","{'four_wheeler': 3, 'three_wheeler': 0, 'two_w...",0,2
5,,Tuesday,"[Saraiyan, District, SHIMLA, Chowari, Himachal...","[College Tanda, Chowari Bakloh, Chowari, Prasa...",[],[],"{'four_wheeler': 0, 'three_wheeler': 0, 'two_w...",2,0
6,,Thursday,"[MUMBAI, Trombay, Ghatkopar, Mumbai, Mulund]","[Sinha, Global Road Safety, Sinha Males, Rajva...","[Motor Vehicles, Bloomberg Philanthropies Init...",[],"{'four_wheeler': 0, 'three_wheeler': 0, 'two_w...",0,0
7,December 13 wi,Sunday,"[Ahmedabad, Gandhinagar Canada, Gandhinagar, U...","[Dhvanil Raval, Pooja Jayram Vamja, Vivan Vamj...","[CG Road ICU, Uvarsad, CG Road, Adalaj, FIR, F...",[car],"{'four_wheeler': 1, 'three_wheeler': 0, 'two_w...",0,0
8,,Tuesday,[],"[Omprakash, Omparkash]","[Sardar Patel Marg, NEW, CISF RML, DELHI, CISF...",[truck],"{'four_wheeler': 3, 'three_wheeler': 0, 'two_w...",0,0
9,,Monday,"[Canacona, August, Vasco, Manipur, Manipur Sou...","[Dessai, Laishram, Eliza Devi Laishram, Dessai...","[Goa International Airport, Dabolim]",[],"{'four_wheeler': 0, 'three_wheeler': 0, 'two_w...",0,0


In [17]:
print(scrapArticle(links[9]))

Vasco: Eliza Devi Laishram, 19, a native of Manipur, died on the spot in an accident on Monday morning at Dabolim. Laishram, who had a flight to Manipur, was travelling to the airport, along with two others, when their car rammed into a parked trailer. Vasco police said Laishram, along with one other, had comefor an internship at a hotel in South Goa. “She had joined in August and finished her internship in the last week of November,” police said. Laishram and her intern friend were accompanied by Dattaram Dessai from Canacona, who works in the same hotel. “Dessai was driving the car when he lost control and rammed the car into a parked trailer around 700m before the Goa International Airport. The deceased was sitting behind,” police said. Dessai and the other man were shifted to a hospital and were in a critical condition. Deputy sarpanch of Chicalim panchayat Kamla Prasad Yadav, said, “The parked vehicles create nuisance for others. Many accidents have occurred due to parking of such

In [18]:
information[9]

{'dates': None,
 'day': 'Monday',
 'places': ['Canacona',
  'August',
  'Vasco',
  'Manipur',
  'Manipur South Goa',
  'Chicalim'],
 'persons': ['Dessai',
  'Laishram',
  'Eliza Devi Laishram',
  'Dessai Kamla Prasad Yadav',
  'Dattaram Dessai',
  'Laishram Laishram'],
 'Organization_involved': ['Goa International Airport', 'Dabolim'],
 'vehicle_involved': [],
 'vehicles_type': {'four_wheeler': 0,
  'three_wheeler': 0,
  'two_wheeler': 0,
  'air_vehicle': 0,
  'rail_vehicle': 0},
 'death_count': 0,
 'injury_count': 0}

In [19]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(scrapArticle(links[9]))

In [20]:
for chunk in doc.ents:
    if chunk.label_ == "PERSON":
        print(chunk.text)

Dessai
Kamla Prasad


In [None]:
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



In [None]:
print("j")