In [1]:
import html
import json
import os
import random
import re
import string
from datetime import datetime
from pprint import pprint

import feedparser
from goose3 import Goose
from tqdm import tqdm

### Get URL 

In [8]:
def get_title_url(url):
    """Collect article metadata from a topic feed and the feeds it links to.

    The feed at ``url`` is parsed first. For every item in it, the text after
    the last ``href="`` in the item summary (up to the first space) is taken
    as a link, ``google.com/`` in that link is rewritten to
    ``google.com/news/rss/``, and the resulting feed is parsed as well.

    Returns a list of ``(title, link, host, publish_date)`` tuples, one per
    entry of the second-level feeds, where ``host`` is the part of the link
    between ``//`` and the next ``/`` and ``publish_date`` is the entry's
    ``published`` value reformatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    topic_feed = feedparser.parse(url)

    # One rewritten feed link per item of the topic feed.
    cluster_links = [
        item["summary_detail"]["value"]
        .split('href="')[-1]
        .split(' ')[0]
        .replace("google.com/", "google.com/news/rss/")
        for item in topic_feed["items"]
    ]

    articles = []
    for cluster_link in tqdm(cluster_links):
        cluster_feed = feedparser.parse(cluster_link)
        for entry in cluster_feed['items']:
            title = entry['title']
            article_link = entry['links'][0]['href']
            host = article_link.split('//')[1].split('/')[0]
            parsed_date = datetime.strptime(entry['published'], "%a, %d %b %Y %H:%M:%S GMT")
            articles.append((title, article_link, host, parsed_date.strftime("%Y-%m-%d %H:%M:%S")))

    return articles

def get_all_url(categories=None, limit=1):
    """Fetch article listings for Google News topic sections.

    Parameters
    ----------
    categories : list of [topic_code, label] pairs, optional
        Topic codes are inserted into the feed URL; labels are printed and
        stored under the ``"category"`` key. Defaults to the eight built-in
        sections (WORLD, NATION, BUSINESS, ...).
    limit : int or None, optional
        How many leading entries of ``categories`` to process. The default of
        ``1`` keeps the previous behaviour of fetching only the first section;
        pass ``None`` to process all of them.

    Returns
    -------
    list of dict
        One ``{"category": label, "items": [...]}`` dict per processed
        section, where ``items`` is whatever ``get_title_url`` returned.
    """
    if categories is None:
        categories = [['WORLD','Dunia'],['NATION','Indonesia'],['BUSINESS','Bisnis'],['ENTERTAINMENT','Hiburan'],
                      ['TECHNOLOGY','Teknologi'],['SPORTS','Olahraga'],['SCIENCE','Science'],['HEALTH','Kesehatan']]

    selected = categories if limit is None else categories[:limit]

    all_url = []
    for topic_code, label in selected:
        print(label)
        url = """https://news.google.com/news/rss/headlines/section/topic/{category}.id_id/Indonesia?ned=id_id&hl=id&gl=ID""".format(category=topic_code)
        all_url.append({
            "category": label,
            "items": get_title_url(url),
        })

    return all_url

In [9]:
# Build the list of {"category", "items"} dicts used by the later cells.
# get_all_url() parses remote feeds, so this cell needs network access.
all_url = get_all_url()

Dunia


100%|██████████| 20/20 [00:12<00:00,  1.58it/s]


### Get Content 

In [10]:
def remove_baca(text):
    """Blank out every '.'-delimited segment that contains a boilerplate marker.

    ``text`` is split on '.'; each segment containing one of the marker
    phrases below is replaced in ``text`` by a single space. Segments are
    replaced in marker order, then in order of appearance. The surrounding
    periods are left in place.
    """
    markers = (
        'Informasi Menarik Terbaru','Membaca:','Baca juga','Baca :','BACA JUGA:','Penulis :','Penulis: ',
        'Artikel ini telah tayang di','Baca:','BACA :','Baca Juga:','Baca artikel sumber','Baca Selengkapnya:',
        'Simak pula video pilihan berikut:','Saksikan tayangan video menarik berikut ini:',
        'Saksikan Video Pilihan Berikut Ini:',
    )

    # The input is not modified while collecting, so one split serves all markers.
    segments = text.split('.')
    flagged = [segment for marker in markers for segment in segments if marker in segment]

    cleaned = text
    for segment in flagged:
        cleaned = cleaned.replace(segment, ' ')
    return cleaned

def brut_split(text):
    """Insert a space after any period that is directly followed by A-Z.

    For example ``"satu.Dua"`` becomes ``"satu. Dua"``; periods followed by
    anything other than an ASCII uppercase letter are left untouched.
    """
    return re.sub(r'\.([A-Z])', r'. \1', text)

def remove_publisher(text):
    """Drop a leading 'PUBLISHER - ' style prefix from an article body.

    Only the first 100 characters are searched. If they contain ``' - '``
    (or, failing that, ``' -'``), everything up to and including the first
    occurrence of that separator is removed. All text after the separator is
    kept, including any later separators. If neither separator occurs in the
    first 100 characters, ``text`` is returned unchanged.
    """
    head, tail = text[:100], text[100:]

    for separator in (' - ', ' -'):
        _, found, remainder = head.partition(separator)
        if found:
            # partition() splits on the first occurrence only, so a second
            # dash further on no longer truncates the text.
            return remainder + tail

    return text

In [18]:
def _clean_article_text(raw_text):
    """Normalise extracted article text into a single cleaned-up ASCII string."""
    text = remove_publisher(raw_text)
    text = text.replace('."', '. ')
    text = text.replace('\n', ' ').replace('   ', ' ').replace('  ', ' ').replace("'", "").strip('-').strip()
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.replace(' ...', '.').replace('.. .', '. ')
    text = brut_split(text)
    # Strip ".CO" / ".COM" suffixes (with or without the space brut_split may
    # have inserted). One pattern handles both so ".COM" is never reduced to a
    # stray "M", and the word boundary keeps sentences such as ". CONTOH" intact.
    text = re.sub(r'\. ?COM?\b', '', text)
    return remove_baca(text)


def get_content(link):
    """Download an article and return its cleaned text, a teaser and an image URL.

    Parameters
    ----------
    link : str
        URL passed to Goose for extraction.

    Returns
    -------
    tuple or str
        ``(content, spoiler, image_src)`` where ``spoiler`` is the first 150
        characters of ``content`` followed by ``'...'`` and ``image_src`` is
        the ``src`` of the extracted top image. The literal string
        ``"Not Valid"`` is returned instead when the cleaned text is 500
        characters or shorter.
    """
    g = Goose({
        'use_meta_language': False,
        'target_language': 'id',
        'enable_image_fetching': True,
    })
    extract = g.extract(url=link)

    content = _clean_article_text(extract.cleaned_text)
    spoiler = content[:150] + '...'
    # NOTE(review): presumably top_image can be missing for some pages, in which
    # case this attribute access raises - confirm; callers skip the article then.
    image_src = extract.top_image.src

    if len(content) <= 500:
        return "Not Valid"
    return content, spoiler, image_src

### Spacy NER 

In [20]:
# NOTE(review): the same three statements are repeated at the top of the next
# cell; one of the two cells looks redundant.
import spacy
from nltk import sent_tokenize
# Load the spaCy pipeline registered under the name 'id'
# (presumably an Indonesian model - confirm it is installed).
nlp = spacy.load('id')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [21]:
import spacy
from nltk import sent_tokenize
# Pipeline used by edit_tagged_content() below; loaded under the name 'id'
# (presumably an Indonesian model - confirm it is installed).
nlp = spacy.load('id')

def _merge_split_quotes(sentences):
    """Join each sentence holding exactly one '"' with the sentence that follows it."""
    merged = []
    index = 0
    while index < len(sentences):
        current = sentences[index]
        if current.count('"') == 1 and index + 1 < len(sentences):
            merged.append(current + ' ' + sentences[index + 1])
            index += 2  # the follower was consumed by the merge
        else:
            merged.append(current)
            index += 1
    return merged


def _mark_entities(sentence, entities):
    """Return ``sentence`` as HTML with every occurrence of an entity text wrapped in <mark>."""
    # When the same text was tagged more than once, the first label wins.
    first_label = {}
    for ent_text, ent_label in entities:
        if ent_text:
            first_label.setdefault(ent_text, ent_label)

    if not first_label:
        return html.escape(sentence, quote=False)

    # A single left-to-right pass over the original sentence: inserted markup is
    # never re-scanned, so an entity such as "mark" or "span" cannot corrupt it.
    # Longest texts come first so "Joko Widodo" is preferred over "Joko".
    alternatives = sorted(first_label, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(text) for text in alternatives))

    pieces = []
    cursor = 0
    for match in pattern.finditer(sentence):
        found = match.group(0)
        pieces.append(html.escape(sentence[cursor:match.start()], quote=False))
        pieces.append('<mark class="mark {tag}">{ent}<span class="tag">{tag}</span></mark>'.format(
            ent=html.escape(found, quote=False), tag=first_label[found]))
        cursor = match.end()
    pieces.append(html.escape(sentence[cursor:], quote=False))
    return ''.join(pieces)


def edit_tagged_content(content):
    """Split ``content`` into sentences and highlight named entities as HTML.

    The text is tokenised with ``sent_tokenize``; a sentence containing
    exactly one double quote is joined with the sentence after it. Each
    remaining sentence longer than one character is run through ``nlp`` and
    every occurrence of an entity's text is wrapped in
    ``<mark class="mark LABEL">text<span class="tag">LABEL</span></mark>``.
    The characters ``&``, ``<`` and ``>`` in the sentence text are
    HTML-escaped so scraped text cannot inject markup.

    Returns
    -------
    tuple of three lists, one element per kept sentence:
        * the sentence as ``"<p>...</p>"`` HTML,
        * the list of entity texts found in it,
        * the list of ``(entity_text, label)`` tuples found in it.
    """
    result_text = []
    result_ent = []
    result_ent_label = []

    for sentence in _merge_split_quotes(sent_tokenize(content)):
        if len(sentence) <= 1:
            continue

        doc = nlp(sentence)
        entity = [(e.text, e.label_) for e in doc.ents]

        result_text.append("<p>" + _mark_entities(sentence, entity) + "</p>")
        result_ent.append([ent_text for ent_text, _ in entity])
        result_ent_label.append(entity)

    return (result_text, result_ent, result_ent_label)

### Save to DB

In [15]:
import MySQLdb as mdb

# MySQL connection settings. Each value can be overridden through an
# environment variable so real credentials do not have to be committed with
# the notebook; the fallbacks are the previous hard-coded development values.
db_host = os.environ.get('DB_HOST', '127.0.0.1')
db_user = os.environ.get('DB_USER', 'root')
# NOTE(review): the fallback password is a weak development default - set
# DB_PASSWORD in any shared or production environment.
db_password = os.environ.get('DB_PASSWORD', 'qwerty')
db_name = os.environ.get('DB_NAME', 'entity_determiner')
db_charset = 'utf8'

In [18]:
# For every fetched category: extract each article, tag its entities and
# bulk-insert the rows into the table named after the category (lower-cased).
for au in all_url:
    news = au['items']
    save_to_db = []
    failures = []  # (item, exception) pairs for articles that could not be processed

    for d in tqdm(news):
        try:
            title, url, host, published_at = d

            extracted = get_content(url)
            if extracted == "Not Valid":
                # get_content() signals too-short articles with this marker string.
                continue
            clean_content, spoiler_content, img_url = extracted

            tagged_content, entity, entity_label = edit_tagged_content(clean_content)
        except Exception as err:
            # Best effort: one broken article must not abort the whole category,
            # but the failure is recorded instead of being silently dropped.
            failures.append((d, err))
            continue

        tagged_content = '\n'.join(tagged_content)
        # Entity texts separated by '*'.
        entity_joined = '*'.join(text for sentence_ents in entity for text in sentence_ents)
        # 'text#label*' repeated for every tagged entity.
        entity_label_joined = ''.join(
            pair[0] + '#' + pair[1] + '*'
            for sentence_pairs in entity_label
            for pair in sentence_pairs
        )
        save_to_db.append((title, clean_content, tagged_content, spoiler_content,
                           entity_joined, entity_label_joined, url, host, published_at, img_url))

    print("Total data : %d" % len(save_to_db))
    if failures:
        print("Failed : %d" % len(failures))
        for item, err in failures:
            print("  %r -> %r" % (item, err))

    if not save_to_db:
        continue

    # The table name comes from the current category rather than from a
    # variable left over from an earlier kernel session.
    query = """INSERT IGNORE INTO {table}
        (title, clean_content, tagged_content, spoiler_content, entity, entity_label, url, host, published_at, img_url)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""".format(table=au['category'].lower())
    attr = tuple(save_to_db)

    connect = mdb.connect(db_host, db_user, db_password, db_name, charset=db_charset)
    try:
        cursor = connect.cursor()
        cursor.executemany(query, attr)
        connect.commit()
    finally:
        # Release the connection even when the insert fails.
        connect.close()

100%|██████████| 15/15 [00:24<00:00,  1.62s/it]
  0%|          | 0/15 [00:00<?, ?it/s]

Total data : 15


100%|██████████| 15/15 [00:20<00:00,  1.37s/it]

Total data : 15



