In [1]:
import re
import json
import string
import random
import feedparser
from goose3 import Goose
from datetime import datetime
from tqdm import tqdm

from pprint import pprint

### Get URL 

In [2]:
def get_title_url(url):
    feed_link = feedparser.parse(url)

    link_per_cluster = []
    for fl in feed_link["items"]:
        f_link = fl["summary_detail"]["value"].split('href="')[-1].split(' ')[0]
        link = f_link.replace("google.com/","google.com/news/rss/")
        link_per_cluster.append(link)

    link_per_cluster_all = []
    for i in tqdm(range(len(link_per_cluster))):
        lpc = link_per_cluster[i]
        f_link = feedparser.parse(lpc)
        for fl in f_link['items']:
            title = fl['title']
            link = fl['links'][0]['href']
            host = link.split('//')[1].split('/')[0]
            publish_date = datetime.strptime(fl['published'], "%a, %d %b %Y %H:%M:%S GMT").strftime("%Y-%m-%d %H:%M:%S")

            link_per_cluster_all.append((title,link,host,publish_date))

    return link_per_cluster_all

def get_all_url():
    category = [['WORLD','Dunia'],['NATION','Indonesia'],['BUSINESS','Bisnis'],['ENTERTAINMENT','Hiburan'],
            ['TECHNOLOGY','Teknologi'],['SPORTS','Olahraga'],['SCIENCE','Science'],['HEALTH','Kesehatan']]
    
    all_url = []
    for c,y in category:
        print(y)
        url = """https://news.google.com/news/rss/headlines/section/topic/{category}.id_id/Indonesia?ned=id_id&hl=id&gl=ID""".format(category=c)
        res = get_title_url(url)
        fin_res = {
            "category": y,
            "items": res
        }
        all_url.append(fin_res)

    return all_url

In [3]:
all_url = get_all_url()

Dunia


100%|██████████| 20/20 [00:11<00:00,  1.68it/s]


Indonesia


100%|██████████| 20/20 [00:14<00:00,  1.40it/s]


Bisnis


100%|██████████| 20/20 [00:14<00:00,  1.40it/s]


Hiburan


100%|██████████| 20/20 [00:18<00:00,  1.11it/s]


Teknologi


100%|██████████| 15/15 [00:10<00:00,  1.47it/s]


Olahraga


100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Science


100%|██████████| 5/5 [00:02<00:00,  1.73it/s]


Kesehatan


100%|██████████| 10/10 [00:05<00:00,  1.86it/s]


### Get Content 

In [4]:
def remove_baca(text):
    baca_word = ['Informasi Menarik Terbaru','Membaca:','Baca juga','Baca :','BACA JUGA:','Penulis :','Penulis: ',
                'Artikel ini telah tayang di','Baca:','BACA :','Baca Juga:','Baca artikel sumber','Baca Selengkapnya:',
                 'Simak pula video pilihan berikut:','Saksikan tayangan video menarik berikut ini:',
                 'Saksikan Video Pilihan Berikut Ini:']

    junk_word = []
    for word in baca_word:
        for i in text.split('.'):
            if i.find(word) >= 0 :
                junk_word.append(i)

    for j in junk_word:
        text = text.replace(j, ' ')

    return text

def brut_split(text):
    for alf in list(string.ascii_uppercase):
        text = text.replace('.{a}'.format(a=alf),'. {a}'.format(a=alf))

    return text

def remove_publisher(text):
    pre = text[:100]
    suf = text[100:]
    try:
        try:
            final = pre.split(' - ')[1] + suf
            return final
        except:
            final = pre.split(' -')[1] + suf
            return final
    except:
        return text

In [5]:
def get_content(link):
    g = Goose({
                'use_meta_language': False, 
                'target_language':'id',
                'enable_image_fetching': False,
            })
    extract = g.extract(url=link)

    content = extract.cleaned_text
    content = remove_publisher(content)
    content = content.replace('."','. ')
    content = content.replace('\n',' ').replace('   ',' ').replace('  ',' ').replace("\'","").strip('-').strip()
    content = re.sub(r'[^\x00-\x7F]+', '', content)
    content = content.replace(' ...','.').replace('.. .','. ')
    content = brut_split(content)
    content = content.replace('.CO','').replace('.COM','').replace('. CO','').replace('. COM','')
    content = remove_baca(content)
    spoiler = content[:150] + '...'

    if len(content) <= 500:
        return "Not Valid"
    else:
        return content, spoiler

### HMM Tagger & Entity Determiner

In [6]:
# import dill
# import string
# from nltk import sent_tokenize
# from collections import defaultdict
# from nltk.tokenize import WordPunctTokenizer
# from nltk import word_tokenize, RegexpParser, ne_chunk
# from nltk.tag.hmm  import HiddenMarkovModelTagger, HiddenMarkovModelTrainer

In [7]:
# with open('my_tagger.dill', 'rb') as f:
#     hmm_tagger = dill.load(f)

In [8]:
# def hasNumbers(inputString):
#     result = False
#     for char in list(inputString):
#         if(char.isdigit()):
#             result = True
#     return result

# def xcheck_tag(word,tag):
#     punc = list(string.punctuation)
    
#     dates = ['Januari','Februari','Maret','April','Mei','Juni','Juli','Agustus','September','Oktober','November','Desember',\
#             'Jan','Feb','Mar','Apr','Mei','Jun','Jul','Agt','Sep','Okt','Nov','Des',\
#             'Senin','Selasa','Rabu','Kamis','Jumat','Sabtu','Minggu'
#         ]
#     stopword = open('stopword.txt','r').read().split('\n')
    
#     if(word.lower() in stopword):
#         tag = 'STOPWORD'
    
#     if(word in dates):
#         tag = 'DATE'
    
#     if(word in punc):
#         tag = 'Z'
        
#     if(tag == 'CD' and word.isdigit()):
#         tag = 'CD'
        
#     if(tag in ['SYM','Z','CD','MD'] and word.upper() != word and hasNumbers(word) == False \
#       and word[-3:] not in ['nya','kah','lah']):
#         tag = 'NNP'
    
#     if(tag == 'NN' and word[:1].upper() == word):
#         tag = 'NNP'
        
#     if(tag == 'NNP' and word.lower() == word):
#         tag = 'NN'
    
#     if(tag == 'NNP' and len(word) == 1):
#         tag = 'NN'
        
#     if(tag == 'FW' and word.lower() == word):
#         tag = 'NN'
        
#     return word,tag

# def get_entity(wording, verbose=False):
#     word_punct_tokenizer = WordPunctTokenizer()
#     tokenized = word_punct_tokenizer.tokenize(wording)
#     final_tagged = []
#     pos_tagging_begin = hmm_tagger.tag(tokenized)
#     for ptb in pos_tagging_begin:
#         w,t = xcheck_tag(ptb[0],ptb[1])
#         final_tagged.append((w,t))
    
#     if(verbose):
#         print(pos_tagging_begin)
    
#     grammar = """
#     ENTITY : {<NNP>+}
#     ENTITY : {<FW>+}
#     """
#     result = []
#     chunkParser = RegexpParser(grammar)
#     tree = chunkParser.parse(final_tagged)
#     for subtree in tree.subtrees():
#         if(subtree.label()=="ENTITY"):
#             tampung_entity = []
#             for se in subtree.leaves():
#                 tampung_entity.append(se[0])
#             result.append(' '.join(tampung_entity))
#     return result

# def sorting(xsz, top=10):
#     counts = defaultdict(int)
#     for xs in xsz:
#         for x in xs:
#             counts[x] += 1
#     return sorted(counts.items(), reverse=True, key=lambda tup: tup[1])[:top]

### Test

In [9]:
# for l in all_url[1:2]:
#     final_result = []
#     print(l['category'])
#     print()
    
#     for item in l['items'][:1]:
#         print(item)
#         print()
#         title = item[0]
#         link = item[1]
#         content = get_content(link)
#         print(title)
#         print()
#         print(link)
#         print()
#         print(content)

In [10]:
# senttok = sent_tokenize(content)

In [11]:
# all_entity = []
# for xsent in senttok:
#     if len(xsent) > 1:
#         ent = get_entity(xsent, verbose=False)
#         all_entity.append(ent)
        
#         xsent_final = xsent
#         duplicate_word = []
#         for e in ent:
#             if e in duplicate_word:
#                 pass
#             else:
#                 duplicate_word.append(e)
#                 xsent_final = xsent_final.replace(e, "<mark>{e}</mark>".format(e=e))
            
#         print("<p>" + xsent_final + "</p>")
#         print()

In [12]:
# sorting(all_entity,100)

### Spacy NER 

In [13]:
import spacy
from nltk import sent_tokenize
nlp = spacy.load('id')

In [14]:
import spacy
from nltk import sent_tokenize
nlp = spacy.load('id')

def edit_tagged_content(content):
    senttok = sent_tokenize(content)
    
    for i in range(len(senttok)):
        try:
            s = senttok[i]
            if s.count('"') == 1:
                senttok[i] = s + ' ' + senttok[i + 1]
                del senttok[i + 1]
        except:
            pass
        
    result_text = []
    result_ent = []
    result_ent_label = []
    for xsent in senttok:
        if len(xsent) > 1:
            doc = nlp(xsent)

            entity = []
            all_ent = []
            for e in doc.ents:
                entity.append((e.text,e.label_))
                all_ent.append(e.text)

            xsent_final = xsent
            duplicate_word = []

            for ent, tag in entity:
                if ent in duplicate_word:
                    pass
                else:
                    duplicate_word.append(ent)
                    xsent_final = xsent_final.replace(ent, '<mark class="mark {tag}">{ent}<span class="tag">{tag}</span></mark>'.format(ent=ent, tag=tag))


            result_text.append("<p>" + xsent_final + "</p>")
            result_ent.append(all_ent)
            result_ent_label.append(entity)
    return (result_text,result_ent, result_ent_label)

### Save to DB

In [15]:
import MySQLdb as mdb

db_host = '127.0.0.1'
db_user = 'root'
db_password = 'qwerty'
db_name = 'entity_determiner'
db_charset = 'utf8'

In [16]:
science = all_url[6]

In [17]:
science

{'category': 'Science',
 'items': [('Bakal Ada Gerhana Bulan Total Tanggal 28 Juli 2018, Simak 6 Tips Memotret Blood Moon Pakai Ponsel',
   'http://jatim.tribunnews.com/2018/07/19/bakal-ada-gerhana-bulan-total-tanggal-28-juli-2018-simak-6-tips-memotret-blood-moon-pakai-ponsel',
   'jatim.tribunnews.com',
   '2018-07-19 11:07:35'),
  ('Gerhana Bulan Total 28 Juli 2018, Catat Waktu Proses Awal Hingga Puncaknya',
   'http://jambi.tribunnews.com/2018/07/19/gerhana-bulan-total-28-juli-2018-catat-waktu-proses-awal-hingga-puncaknya',
   'jambi.tribunnews.com',
   '2018-07-19 10:25:37'),
  ('Info BMKG - Ini Proses Gerhana Bulan Total Blood Moon 28 Juli 2018, Perhatikan Waktu Puncaknya!',
   'http://makassar.tribunnews.com/2018/07/19/info-bmkg-ini-proses-gerhana-bulan-total-blood-moon-28-juli-2018-perhatikan-waktu-puncaknya',
   'makassar.tribunnews.com',
   '2018-07-19 09:37:00'),
  ('Prediksi GBT 28 Juli Jadi yang Terlama Sepanjang Abad 21, BMKG akan Siarkan Secara Langsung',
   'http://jaten

In [18]:
for au in science:
    news = science['items']
    save_to_db = []
    for i in tqdm(range(len(news))):
        d = news[i]
        try:
            title, url, host, published_at = d
            clean_content, spoiler_content = get_content(url)
            tagged_content, entity, entity_label = edit_tagged_content(clean_content)
            tagged_content = '\n'.join(tagged_content)
            
            final_ent = []
            for ent in entity:
                for e in ent:
                    final_ent.append(e)
            entity = '*'.join(final_ent)
            
            final_ent_label = []
            for entl in entity_label:
                for ent in entl:
                    final_ent_label.append(ent[0] + '#')
                    final_ent_label.append(ent[1] + '*')
                    
            entity_label = ''.join(final_ent_label)
            save_to_db.append((title, clean_content, tagged_content, spoiler_content, entity, entity_label, url, host, published_at))
        except:
            pass

    print("Total data : %d" % len(save_to_db))
    query = """INSERT IGNORE INTO {table}
        (title, clean_content, tagged_content, spoiler_content, entity, entity_label, url, host, published_at)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)""".format(table=science['category'].lower())
    attr = tuple(save_to_db)

    connect = mdb.connect(db_host, db_user, db_password, db_name, charset=db_charset)
    cursor = connect.cursor()
    cursor.executemany(query, attr)
    connect.commit()
    connect.close()

100%|██████████| 15/15 [00:24<00:00,  1.62s/it]
  0%|          | 0/15 [00:00<?, ?it/s]

Total data : 15


100%|██████████| 15/15 [00:20<00:00,  1.37s/it]

Total data : 15



