Capture news from RTHK news website with BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
import requests
import csv
import re
from calendar import monthrange
import warnings

# Install blanket "ignore" filters so no warning is shown in this session.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')

# Year passed to the (commented-out) capture call at the end of this cell.
year = 2020

def number_of_days_in_month(year=2020, month=2):
    """Return the number of days in ``month`` of ``year``.

    Args:
        year: Calendar year.
        month: Month number as accepted by ``calendar.monthrange`` (1-12).

    Returns:
        The second element of ``monthrange(year, month)``, i.e. the day count.
    """
    _first_weekday, day_count = monthrange(year, month)
    return day_count



def capture(year=2021):
    """Scrape one year of RTHK news (archive category 8) into a CSV file.

    For every calendar day of ``year`` the daily archive page is downloaded,
    every ``<span class="title">`` link on it is followed, and the text of the
    article's ``itemFullText`` div is stored next to the headline.

    Args:
        year: Calendar year to download.

    Side effects:
        Issues one HTTP GET per day plus one per article, prints a progress
        line per month and (over)writes ``RTHK_news_{year}.csv`` in the current
        working directory with the columns ``Title`` and ``Corpus``.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
    """
    domain = "https://news.rthk.hk"
    # Seconds to wait for a response, so a stalled request cannot hang the crawl.
    request_timeout = 30
    remove_tag = re.compile(r'<.*?>')
    collapse_space = re.compile(r'\s+')

    # newline='' is what the csv module expects for files it writes to, and the
    # with-block closes the file even when a request or parse step raises.
    with open(f'RTHK_news_{year}.csv', 'w', encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Title','Corpus'])

        for i in range(1,13):
            num_of_days = number_of_days_in_month(year, i)
            print(f"Capturing month {str(i)}")
            # Always derive the zero-padded month; previously months 10-12
            # silently reused the "09" left over from September.
            month = f"{i:02d}"

            # +1 so the last day of the month is included.
            for j in range(1, num_of_days + 1):
                day = f"{j:02d}"

                link = f'{domain}/rthk/en/news-archive.htm?archive_year={year}&archive_month={month}&archive_day={day}&archive_cat=8'
                archive_source = requests.get(link, timeout=request_timeout).text
                archive_soup = BeautifulSoup(archive_source, 'lxml')

                for span in archive_soup.find_all('span',class_='title'):
                    corpus_link = domain+span.a.attrs['href']

                    # get corpus
                    article_source = requests.get(corpus_link, timeout=request_timeout).text
                    article_soup = BeautifulSoup(article_source, 'lxml')

                    outer_div = article_soup.find('div')
                    div_cls = outer_div.find('div',class_='itemFullText') if outer_div is not None else None
                    if div_cls is None:
                        # Page has no article body; skip it instead of crashing the crawl.
                        continue

                    # Strip the markup, then collapse every whitespace run to one space.
                    unformat_text = remove_tag.sub('', str(div_cls.prettify()))
                    plain_text = collapse_space.sub(' ', unformat_text)
                    csv_writer.writerow([str(span.a.contents[0]),str(plain_text)])
#capture(year)

<br>
<br>
<br>
Load RTHK news of 2020

In [None]:
import pandas as pd

def readcsv(file):
    """Read a UTF-8 encoded CSV into a pandas DataFrame.

    Args:
        file: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv(file, encoding="utf-8")``.
    """
    return pd.read_csv(file, encoding="utf-8")

# Load the scraped articles, report how many rows there are, then preview them.
RTHK_news = readcsv("RTHK_news_2020_old.csv")
article_count = len(RTHK_news)
print(article_count)
RTHK_news.head()

<br>
<br>
<br>
Remove unnecessary text and features with regular expressions

In [None]:
import re

# Clean the article text with regular expressions.
# The patterns are raw strings so sequences such as "\s" or "\(" reach the
# regex engine as written instead of depending on Python's deprecated handling
# of unknown string escapes. They are compiled once and applied in a single
# pass per article, in the same order as before.

# footer such as "______________________________ Last updated: 2021-01-02 HKT 17:10"
_FOOTER_RE = re.compile(r'[_]+\s[A-Za-z]+\s[a-z]+:\s[-0-9]+\s[A-Z]+\s[:0-9]+')
# punctuation characters to drop
_PUNCTUATION_RE = re.compile(r'[”“–,\.!?\"\(\)\-\[\]\{\};:|<>@$%^&*_~\`]')
# runs of digits
_DIGITS_RE = re.compile(r'[0-9]+')
# runs of whitespace
_WHITESPACE_RE = re.compile(r'\s+')
# 'news reporter' -> news reporter (single-quoted lower-case phrase)
_QUOTED_PHRASE_RE = re.compile(r"'(([a-z]+\s)*[a-z]+)'")
# leading / trailing whitespace
_EDGE_SPACE_RE = re.compile(r"^\s+|\s+$")


def _clean_article(text):
    """Return ``text`` normalised for topic modelling.

    The steps run in this order: drop the "Last updated" footer, drop
    punctuation, drop digits, collapse whitespace, lower-case, turn the
    typographic apostrophe into a plain one, unwrap single-quoted phrases and
    trim whitespace at both ends.
    """
    text = _FOOTER_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    text = _WHITESPACE_RE.sub(' ', text)
    text = text.lower()
    text = text.replace('’', "'")
    text = _QUOTED_PHRASE_RE.sub(r'\1', text)
    return _EDGE_SPACE_RE.sub("", text)


a = RTHK_news.Corpus.map(_clean_article)

# final corpus
p_corpus = a.copy()

print(p_corpus)

<br>
<br>
<br>
Remove stop words and tokenize text

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

def add_stop_words(nlp,words):
    """Register extra stop words on a spaCy pipeline and return the English set.

    Args:
        nlp: Object exposing ``Defaults.stop_words`` (a set-like with ``add``).
        words: Sequence of strings to add.

    Returns:
        ``spacy.lang.en.stop_words.STOP_WORDS``.
    """
    registry = nlp.Defaults.stop_words
    for extra_word in words:
        registry.add(extra_word)
    return spacy.lang.en.stop_words.STOP_WORDS


def spacy_tokenizer_(sentence,stop_words):
    """Tokenise ``sentence`` and drop every token found in ``stop_words``.

    Args:
        sentence: Text to tokenise with a fresh ``English()`` pipeline.
        stop_words: Container of strings to filter out.

    Returns:
        List of strings: each token's stripped ``lemma_``, or its ``lower_``
        form when the lemma is the ``"-PRON-"`` placeholder, minus stop words.
    """
    doc = English()(sentence)

    kept_tokens = []
    for token in doc:
        lemma = token.lemma_
        # "-PRON-" is a placeholder lemma; fall back to the lower-cased text.
        normalised = token.lower_ if lemma == "-PRON-" else lemma.strip()
        if normalised in stop_words:
            continue
        kept_tokens.append(normalised)

    return kept_tokens


# Extend the stop-word set with corpus-specific words, then tokenise every article.
extra_stop_words = ['hong','kong','said','saying','people','hk']
stop_words = add_stop_words(spacy.load('en_core_web_sm'), extra_stop_words)
tokenize_text = p_corpus.apply(lambda article: spacy_tokenizer_(article, stop_words))
print(tokenize_text)

<br>
<br>
<br>
Make dictionary and document term matrix

In [None]:
from gensim import models,corpora

def gensim_dict_term_matrix(text):
    """Build a gensim dictionary and the matching bag-of-words corpus.

    Args:
        text: Iterable of token lists, one per document. It is traversed
            twice, so it must be re-iterable.

    Returns:
        Tuple ``(dictionary, doc_term_matrix)`` where ``doc_term_matrix`` is a
        list holding ``dictionary.doc2bow(doc)`` for every document.
    """
    dictionary = corpora.Dictionary(text)
    doc_term_matrix = list(map(dictionary.doc2bow, text))
    return dictionary, doc_term_matrix
 

# Build the dictionary and the document-term matrix from the tokenised articles.
_dictionary_and_matrix = gensim_dict_term_matrix(tokenize_text)
dictionary, doc_term_matrix = _dictionary_and_matrix
print(dictionary)

<br>
<br>
<br>
Function definitions start here. To view the results, skip to the bottom of the notebook.

In [None]:
# common function
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook,tnrange,tqdm
import gensim
from gensim import models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import seaborn as sns


# candidate topic counts from lower (inclusive) to upper (exclusive)
def topic_list(lower,upper):
    """Return ``np.arange(lower, upper)``: the topic counts to try.

    Args:
        lower: First topic count (included).
        upper: End of the range (excluded).
    """
    candidate_counts = np.arange(lower, upper)
    return candidate_counts



# Topic distribution of every document in the corpus
def topic_distribution(lda, doc_term_matrix):
    """Return ``lda.get_document_topics(doc_term_matrix)`` unchanged.

    Args:
        lda: Model object exposing ``get_document_topics``.
        doc_term_matrix: Bag-of-words corpus to score.
    """
    return lda.get_document_topics(doc_term_matrix)



# Group every article under its most probable topic
def classify_corpus(best_topics, topic_dis_perdoc ,corpus_len):
    """Assign each document to the topic with the highest probability.

    Titles and texts are read from the notebook-level ``RTHK_news`` frame by
    label ``k`` for ``k`` in ``range(corpus_len)``.

    Args:
        best_topics: Number of topics; keys ``0..best_topics-1`` are created.
        topic_dis_perdoc: Indexable of per-document ``(topic_id, probability)``
            entries.
        corpus_len: Number of documents to classify.

    Returns:
        Dict mapping topic id to ``{'Title': [...], 'Corpus': [...]}``.
    """
    def _dominant_topic(distribution):
        # First entry with the strictly largest probability wins; topic 0 is
        # the fallback when nothing exceeds 0.
        winner_id, winner_prob = 0, 0
        for entry in distribution:
            if entry[1] > winner_prob:
                winner_id, winner_prob = entry[0], entry[1]
        return winner_id

    Topic_corpus = {
        topic_id: {'Title': [], 'Corpus': []}
        for topic_id in range(best_topics)
    }

    for doc_idx in range(corpus_len):
        bucket = Topic_corpus[_dominant_topic(topic_dis_perdoc[doc_idx])]
        bucket['Title'].append(RTHK_news.Title[doc_idx])
        bucket['Corpus'].append(RTHK_news.Corpus[doc_idx])

    return Topic_corpus

<br>
<br>
<br>
Optimize the number of topics

In [None]:
# train one LDA model per candidate topic count and score each of them
def find_optimize_with_ntopic(num_topics_list, doc_term_matrix, dictionary):
    """Fit an ``LdaMulticore`` model for every topic count and score it.

    Args:
        num_topics_list: Iterable of topic counts to try.
        doc_term_matrix: Bag-of-words corpus used for training and scoring.
        dictionary: Mapping passed as ``id2word`` and to the coherence model.

    Returns:
        Tuple ``(coherenceList_umass, lda_array)``: the ``u_mass`` coherence
        of each model and the fitted models, both in input order.
    """
    coherenceList_umass, lda_array = [], []

    for num_topics in tqdm(num_topics_list):
        lda = models.LdaMulticore(
            doc_term_matrix,
            num_topics=num_topics,
            id2word=dictionary,
            passes=20,
            chunksize=4000,
            random_state=43,
        )
        lda_array.append(lda)

        scorer = CoherenceModel(
            model=lda,
            corpus=doc_term_matrix,
            dictionary=dictionary,
            coherence='u_mass',
        )
        coherenceList_umass.append(scorer.get_coherence())

    return coherenceList_umass, lda_array



# find the topic count with the smallest coherence score
def find_best_topics(coherenceList_umass,num_topics_list):
    """Return the lowest coherence score and the topic count that produced it.

    The previous implementation started from a sentinel score of 0, so when
    every score was >= 0 it reported a score of 0 with the first candidate.
    The minimum is now taken over the real scores; ties still resolve to the
    earliest candidate.

    NOTE(review): this deliberately keeps the original "smallest score wins"
    rule. u_mass values closer to zero are commonly read as more coherent, so
    confirm that selecting the minimum is the intended criterion.

    Args:
        coherenceList_umass: Coherence score per candidate.
        num_topics_list: Candidate topic counts, aligned with the scores.

    Returns:
        Tuple ``(smallest_coherence, best_topics)``. With no scores at all the
        result is ``(0, num_topics_list[0])`` as before.

    Raises:
        IndexError: if ``num_topics_list`` is empty.
    """
    # Fallback used only when there is nothing to compare.
    best_topics = num_topics_list[0]
    smallest_coherence = 0

    scored = list(zip(coherenceList_umass, num_topics_list))
    if scored:
        smallest_coherence, best_topics = min(scored, key=lambda pair: pair[0])

    return smallest_coherence, best_topics



# Pick the model that was trained with the best topic count (n-topics method only)
def get_lda_object(best_topics, least_topic, lda_array):
    """Return the model whose position equals ``best_topics - least_topic``.

    Args:
        best_topics: Chosen topic count.
        least_topic: Topic count of the model stored at index 0.
        lda_array: Models ordered by increasing topic count.
    """
    offset = best_topics - least_topic
    return lda_array[offset]



# plot the coherence score of every candidate topic count
def plot_topics_by_score(plot,num_topics_list,coherenceList_umass):
    """Draw a point plot of coherence score against number of topics.

    Args:
        plot: When truthy the figure is also saved as
            ``'Topic coherence plot.png'``.
        num_topics_list: Topic counts for the x axis.
        coherenceList_umass: Matching coherence scores for the y axis.
    """
    plotData = pd.DataFrame({
        'Number of topics': num_topics_list,
        'CoherenceScore': coherenceList_umass,
    })

    # Creates the 10x6 figure that the seaborn call below draws into.
    plt.subplots(figsize=(10,6))

    sns.set_style("darkgrid")
    sns.pointplot(data=plotData, x='Number of topics', y='CoherenceScore')

    # Horizontal reference line at a fixed score of -3.9.
    plt.axhline(y=-3.9)
    plt.title('Topic coherence')

    if not plot:
        return
    plt.savefig('Topic coherence plot.png')
        
        

# find the best model by varying only the number of topics
def optimize_by_topics(p_corpus, doc_term_matrix, dictionary, tokenize_text, mininum=3, maximum=8, plot=0):
    """Train one LDA model per topic count and keep the selected one.

    Fixes over the previous version: the bounds are clamped *before* the
    candidate list is built (the clamp used to have no effect), and the chosen
    model is looked up relative to the real lower bound instead of a
    hard-coded 3, which returned the wrong model (or raised IndexError)
    whenever ``mininum`` was not 3.

    Args:
        p_corpus: Pre-processed documents; only its length is used.
        doc_term_matrix: Bag-of-words corpus.
        dictionary: gensim dictionary for ``doc_term_matrix``.
        tokenize_text: Unused; kept for signature compatibility.
        mininum: Smallest topic count to try (raised to 3 if lower).
        maximum: Exclusive upper bound of topic counts (raised to 4 if lower,
            and to ``mininum + 1`` if it would leave no candidate).
        plot: When equal to 1, print the best topic count and draw the
            coherence plot (the plot is not saved to disk).

    Returns:
        Tuple ``(lda_ntopics, Topic_corpus, best_topics)``.
    """
    corpus_len = len(p_corpus)

    # Clamp first so the candidate list really honours the limits.
    mininum = max(mininum, 3)
    maximum = max(maximum, 4)
    if maximum <= mininum:
        # Guarantee at least one candidate topic count.
        maximum = mininum + 1

    num_topics_list = topic_list(mininum, maximum)

    coherenceList_umass, lda_array = find_optimize_with_ntopic(num_topics_list,doc_term_matrix,dictionary)
    smallest_coherence, best_topics= find_best_topics(coherenceList_umass,num_topics_list)

    if plot == 1:

        print("Best no of topics: ",best_topics)
        plot_topics_by_score(0,num_topics_list,coherenceList_umass)

    # lda_array[0] was trained with `mininum` topics, so offset from that.
    lda_ntopics = get_lda_object(best_topics, mininum, lda_array)

    # Topic distribution
    topic_dis_perdoc = topic_distribution(lda_ntopics, doc_term_matrix)

    # Classified Topic_corpus with k topics
    Topic_corpus = classify_corpus(best_topics, topic_dis_perdoc, corpus_len)

    return lda_ntopics, Topic_corpus, best_topics
    
# Run the topic-count search with the default bounds and without plotting.
a, b, c = optimize_by_topics(
    p_corpus,
    doc_term_matrix,
    dictionary,
    tokenize_text,
    mininum=3,
    maximum=8,
    plot=0,
)

<br>
<br>
<br>
Optimize with the number of topics, alpha and beta (sometimes called eta)

In [None]:
# build the alpha and beta candidate grids
def set_alpha_beta(min,max,range):
    """Return the candidate values for the LDA ``alpha`` and ``eta`` priors.

    The arguments used to be ignored in favour of a hard-coded
    ``np.arange(0.01, 1, 0.3)``; they now define the numeric grid. The existing
    call ``set_alpha_beta(0.01, 1, 0.3)`` yields the same values as before.

    Args:
        min: First numeric candidate (included).
        max: End of the numeric grid (excluded).
        range: Step between numeric candidates.

    Returns:
        Tuple ``(alpha, beta)`` of lists: the numeric grid followed by
        ``'symmetric'`` and ``'asymmetric'`` for alpha, and by ``'symmetric'``
        for beta.
    """
    # The parameter names shadow builtins but are kept for keyword callers.
    alpha = list(np.arange(min, max, range))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    beta = list(np.arange(min, max, range))
    beta.append('symmetric')

    return alpha, beta


def compute_coherence_values(tokenize_text, corpus, dictionary, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence.

    Args:
        tokenize_text: Tokenised documents given to the coherence model as ``texts``.
        corpus: Bag-of-words corpus used for training.
        dictionary: Mapping passed as ``id2word`` and to the coherence model.
        k: Number of topics.
        a: Value for the ``alpha`` argument.
        b: Value for the ``eta`` argument.

    Returns:
        The value of ``CoherenceModel(...).get_coherence()``.
    """
    training_options = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    lda_model = gensim.models.LdaMulticore(**training_options)

    scorer = CoherenceModel(
        model=lda_model,
        texts=tokenize_text,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()



def find_optimize_with_alpha_beta_ntopics(alpha, beta, topics_range, corpus, dictionary, tokenize_text):
    """Grid-search topic count, alpha and beta, saving every score to CSV.

    The models are now trained on the ``corpus`` argument; previously the
    notebook-level ``doc_term_matrix`` was used and the parameter was ignored.

    Args:
        alpha: Candidate alpha values.
        beta: Candidate beta (eta) values.
        topics_range: Candidate topic counts.
        corpus: Bag-of-words corpus to train on.
        dictionary: gensim dictionary for ``corpus``.
        tokenize_text: Tokenised documents used for coherence scoring.

    Side effects:
        Trains one model per (topics, alpha, beta) combination and writes the
        columns ``Topics``, ``Alpha``, ``Beta``, ``Coherence`` to
        ``lda_tuning_results_test.csv`` in the current working directory.
    """
    model_results = {'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                    }

    for k in tqdm(topics_range):
        for a in alpha:
            for b in beta:
                cv = compute_coherence_values(tokenize_text = tokenize_text, corpus = corpus, dictionary = dictionary, k = k, a = a, b = b)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

    pd.DataFrame(model_results).to_csv('lda_tuning_results_test.csv', index=False)




# locate the row with the highest coherence score
def find_largest_tuning_score(tuning_result):
    """Return the index label of the row with the largest ``Coherence``.

    Args:
        tuning_result: DataFrame with a ``Coherence`` column and an index
            whose values can be cast to int.

    Returns:
        The (int-cast) index label of the first row holding the strictly
        largest score, or 0 when no score is greater than 0.
    """
    labels = tuning_result.index.values.astype(int)

    top_score = 0
    top_position = None
    for position, score in enumerate(tuning_result.Coherence):
        if score > top_score:
            top_score = score
            top_position = position

    if top_position is None:
        return 0
    return labels[top_position]




def optimize_by_alpha_beta(p_corpus, doc_term_matrix, dictionary, tokenize_text, mininum=3, maximum=8, debug=1):
    """Train the final LDA model with the best (topics, alpha, beta) triple.

    Fixes over the previous version:
      * ``x != ('symmetric' or 'asymmetric')`` only compared against
        ``'symmetric'``, so an ``'asymmetric'`` prior crashed in ``float()``.
      * The bounds are clamped before the candidate list is built.
      * After a fresh tuning run (``debug == 0``) the results that run wrote
        (``lda_tuning_results_test.csv``) are read, instead of ignoring them
        in favour of the cached ``lda_tuning_results1.csv``.

    Args:
        p_corpus: Pre-processed documents; only its length is used.
        doc_term_matrix: Bag-of-words corpus.
        dictionary: gensim dictionary for ``doc_term_matrix``.
        tokenize_text: Tokenised documents, used only when tuning is run.
        mininum: Smallest topic count to try (raised to 3 if lower).
        maximum: Exclusive upper bound of topic counts (raised to 4 if lower,
            and to ``mininum + 1`` if it would leave no candidate).
        debug: 0 runs the (slow) grid search first; any other value reuses
            the cached ``lda_tuning_results1.csv``.

    Returns:
        Tuple ``(lda_alpha_beta, Topic_corpus, tuneup_topics)``.
    """
    def _as_prior(value):
        # Named priors stay strings for gensim; anything else is numeric.
        if isinstance(value, str) and value in ('symmetric', 'asymmetric'):
            return value
        return float(value)

    corpus_len = len(p_corpus)

    # Clamp first so the candidate list really honours the limits.
    mininum = max(mininum, 3)
    maximum = max(maximum, 4)
    if maximum <= mininum:
        maximum = mininum + 1

    num_topics_list = topic_list(mininum,maximum)

    alpha,beta = set_alpha_beta(0.01,1,0.3)

    # Warning long processing time
    if debug==0:
        find_optimize_with_alpha_beta_ntopics(alpha, beta, num_topics_list, doc_term_matrix, dictionary, tokenize_text)
        # File written by the tuning run above.
        tuning_file = 'lda_tuning_results_test.csv'
    else:
        tuning_file = 'lda_tuning_results1.csv'

    tuning_result = pd.read_csv(tuning_file, encoding = "utf-8")
    largest_index = find_largest_tuning_score(tuning_result)

    # tune up parameter
    tuneup_topics = int(tuning_result.Topics[largest_index])
    tuneup_alpha = _as_prior(tuning_result.Alpha[largest_index])
    tuneup_beta = _as_prior(tuning_result.Beta[largest_index])

    # Compute Lda
    Lda = models.LdaMulticore
    lda_alpha_beta = Lda(doc_term_matrix, num_topics= tuneup_topics, id2word = dictionary, alpha = tuneup_alpha, eta = tuneup_beta, passes = 20, chunksize = 4000, random_state = 43)

    # Topic distribution
    topic_dis_perdoc = topic_distribution(lda_alpha_beta, doc_term_matrix)

    # Classified Topic_corpus with k topics
    Topic_corpus = classify_corpus(tuneup_topics, topic_dis_perdoc, corpus_len)

    return lda_alpha_beta, Topic_corpus, tuneup_topics


# Build the alpha/beta-tuned model from the cached tuning results (debug defaults to 1).
d, e, f = optimize_by_alpha_beta(
    p_corpus,
    doc_term_matrix,
    dictionary,
    tokenize_text,
    mininum=3,
    maximum=8,
)

<br>
<br>
<br>
Save the top 20 words of each topic's word distribution

In [None]:
# Put all top 20 words of individual topics into a csv
import os

def save_top_words(directory, best_topics, lda, num_words, print_out = 0):
    """Write the top words of every topic to ``<cwd>/<directory>/Top Words.csv``.

    Fixes over the previous version: ``num_words`` is honoured (it used to be
    ignored in favour of a hard-coded 20), all topics are requested so more
    than ten topics no longer raise IndexError, and the path is built with
    ``os.path.join`` instead of Windows-only backslashes.

    Args:
        directory: Folder name relative to the current working directory;
            created if it does not exist.
        best_topics: Number of topics to export (``Topic0`` .. ``TopicN-1``).
        lda: Model exposing ``show_topics``.
        num_words: Number of words to keep per topic.
        print_out: When truthy, also print every exported topic.

    Side effects:
        Writes one CSV column per topic; each column holds the entry returned
        by ``show_topics`` for that topic.
    """
    # A negative num_topics asks gensim for every topic, in topic-id order.
    words = lda.show_topics(num_topics=-1, num_words=num_words)
    Top_words = {f"Topic{i}": words[i] for i in range(best_topics)}

    # to csv
    target_dir = os.path.join(os.getcwd(), directory)
    os.makedirs(target_dir, exist_ok=True)
    pd.DataFrame(Top_words).to_csv(os.path.join(target_dir, "Top Words.csv"), index=False)

    if print_out:
        for topic_entry in Top_words.values():
            print(topic_entry)
            print(" ")
            print(" ")
        


#save_top_words("RTHK_ntopics", f, d, num_words = 10, print_out = 1)

<br>
<br>
<br>
Result:<br />
Word topic distribution

In [27]:
# Print the saved word distribution of every topic column in the CSV.
# A forward slash works on every platform and avoids the invalid "\T" escape.
g = readcsv("RTHK_ntopics/Top Words.csv")
for topic_number, column in enumerate(g.columns, start=1):
    # Row 1 of each column is the entry that gets displayed.
    print(f"Topic {topic_number} \n", g[column][1], "\n")

Topic 1 
 0.022*"students" + 0.015*"school" + 0.015*"schools" + 0.011*"education" + 0.009*"teachers" + 0.006*"classes" + 0.006*"secondary" + 0.005*"primary" + 0.005*"teacher" + 0.004*"exams" + 0.004*"children" + 0.004*"yeung" + 0.004*"bureau" + 0.004*"parents" + 0.003*"class" + 0.002*"teaching" + 0.002*"campus" + 0.002*"student" + 0.002*"pupils" + 0.002*"exam" 

Topic 2 
 0.008*"legco" + 0.007*"government" + 0.006*"council" + 0.006*"lawmakers" + 0.006*"chan" + 0.005*"committee" + 0.005*"party" + 0.005*"year" + 0.004*"lawmaker" + 0.004*"lam" + 0.004*"meeting" + 0.004*"camp" + 0.004*"election" + 0.004*"executive" + 0.004*"public" + 0.004*"chief" + 0.003*"new" + 0.003*"years" + 0.003*"prodemocracy" + 0.003*"law" 

Topic 3 
 0.016*"law" + 0.013*"police" + 0.011*"security" + 0.011*"national" + 0.007*"government" + 0.005*"officers" + 0.004*"new" + 0.004*"beijing" + 0.004*"court" + 0.004*"sar" + 0.004*"media" + 0.003*"mainland" + 0.003*"protests" + 0.003*"public" + 0.003*"lam" + 0.003*"statem

<br>
<br>
<br>
Documents classified as topic 1

In [26]:
# Save all document which has been classified 
import os
def save_csv(directory, Topic_corpus, best_topics):
    """Write the documents of every topic to ``<cwd>/<directory>/Topic<i>.csv``.

    The path is built with ``os.path.join`` (the old backslash concatenation
    only worked on Windows and relied on the invalid ``\\T`` escape) and the
    target folder is created when missing.

    Args:
        directory: Folder name relative to the current working directory.
        Topic_corpus: Mapping from topic id to data accepted by ``pd.DataFrame``.
        best_topics: Number of topics; ids ``0..best_topics-1`` are written.
    """
    target_dir = os.path.join(os.getcwd(), directory)
    os.makedirs(target_dir, exist_ok=True)

    for i in range(best_topics):
        pd.DataFrame(Topic_corpus[i]).to_csv(os.path.join(target_dir, f"Topic{i}.csv"), index=False)
        
#save_csv("RTHK_ntopics", e, f)

# Preview the documents assigned to topic 0 (portable path, no "\T" escape).
readcsv("RTHK_ntopics/Topic0.csv")



Unnamed: 0,Title,Corpus
0,'Majority of students want DSE exams postponed',A students' group says an overwhelming majori...
1,Tuen Mun kindergarten closes as kids pull out,Wellcome International Kindergarten in Tuen M...
2,NGO urges govt to help poor with online classes,A survey has found that many grassroots stude...
3,Extended school closures weigh on parents' minds,Parents across Hong Kong are looking ahead to...
4,ESF announces rare freeze in school fees,The English Schools Foundation (ESF) has anno...
5,Govt urged to hold talks over fate of DSE exams,The government is being urged to hold discuss...
6,In the mood for love: park pandas mate at last,There was joy at Hong Kong's shuttered Ocean ...
7,Students plan protest over use of Zoom for les...,A group representing secondary school student...
8,Education chief expects smooth start to DSE exams,The Education Secretary Kevin Yeung says he's...
9,'Consider partial class resumption for all stu...,The chairman of the Hong Kong Association of ...
