## The Star News Article Scraping

## Install and import required libraries

In [1]:
!pip install newspaper3k
!pip install autoscraper
#!pip install feedparser

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... 

In [2]:
import pandas as pd
from autoscraper import AutoScraper
from bs4 import BeautifulSoup
import requests

## Find the first article link to pass to AutoScraper

In [3]:
# Listing page for the "Nation" tag. The first headline link found on it is
# the sample that the rest of the notebook builds on.
url_to_scrape = "https://www.thestar.com.my/news/latest?tag=Nation"

# Bound the wait and fail on HTTP errors instead of parsing an error page.
response = requests.get(url_to_scrape, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Headlines are looked up as <h2 class="f18"> elements wrapping an <a> tag.
h2_tags = soup.find_all('h2', class_='f18')

# Take the href of the first headline anchor that actually carries one.
# Defaulting to None guarantees `link` is always defined, so a page-layout
# change is reported here rather than as a NameError in a later cell.
link = next(
    (
        a_tag['href']
        for a_tag in (h2_tag.find('a') for h2_tag in h2_tags)
        if a_tag and a_tag.get('href')
    ),
    None,
)

if link is None:
    raise ValueError(f"No article link found on {url_to_scrape}")

In [4]:
# Display the sample article URL extracted from the listing page.
link

'https://www.thestar.com.my/news/nation/2024/05/23/malaysia-hong-kong-agree-to-collaborate-on-tvet-economy-says-zahid'

## Find all the article links using AutoScraper

In [5]:
# Build an AutoScraper from the listing page, giving it the sample article
# URL as the only example of what should be extracted.
Scraper = AutoScraper()
WantedLink = [link]
url = Scraper.build(url=url_to_scrape, wanted_list=WantedLink)

In [6]:
%%time

base_url = "https://www.thestar.com.my/news/latest?pgno={}&tag={}"
tags = ["Nation", "Business", "Education", "Aseanplus", "Sport", "Metro", "Tech", "World", "Lifestyle", "Food"]


all_news_urls = []

# Loop through each tag and page number
for tag in tags:
    for page_num in range(1, 11):  # Loop through 10 pages
        url_to_scrape = base_url.format(page_num, tag)
        
        # Get similar result for the current tag and page
        similar_results = Scraper.get_result_similar(url_to_scrape)
        
        all_news_urls.extend(similar_results)


CPU times: user 9.94 s, sys: 184 ms, total: 10.1 s
Wall time: 2min 33s


In [7]:
# Preview the first ten collected article URLs.
all_news_urls[:10]

['https://www.thestar.com.my/news/nation/2024/05/23/malaysia-hong-kong-agree-to-collaborate-on-tvet-economy-says-zahid',
 'https://www.thestar.com.my/news/nation/2024/05/23/malaysia---china-relations-039special039-says-zahid-on-50th-diplomatic-anniversary',
 'https://www.thestar.com.my/news/nation/2024/05/23/chinese-tourists-lead-international-visits-to-sabah-in-first-quarter-of-2024',
 'https://www.thestar.com.my/news/nation/2024/05/23/three-nabbed-in-johor-for-stealing-cables',
 'https://www.thestar.com.my/news/nation/2024/05/23/i-filed-police-report-to-stop-ebit-lew-says-witness',
 'https://www.thestar.com.my/news/nation/2024/05/23/spm-2023-candidates-can-check-results-using-sms-education-ministry-website',
 'https://www.thestar.com.my/news/nation/2024/05/23/cops-are-looking-for-missing-12-year-old-girl-in-muar',
 'https://www.thestar.com.my/news/nation/2024/05/23/malaysia-to-leverage-asean-to-tackle-regional-issues-says-anwar',
 'https://www.thestar.com.my/news/nation/2024/05/23/mo

In [8]:
# Total number of article URLs gathered from the listing pages.
len(all_news_urls)

1644

## Fetch articles' data using Newspaper3k

In [9]:
%%time

import newspaper
from newspaper import Config

config = Config()
config.request_timeout = 30

article_details_list = []
# Loop through each URL
for url in all_news_urls:
    try:
        article = newspaper.Article(url=url, language='en', config=config)
        article.download()
        article.parse()

        meta_data = article.meta_data
        content_id = meta_data.get('content_id', None)
        section_name = meta_data.get('article_section_name', None)
        category = meta_data['cXenseParse'].get('kicker_name', None)
        tags = meta_data.get('content_tags', None)
        summary = meta_data.get('description', None)
        content_agency = meta_data.get('content_agency', None)
        content_tier =  meta_data.get('content_tier', None)
        content_length = meta_data.get('content_length', None)
        authors = meta_data.get('author', None)
        article_detail ={
            "content_id" : content_id,
            "title": str(article.title),
            "text": str(article.text),
            "section" : section_name,
            "category" :category,
            "content_tier":content_tier,
            "content_length":content_length,
            "authors": authors,
            "published_date": str(article.publish_date),
            #"videos": article.movies,
            "keywords": tags,
            "summary": summary,
            "url": url,
            "top_image": str(article.top_image),
        }
        
        article_details_list.append(article_detail)

    except newspaper.ArticleException as e:
        print(f"Failed to process URL: {url}. Error: {e}")

df_star = pd.DataFrame(article_details_list)


CPU times: user 4min 38s, sys: 2.83 s, total: 4min 41s
Wall time: 45min 24s


In [10]:
# Preview the first rows of the freshly scraped articles.
df_star.head()

Unnamed: 0,content_id,title,text,section,category,content_tier,content_length,authors,published_date,keywords,summary,url,top_image
0,1355430,"Malaysia, Hong Kong agree to collaborate on TV...",HONG KONG: Malaysia and Hong Kong have agreed ...,News,Nation,Complimentary,Medium,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Ahmad Zahid,Hong Kong,mutual benefit,TVET,economy",HONG KONG: Malaysia and Hong Kong have agreed ...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
1,1355429,"Malaysia - China relations 'special', says Zah...",HONG KONG: Datuk Seri Dr Ahmad Zahid Hamidi ha...,News,Nation,Complimentary,Short,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Ahmad Zahid,Malaysia,China,diplomatic ties,50 ...",HONG KONG: Datuk Seri Dr Ahmad Zahid Hamidi ha...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
2,1355427,Chinese tourists lead international visits to ...,KOTA KINABALU: Sabah has proven to be a favour...,News,Sabah & Sarawak,Complimentary,Short,,2024-05-23 00:00:00,"Sabah & Sarawak,Sabah tourism,Chinese visitors...",KOTA KINABALU: Sabah has proven to be a favori...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
3,1355424,Three nabbed in Johor for stealing cables,KULAI: Three men have been arrested for steali...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Courts Crime,Cable,Theft,Police,Crime",KULAI: Three men have been arrested for steali...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
4,1355392,"I filed police report to stop Ebit Lew, says w...",TENOM: The ninth witness in the sexual harassm...,News,Nation,Complimentary,Medium,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Magistrates Court,Tenom,Ebit Lew,sexual harass...",TENOM: The ninth witness in the sexual harassm...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...


## Create new Dataset

In [11]:
# Load the existing dataset from the Kaggle input directory.
# NOTE(review): presumably the output of an earlier run of this notebook
# (it shares the file name news_v1.csv with the export at the end) — confirm.
prev_df = pd.read_csv("/kaggle/input/news-article-weekly-updated/news_v1.csv")
prev_df.head()

Unnamed: 0,content_id,title,text,section,category,content_tier,content_length,authors,published_date,keywords,summary,url,top_image
0,1350181,Two Filipinos among three individuals arrested...,KUDAT: Two Filipinos were among three individu...,News,Sabah & Sarawak,Complimentary,Short,,2024-05-16 00:00:00,"Sabah & Sarawak,MMEA,Kudat,Arrested,Filipinos,...",KUDAT: Two Filipinos were among three individu...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
1,1350174,Heavy rain in Penang causes flash floods in pa...,PETALING JAYA: Parts of Penang Island are expe...,News,Nation,Complimentary,Short,,2024-05-16 00:00:00,"Flood Alert,Flash Flood,Air Itam,MBPP,Boulder,...",PETALING JAYA: Parts of Penang Island are expe...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
2,1350166,Housemen to be placed at district hospitals fa...,PETALING JAYA: Health Minister Datuk Seri Dr D...,News,Nation,Complimentary,Short,,2024-05-16 00:00:00,"Dzulkefly Ahmad,Housemen,District Hospitals,Do...",,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
3,1350145,Chinese minister Liu Jianchao meets with PKR s...,KUALA LUMPUR: A delegation of senior officials...,News,Nation,Complimentary,Medium,,2024-05-16 00:00:00,"China,Malaysia,PKR,CPC,Nurul,Izzah,Anwar",KUALA LUMPUR: A delegation of senior officials...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
4,1350137,Non-functional wells: Four suspects in Kota Ba...,KOTA BARU: Four suspects who were detained to ...,News,Nation,Complimentary,Short,,2024-05-16 00:00:00,"Courts Crime,MACC,remand,radial well,tube well...",KOTA BARU: Four suspects who were detained to ...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...


In [12]:
# Stack the freshly scraped rows on top of the previous dataset;
# ignore_index=True renumbers the combined rows from 0.
df_updated = pd.concat([df_star, prev_df], ignore_index=True)
df_updated.head()

Unnamed: 0,content_id,title,text,section,category,content_tier,content_length,authors,published_date,keywords,summary,url,top_image
0,1355430,"Malaysia, Hong Kong agree to collaborate on TV...",HONG KONG: Malaysia and Hong Kong have agreed ...,News,Nation,Complimentary,Medium,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Ahmad Zahid,Hong Kong,mutual benefit,TVET,economy",HONG KONG: Malaysia and Hong Kong have agreed ...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
1,1355429,"Malaysia - China relations 'special', says Zah...",HONG KONG: Datuk Seri Dr Ahmad Zahid Hamidi ha...,News,Nation,Complimentary,Short,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Ahmad Zahid,Malaysia,China,diplomatic ties,50 ...",HONG KONG: Datuk Seri Dr Ahmad Zahid Hamidi ha...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
2,1355427,Chinese tourists lead international visits to ...,KOTA KINABALU: Sabah has proven to be a favour...,News,Sabah & Sarawak,Complimentary,Short,,2024-05-23 00:00:00,"Sabah & Sarawak,Sabah tourism,Chinese visitors...",KOTA KINABALU: Sabah has proven to be a favori...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
3,1355424,Three nabbed in Johor for stealing cables,KULAI: Three men have been arrested for steali...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Courts Crime,Cable,Theft,Police,Crime",KULAI: Three men have been arrested for steali...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
4,1355392,"I filed police report to stop Ebit Lew, says w...",TENOM: The ninth witness in the sexual harassm...,News,Nation,Complimentary,Medium,Santhakumar a/l Mahalingam,2024-05-23 00:00:00,"Magistrates Court,Tenom,Ebit Lew,sexual harass...",TENOM: The ninth witness in the sexual harassm...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...


In [13]:
# Remove repeated articles, keeping the last occurrence of each content_id.
# pandas treats missing values as equal to one another, so deduplicating on
# content_id alone would also collapse every article that has no content_id
# into a single row. Rows without an id are deduplicated by URL instead.
missing_id = df_updated['content_id'].isna()
repeated_id = ~missing_id & df_updated.duplicated(subset='content_id', keep='last')
repeated_url = missing_id & df_updated.duplicated(subset='url', keep='last')
df_updated = df_updated[~(repeated_id | repeated_url)]
df_updated

Unnamed: 0,content_id,title,text,section,category,content_tier,content_length,authors,published_date,keywords,summary,url,top_image
3,1355424,Three nabbed in Johor for stealing cables,KULAI: Three men have been arrested for steali...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Courts Crime,Cable,Theft,Police,Crime",KULAI: Three men have been arrested for steali...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
6,1355355,Cops are looking for missing 12-year-old girl ...,MUAR: Police are looking for a 12-year-old gir...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Courts Crime,Missing,Child,Muar,Police",MUAR: Police are looking for a 12-year-old gir...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
8,1355301,Monorail operator gets award from Transport Mi...,KUALA LUMPUR: Monorail operator Ahmad Zahirudd...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Monorail,Transport Ministry,Captain Train,Tree...",KUALA LUMPUR: Monorail operator Ahmad Zahirudd...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
10,1355293,Police warn public not to share video of acid ...,PETALING JAYA: The police are warning the publ...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Courts Crime,Faisal Halim,Viral Video,Acid Att...",PETALING JAYA: The police are warning the publ...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
11,1355291,Tun M says MACC has not asked him to declare a...,PETALING JAYA: Tun Dr Mahathir Mohamad says he...,News,Nation,Complimentary,Short,,2024-05-23 00:00:00,"Mahathir Mohamad,CNDB,MACC,Asset,Declaration,I...",PETALING JAYA: Tun Dr Mahathir Mohamad says he...,https://www.thestar.com.my/news/nation/2024/05...,https://apicms.thestar.com.my/uploads/images/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17753,1246234,Dazzling holiday feasts that delight,A WONDERFUL time like Christmas calls for an e...,Food,Food News,Complimentary,Medium,,2023-12-22 00:00:00,Food News,A WONDERFUL time like Christmas calls for an e...,https://www.thestar.com.my/food/food-news/2023...,https://apicms.thestar.com.my/uploads/images/2...
17754,1246232,Wishing for a fairy-tale Christmas,"DESSERT for dinner? Why not, when a whimsical ...",Food,Food News,Complimentary,Medium,,2023-12-22 00:00:00,"Food News,Sedap restaurant,buffet,pullman,chri...","DESSERT for dinner? Why not, when a whimsical ...",https://www.thestar.com.my/food/food-news/2023...,https://apicms.thestar.com.my/uploads/images/2...
17755,1246202,Get into the Christmas spirits: Gift ideas for...,There are only a few more sleeps left to Chris...,Lifestyle,Living,Complimentary,Long,Michael Cheang,2023-12-21 00:00:00,"Tipsy Turvy,whisky,cognac,Martell,Singleton",If you are looking for a gift for someone who ...,https://www.thestar.com.my/lifestyle/living/ti...,https://apicms.thestar.com.my/uploads/images/2...
17756,1245369,Hearty meaty affair this Yuletide,MANY have childhood memories that are rooted i...,Metro,Metro News,Complimentary,Medium,,2023-12-21 00:00:00,"food,Christmas,Le meridian KL",MANY have childhood memories that are rooted i...,https://www.thestar.com.my/metro/metro-news/20...,https://apicms.thestar.com.my/uploads/images/2...


In [14]:
# Write the merged, deduplicated dataset to news_v1.csv without the row index.
df_updated.to_csv("news_v1.csv", index=False)