In [1]:
import pandas as pd
from ftfy import fix_text

# Note: article_id 1025 , article text is too large which results in taking up multiple rows of the csv file
# It just make the csv ugly, because out of no where multiple rows text appear, you can just drop the article 1025 yourself if you want the csv look nice nice
df_articles_content = pd.read_csv("articles_content.csv")
df_articles_urls = pd.read_csv("articles_url.csv")

# Setting the column Types 
articles_ids = df_articles_content["article_id"].astype(int)

# Sort article_id 
df_articles_sort_by_id = df_articles_content.sort_values(by='article_id')

# Remove duplicate article_id
df_sorted_by_id = df_articles_sort_by_id.drop_duplicates(subset=['article_id'], keep="first")

print(len(df_sorted_by_id))
print(len(df_articles_urls))

8544
8544


In [2]:
# Check what are the rows that are empty, and duplicated rows
df_sorted_by_id.info()
# df.dtypes

<class 'pandas.core.frame.DataFrame'>
Index: 8544 entries, 0 to 8575
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   article_id                 8544 non-null   int64 
 1   article_headline           8544 non-null   object
 2   article_short_description  6479 non-null   object
 3   article_text               8529 non-null   object
 4   article_url                8544 non-null   object
 5   article_author_name        7605 non-null   object
 6   article_author_link        7605 non-null   object
 7   article_datetime_released  8532 non-null   object
 8   datetime_crawled           8544 non-null   object
dtypes: int64(1), object(8)
memory usage: 667.5+ KB


In [3]:
expected_ids = set(range(1, 8545))  

# Get the actual IDs present in the dataset
actual_ids = set(articles_ids)

# Find missing IDs
missing_ids = sorted(expected_ids - actual_ids)

print(f"Total Missing IDs: {len(missing_ids)}")
print("Missing Article IDs:", missing_ids)

Total Missing IDs: 0
Missing Article IDs: []


In [4]:
# Find missing or empty article_text entries
missing_text_articles = df_sorted_by_id[df_sorted_by_id["article_text"].isna() | (df_sorted_by_id["article_text"].str.strip() == "")]

# Display the missing article IDs
if not missing_text_articles.empty:
    print("Missing Article Text Entries:")
    print(missing_text_articles[["article_id", "article_headline"]])
    
    # Save the missing IDs to a CSV file for reference
    missing_text_articles[["article_id", "article_headline","article_url"]].to_csv("missing_article_text.csv", index=False)
    print("Saved missing article IDs to 'missing_article_text.csv'")
else:
    print("✅ No missing article_text found!")

Missing Article Text Entries:
      article_id                                   article_headline
3860        1701  Singapore Turf Club: A lookback, through the a...
4640        1848  As it happened: MRT East-West line disruption,...
4922        1905  In pictures: Action from the 2024 Formula 1 Si...
1410        2596  NDP 2024: The National Day Parade that rain co...
3281        3101  NDP 2024: Behind the scenes at the parade | In...
7479        3959  Lee Hsien Loong: 20 years as Prime Minister an...
8172        5459  Indonesia, Singapore sign outline pledge on ca...
2777        5984  The dad who killed his twin sons: Investigatin...
5209        6521  A deep dive into COE prices in Singapore: Does...
1141        7062  Dyslexia: What it’s like and how to cope | Int...
3478        7469  Women and pregnancy: What to know if you opt f...
4883        7740  Interactive: The LKY100 coin - How the commemo...
4889        7741  Interactive: The LKY100 coin - How the commemo...
4980        7759  

In [5]:
# Drop empty article_text row or rows
df_sorted_by_id = df_sorted_by_id.dropna(subset=["article_text"])  # Drops NaN values
df_sorted_by_id = df_sorted_by_id[df_sorted_by_id["article_text"].str.strip() != ""]  # Drops empty text

# From 8544 rows to 8529 rows
df_sorted_by_id.info()

# Remove the row where article_id is 1025
# df_sorted_by_id = df_sorted_by_id[df_sorted_by_id["article_id"] != 1025]

# df_sorted_by_id.to_csv("test.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 8529 entries, 0 to 8575
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   article_id                 8529 non-null   int64 
 1   article_headline           8529 non-null   object
 2   article_short_description  6477 non-null   object
 3   article_text               8529 non-null   object
 4   article_url                8529 non-null   object
 5   article_author_name        7601 non-null   object
 6   article_author_link        7601 non-null   object
 7   article_datetime_released  8529 non-null   object
 8   datetime_crawled           8529 non-null   object
dtypes: int64(1), object(8)
memory usage: 666.3+ KB


In [6]:
df_sorted_by_id["article_short_description"].fillna("No Short Description", inplace=True)
df_sorted_by_id["article_text"].fillna("Different CSS Class, Thus No Article Text", inplace=True)
df_sorted_by_id["article_author_name"].fillna("No Author Name", inplace=True)
df_sorted_by_id["article_author_link"].fillna("No Author Link", inplace=True)
df_sorted_by_id["article_datetime_released"].fillna("No Date Released", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sorted_by_id["article_short_description"].fillna("No Short Description", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sorted_by_id["article_text"].fillna("Different CSS Class, Thus No Article Text", inplace=True)
The behavior will change in pandas 3.0. This

In [7]:
# Alright, Now all the values all being filled
# Time to remove unwanted text and werid characters from the article_text
df_sorted_by_id.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8529 entries, 0 to 8575
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   article_id                 8529 non-null   int64 
 1   article_headline           8529 non-null   object
 2   article_short_description  8529 non-null   object
 3   article_text               8529 non-null   object
 4   article_url                8529 non-null   object
 5   article_author_name        8529 non-null   object
 6   article_author_link        8529 non-null   object
 7   article_datetime_released  8529 non-null   object
 8   datetime_crawled           8529 non-null   object
dtypes: int64(1), object(8)
memory usage: 666.3+ KB


In [9]:
# Define unwanted text patterns
unwanted_texts = ["SINGAPORE â€”", "SINGAPORE:","ADVERTISEMENT"]

# Function to clean the text
def clean_article_text(text):
    text = fix_text(text)
    if isinstance(text, str):  # Ensure text is a string before replacing
        for unwanted in unwanted_texts:
            text = text.replace(unwanted, "").strip()
    return text

# Apply the cleaning function to the 'article_text' column
df_sorted_by_id["article_text"] = df_sorted_by_id["article_text"].apply(clean_article_text)
df_sorted_by_id["article_headline"] = df_sorted_by_id["article_headline"].apply(clean_article_text)
df_sorted_by_id["article_short_description"] = df_sorted_by_id["article_short_description"].apply(clean_article_text)

# Save the cleaned data to a new CSV file
cleaned_file_path = "articles_content_cleaned.csv"
df_sorted_by_id.to_csv("articles_content_cleaned.csv", index=False, encoding="utf-8-sig")

print(f"Cleaned file saved as: {cleaned_file_path}")

Cleaned file saved as: articles_content_cleaned.csv
