<a href="https://colab.research.google.com/github/aneeq-shaffy/SE4050-Deep-Learning/blob/main/XLM_RoBERTa_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing**

### Importing necessary libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
import torch
import warnings
warnings.filterwarnings('ignore')
import json
import requests

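### Reading the dataset

The articles are downloaded from the `data` branch of the `nuuuwan/lk_news` GitHub repository, one folder per article.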

In [5]:
def read_lk_news_automatically(year='2021', num_articles=99999, timeout=30):
    """Download the news articles of one year from the nuuuwan/lk_news repo.

    The GitHub contents API is queried for the year's folder listing; every
    sub-directory is treated as one article whose ``doc.json`` (metadata) and
    ``doc.txt`` (body) are fetched from raw.githubusercontent.com.

    Args:
        year: Year folder name under ``data/lk_news/2020s`` (e.g. ``'2021'``).
        num_articles: Maximum number of article folders to download.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook.

    Returns:
        A DataFrame with the columns ``id``, ``text``, ``title``, ``language``
        and ``date`` (empty if nothing could be loaded), or ``None`` when the
        folder listing itself could not be retrieved.

    NOTE(review): the folder listing appears to be capped at 1000 entries per
    directory by the GitHub contents API - confirm against the API docs if
    more articles per year are needed.
    """
    columns = ['id', 'text', 'title', 'language', 'date']

    # Get folder list from GitHub
    api_url = f"https://api.github.com/repos/nuuuwan/lk_news/contents/data/lk_news/2020s/{year}?ref=data"
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error getting folder list: {exc}")
        return None

    if response.status_code != 200:
        print("Error getting folder list")
        return None

    try:
        entries = response.json()
    except ValueError:
        print("Error getting folder list")
        return None
    if not isinstance(entries, list):
        # A directory listing is a JSON array; anything else is not usable here.
        print("Error getting folder list")
        return None

    # Keep only article directories *before* applying the limit, so stray
    # files in the year folder do not eat into the num_articles quota.
    folders = [
        entry for entry in entries
        if isinstance(entry, dict) and entry.get('type') == 'dir' and 'name' in entry
    ]
    print(f"Found {len(folders)} articles in {year}")

    all_articles = []
    base_url = f"https://raw.githubusercontent.com/nuuuwan/lk_news/data/data/lk_news/2020s/{year}"

    # Download each article
    for folder in folders[:num_articles]:
        folder_name = folder['name']

        try:
            # Get files; raise_for_status() prevents an HTTP error page
            # (e.g. "404: Not Found") from being stored as article content.
            meta_response = requests.get(f"{base_url}/{folder_name}/doc.json", timeout=timeout)
            meta_response.raise_for_status()
            metadata = meta_response.json()
            if not isinstance(metadata, dict):
                raise ValueError("doc.json is not a JSON object")

            text_response = requests.get(f"{base_url}/{folder_name}/doc.txt", timeout=timeout)
            text_response.raise_for_status()
            text = text_response.text
        except (requests.RequestException, ValueError) as exc:
            # Best effort: skip the broken article, but let KeyboardInterrupt
            # and programming errors propagate instead of swallowing them.
            print(f"Failed: {folder_name} ({exc})")
            continue

        # Store article
        all_articles.append({
            'id': metadata.get('doc_id', ''),
            'text': text,
            'title': metadata.get('description', ''),
            'language': metadata.get('lang', ''),
            'date': metadata.get('date_str', '')
        })

        if len(all_articles) % 10 == 0:
            print(f"Loaded {len(all_articles)} articles...")

    # Explicit columns keep the schema stable even when no article was loaded.
    return pd.DataFrame(all_articles, columns=columns)

In [6]:
def get_all_news(years=['2021', '2022', '2023', '2024', '2025'], articles_per_year=50):
    """Download articles for several years and combine them into one DataFrame.

    Args:
        years: Iterable of year folder names to fetch (the default list is
            only read, never mutated).
        articles_per_year: Maximum number of articles to download per year.

    Returns:
        A single DataFrame with the rows of every year that could be loaded,
        re-indexed from 0. If no year produced data, an empty DataFrame with
        the columns ``id``, ``text``, ``title``, ``language`` and ``date`` is
        returned.
    """
    all_dfs = []

    for year in years:
        print(f"\nGetting {year}...")
        df_year = read_lk_news_automatically(year, articles_per_year)
        # A None result means the year's listing failed; skip that year
        # rather than aborting the whole run.
        if df_year is not None:
            all_dfs.append(df_year)

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list, so build the empty
        # result explicitly with the same columns the per-year loader emits.
        final_df = pd.DataFrame(columns=['id', 'text', 'title', 'language', 'date'])

    print(f"\n✅ Total: {len(final_df)} articles")
    return final_df

In [8]:
# Fetch up to 50 articles for each year from 2021 through 2025
df = get_all_news(
    years=['2021', '2022', '2023', '2024', '2025'],
    articles_per_year=50,
)


Getting 2021...
Found 4 articles in 2021

Getting 2022...
Found 19 articles in 2022
Loaded 10 articles...

Getting 2023...
Found 173 articles in 2023
Loaded 10 articles...
Loaded 20 articles...
Loaded 30 articles...
Loaded 40 articles...
Loaded 50 articles...

Getting 2024...
Found 1000 articles in 2024
Loaded 10 articles...
Loaded 20 articles...
Loaded 30 articles...
Loaded 40 articles...
Loaded 50 articles...

Getting 2025...
Found 1000 articles in 2025
Loaded 10 articles...
Loaded 20 articles...
Loaded 30 articles...
Loaded 40 articles...
Loaded 50 articles...

✅ Total: 173 articles


In [9]:
# Quick sanity check: dimensions, column names and the first row
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst article:")
first_article = df.iloc[0]
print(first_article)

Shape: (173, 5)
Columns: ['id', 'text', 'title', 'language', 'date']

First article:
id                                  2021-09-12-adalk-095b8035
text        නොවැම්බර් මස සිට WhatsApp අත්හිටුවන ජංගම දුරකථ...
title       නොවැම්බර් මස සිට WhatsApp අත්හිටුවන ජංගම දුරකථ...
language                                                   si
date                                               2021-09-12
Name: 0, dtype: object


In [10]:
# Write the collected articles to a CSV file, leaving out the row index
df.to_csv(path_or_buf='lk_news_data.csv', index=False)
print("💾 Saved to lk_news_data.csv")

💾 Saved to lk_news_data.csv
