In [2]:
# Install required packages in the notebook environment
%pip install rdflib scikit-learn seaborn matplotlib

Collecting rdflib
  Using cached rdflib-7.5.0-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     -------------------- ------------------- 30.7/60.8 kB 1.4 MB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 815.7 kB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached rdflib-7.5.0-py3-none-any.whl (587 kB)
Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
    ------


[notice] A new release of pip is available: 23.3.2 -> 25.3
[notice] To update, run: C:\Users\andre\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
from rdflib import Graph, Namespace
import pandas as pd

def load_news_data_from_rdf_xml(filename):
    """
    Parse an RDF/XML news ontology and flatten its articles into a table.

    The file is read into an rdflib graph and queried with SPARQL for every
    subject that carries all four of ``news:category``, ``news:headline``,
    ``news:short_description`` and ``news:place``. The query has no OPTIONAL
    parts, so a subject missing any of these properties yields no row.

    Args:
        filename: Location of the RDF/XML file; passed straight to ``Graph.parse``.

    Returns:
        pandas DataFrame with the columns ``article_id``, ``category``,
        ``headline``, ``short_description`` and ``place`` (all values converted
        with ``str``), one row per query solution.
    """
    graph = Graph()
    graph.parse(filename, format='xml')

    # Namespace bound to the "news:" prefix used inside the query text
    news_ns = Namespace("http://www.example.org/news#")

    sparql = """
    SELECT ?article ?category ?headline ?description ?place
    WHERE {
        ?article news:category ?category .
        ?article news:headline ?headline .
        ?article news:short_description ?description .
        ?article news:place ?place .
    }
    """

    # One accumulator per output column; key order fixes the DataFrame column order
    columns = {
        'article_id': [],
        'category': [],
        'headline': [],
        'short_description': [],
        'place': [],
    }

    for solution in graph.query(sparql, initNs={'news': news_ns}):
        # The article ID is whatever follows the last "/" of the subject URI
        columns['article_id'].append(str(solution.article).rsplit("/", 1)[-1])
        columns['category'].append(str(solution.category))
        columns['headline'].append(str(solution.headline))
        columns['short_description'].append(str(solution.description))
        columns['place'].append(str(solution.place))

    return pd.DataFrame(columns)

# Parse the ontology; the path is relative to the notebook's working directory
print("Loading news data from RDF/XML ontology...")
news_df = load_news_data_from_rdf_xml('../News_Categorizer_RDF.xml')

# Summarise what was loaded: row count, shape, a preview and the category balance.
# Each header/value pair goes through one print call; sep="\n" puts the value on
# the line after its header.
print(f"Successfully loaded {len(news_df)} news articles!")
print(f"\nDataset shape: {news_df.shape}")
print("\nFirst 5 articles:", news_df.head(), sep="\n")
print("\nCategories found:", news_df['category'].value_counts(), sep="\n")

# Save the flattened table as CSV (without the index column)
news_df.to_csv('parsed_news_data.csv', index=False)

Loading news data from RDF/XML ontology...
Successfully loaded 9999 news articles!

Dataset shape: (9999, 5)

First 5 articles:
  article_id  category                                           headline  \
0   Article1  WELLNESS              143 Miles in 35 Days: Lessons Learned   
1   Article2  WELLNESS       Talking to Yourself: Crazy or Crazy Helpful?   
2   Article3  WELLNESS  Crenezumab: Trial Will Gauge Whether Alzheimer...   
3   Article4  WELLNESS                     Oh, What a Difference She Made   
4   Article5  WELLNESS                                   Green Superfoods   

                                   short_description     place  
0  Resting is part of training. I've confirmed wh...  Torrance  
1  Think of talking to yourself as a tool to coac...   Norwalk  
2  The clock is ticking for the United States to ...   Norwalk  
3  If you want to be busy, keep trying to be perf...   Norwalk  
4  First, the bad news: Soda bread, corned beef a...   Norwalk  

Categories found:
