In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the scraped articles from the CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='InquirerScrape.csv')

In [3]:
# Structural overview of the frame: its (rows, columns) tuple, then the
# column labels as a plain Python list.
frame_shape = df.shape
column_labels = list(df.columns)
print('Dataset Shape:', frame_shape)
print('Columns:', column_labels)

Dataset Shape: (17, 7)
Columns: ['Categories', 'Titles', 'Links', 'Date', 'Time', 'Publisher', 'Article']


In [4]:
# Display the dtype pandas inferred for every column of `df`.
# NOTE(review): the recorded output lists Date and Time as `object`, i.e. they
# were not parsed into datetime values by the load step.
df.dtypes

Categories    object
Titles        object
Links         object
Date          object
Time          object
Publisher     object
Article       object
dtype: object

In [5]:
# Count the missing values in each column (`isna` is the same check as
# `isnull`, under its other name).
df.isna().sum()

Categories    0
Titles        0
Links         0
Date          1
Time          1
Publisher     0
Article       0
dtype: int64

In [9]:
# Character count of every article body, computed once and stored on the
# frame as the `Article_Length` column.
article_lengths = df['Article'].str.len()
df['Article_Length'] = article_lengths

# Mean, maximum and minimum of those character counts.
print('Average Article Length:', article_lengths.mean())
print('Max Article Length:', article_lengths.max())
print('Min Article Length:', article_lengths.min())

Average Article Length: 3003.176470588235
Max Article Length: 7201
Min Article Length: 426


In [10]:
# Number of articles per category, most frequent category first.
df.loc[:, 'Categories'].value_counts()

Categories
globalnation     4
newsinfo         4
entertainment    2
business         2
sports           1
inqfocus         1
Brandroom        1
Property         1
bandera          1
Name: count, dtype: int64

In [11]:
import spacy
from collections import Counter

In [12]:
# Load the spaCy pipeline named "en_core_web_sm". The resulting `nlp` object
# is called on article text further down to produce a document whose `.ents`
# are then inspected.
# NOTE(review): presumably the model package has to be installed in the
# kernel's environment beforehand — confirm.
nlp = spacy.load("en_core_web_sm")

In [15]:
# Take the first row of the frame as the sample: select the three fields of
# interest, grab row 0 and unpack its values in the same order.
test_article, test_title, test_category = (
    df[['Article', 'Titles', 'Categories']].iloc[0]
)

In [16]:
# Preview of the sample article: title, category, total character count and
# the first 200 characters of the body, one item per output line.
print(
    f"Title: {test_title}",
    f"Category: {test_category}",
    f"Article length: {len(test_article)} characters",
    f"First 200 characters: {test_article[:200]}...",
    sep="\n",
)

Title: ICC prosecutor asks court to deny Duterte bid to delay ruling
Category: globalnation
Article length: 3095 characters
First 200 characters:  Former President Rodrigo Duterte at the International Criminal Court. Screengrabbed from the ICC.  MANILA, Philippines — The prosecutor of the International Criminal Court (ICC) has formally asked th...


In [17]:
# Run the loaded spaCy pipeline over the sample article text; the entities it
# detected are exposed on the result as `doc.ents`.
doc = nlp(test_article)
# Report how many entities were found in this single article.
print(f'Number of entities in the article: {len(doc.ents)}')

Number of entities in the article: 32


In [None]:
# Group the text of every detected entity under its label. Labels keep the
# order in which they are first seen; repeated mentions are kept as duplicates
# so they can be counted below.
entities_by_type = {}
for span in doc.ents:
    entities_by_type.setdefault(span.label_, []).append(span.text)

# For each label, print the total number of mentions followed by its (up to)
# ten most frequently mentioned entity strings.
for ent_type, entities in entities_by_type.items():
    print(f"{ent_type}: {len(entities)} entities")
    for entity, count in Counter(entities).most_common(10):
        print(f"  {entity}: {count} mentions")
