## Import libraries

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
from collections import Counter
import re

In [3]:
# Load the realnewslike subset of C4
print("Loading C4 realnewslike dataset...")
realnewslike = load_dataset("allenai/c4", "realnewslike", 
                           streaming=True, split="train")
print(realnewslike)

Loading C4 realnewslike dataset...


README.md:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/512 [00:00<?, ?it/s]

IterableDataset({
    features: ['text', 'timestamp', 'url'],
    num_shards: 512
})


In [4]:
# Define common stop words to remove
stop_words = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 
    'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'can', 'shall', 'this', 'that',
    'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'its',
    'our', 'their', 'from', 'up', 'about', 'into', 'over', 'after',
    'as', 'so', 'if', 'than', 'when', 'where', 'why', 'how', 'all',
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'than', 'too',
    'very', 'just', 'now'
}

# Initialize word counter
word_counter = Counter()

# Process documents
print("Processing documents...")
for i, example in enumerate(realnewslike):
    # Extract text and convert to lowercase
    text = example["text"].lower()
    
    # Split by whitespace to get words
    words = text.split()
    
    # Filter words: remove stop words and keep only alphabetic words
    filtered_words = [
        word.strip('.,!?";:()[]{}') for word in words 
        if word.strip('.,!?";:()[]{}').isalpha() 
        and word.strip('.,!?";:()[]{}') not in stop_words
        and len(word.strip('.,!?";:()[]{}')) > 2  # Remove very short words
    ]
    
    # Update counter
    word_counter.update(filtered_words)
    
    # Print progress every 1000 documents
    if i % 1000 == 0:
        print(f"Processed {i} documents...")
    
    # Limit processing for demonstration (remove this for full dataset)
    if i == 50000:  # Process 50,000 documents
        break

print(f"\nProcessing complete! Found {len(word_counter)} unique words.")

# Display most common words
print("\nTop 50 most frequent words:")
print("-" * 50)
for word, count in word_counter.most_common(50):
    print(f"{word:<20} {count:>8}")

# Analyze topics by looking at word categories
print("\n" + "="*60)
print("TOPIC ANALYSIS")
print("="*60)

Processing documents...
Processed 0 documents...
Processed 1000 documents...
Processed 2000 documents...
Processed 3000 documents...
Processed 4000 documents...
Processed 5000 documents...
Processed 6000 documents...
Processed 7000 documents...
Processed 8000 documents...
Processed 9000 documents...
Processed 10000 documents...
Processed 11000 documents...
Processed 12000 documents...
Processed 13000 documents...
Processed 14000 documents...
Processed 15000 documents...
Processed 16000 documents...
Processed 17000 documents...
Processed 18000 documents...
Processed 19000 documents...
Processed 20000 documents...
Processed 21000 documents...
Processed 22000 documents...
Processed 23000 documents...
Processed 24000 documents...
Processed 25000 documents...
Processed 26000 documents...
Processed 27000 documents...
Processed 28000 documents...
Processed 29000 documents...
Processed 30000 documents...
Processed 31000 documents...
Processed 32000 documents...
Processed 33000 documents...
Pro

In [5]:
# Technology words
tech_words = ['technology', 'computer', 'internet', 'digital', 'software', 
              'app', 'smartphone', 'online', 'website', 'data', 'ai', 
              'artificial', 'intelligence', 'tech', 'cyber', 'bitcoin']
tech_count = sum(word_counter[word] for word in tech_words)

# Politics words
politics_words = ['government', 'president', 'political', 'election', 'vote',
                  'congress', 'senate', 'democrat', 'republican', 'policy',
                  'law', 'legislation', 'campaign', 'politician']
politics_count = sum(word_counter[word] for word in politics_words)

# Sports words
sports_words = ['game', 'team', 'player', 'season', 'football', 'basketball',
                'baseball', 'soccer', 'sports', 'championship', 'coach',
                'score', 'win', 'league']
sports_count = sum(word_counter[word] for word in sports_words)

# Business/Economy words
business_words = ['business', 'company', 'market', 'economy', 'economic',
                  'financial', 'money', 'price', 'cost', 'profit', 'investment',
                  'stock', 'bank', 'trade', 'industry']
business_count = sum(word_counter[word] for word in business_words)

# Health words
health_words = ['health', 'medical', 'hospital', 'doctor', 'patient',
                'treatment', 'medicine', 'disease', 'virus', 'vaccine',
                'covid', 'pandemic', 'healthcare']
health_count = sum(word_counter[word] for word in health_words)

# Entertainment words  
entertainment_words = ['movie', 'film', 'music', 'show', 'entertainment',
                      'celebrity', 'actor', 'singer', 'hollywood', 'television',
                      'tv', 'series', 'netflix']
entertainment_count = sum(word_counter[word] for word in entertainment_words)

print(f"Technology words total count: {tech_count}")
print(f"Politics words total count: {politics_count}")
print(f"Sports words total count: {sports_count}")
print(f"Business/Economy words total count: {business_count}")
print(f"Health words total count: {health_count}")
print(f"Entertainment words total count: {entertainment_count}")

# Calculate total processed words
total_words = sum(word_counter.values())
print(f"\nTotal words processed: {total_words:,}")

print("\nTopic representation (as percentage of total words):")
print(f"Technology: {(tech_count/total_words)*100:.2f}%")
print(f"Politics: {(politics_count/total_words)*100:.2f}%")
print(f"Sports: {(sports_count/total_words)*100:.2f}%")
print(f"Business/Economy: {(business_count/total_words)*100:.2f}%")
print(f"Health: {(health_count/total_words)*100:.2f}%")
print(f"Entertainment: {(entertainment_count/total_words)*100:.2f}%")

# Look for potentially underrepresented topics
print("\n" + "="*60)
print("OBSERVATIONS ON TOPIC REPRESENTATION")
print("="*60)

underrepresented_words = ['science', 'research', 'education', 'environment',
                         'climate', 'art', 'culture', 'history', 'philosophy',
                         'literature', 'book', 'academic', 'university']

print("Potentially underrepresented topics:")
for word in underrepresented_words:
    count = word_counter[word]
    percentage = (count/total_words)*100 if total_words > 0 else 0
    print(f"{word:<15} {count:>6} ({percentage:.3f}%)")

print("\nNote: This analysis is based on a sample of the dataset.")
print("For complete analysis, remove the iteration limit and process the full dataset.")

Technology words total count: 44615
Politics words total count: 76658
Sports words total count: 67532
Business/Economy words total count: 95290
Health words total count: 27180
Entertainment words total count: 37771

Total words processed: 11,842,240

Topic representation (as percentage of total words):
Technology: 0.38%
Politics: 0.65%
Sports: 0.57%
Business/Economy: 0.80%
Health: 0.23%
Entertainment: 0.32%

OBSERVATIONS ON TOPIC REPRESENTATION
Potentially underrepresented topics:
science           2760 (0.023%)
research          6334 (0.053%)
education         5570 (0.047%)
environment       2614 (0.022%)
climate           2365 (0.020%)
art               3233 (0.027%)
culture           2148 (0.018%)
history           5291 (0.045%)
philosophy         455 (0.004%)
literature         404 (0.003%)
book              4736 (0.040%)
academic           964 (0.008%)
university        8021 (0.068%)

Note: This analysis is based on a sample of the dataset.
For complete analysis, remove the iterat