## Import Libraries

In [1]:
import datasets
import re
from collections import defaultdict, Counter
import random
from langdetect import detect, detect_langs, LangDetectException

In [2]:
# Load the realnewslike subset of C4
print("Loading C4 realnewslike dataset...")
dataset = datasets.load_dataset("allenai/c4", "realnewslike", split="train", streaming=True)


Loading C4 realnewslike dataset...


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/512 [00:00<?, ?it/s]

In [None]:
def detect_languages_with_langdetect(text):
    languages_found = []
    
    try:
        # Split text into chunks to analyze different parts
        chunks = [text[i:i+1000] for i in range(0, len(text), 500)][:5]  # Max 5 chunks
        
        detected_languages = set()
        
        for chunk in chunks:
            if len(chunk.strip()) < 50:  # Skip very short chunks
                continue
                
            try:
                # Get primary language
                primary_lang = detect(chunk)
                if primary_lang != 'en':
                    detected_languages.add(primary_lang)
                
                # Get confidence scores for multiple languages
                lang_probs = detect_langs(chunk)
                for lang_prob in lang_probs:
                    if lang_prob.lang != 'en' and lang_prob.prob > 0.3:  # Confidence threshold
                        detected_languages.add(lang_prob.lang)
                        
            except LangDetectException:
                continue
        
        # Convert to list with dummy confidence scores
        languages_found = [(lang, 1.0) for lang in detected_languages]
        
    except LangDetectException:
        # Fallback: try to detect on full text
        try:
            detected = detect(text)
            if detected != 'en':
                languages_found = [(detected, 1.0)]
        except LangDetectException:
            pass
    
    return languages_found

In [None]:
def analyze_context(text, max_chars=500):
    # Look for common contexts where non-English appears
    contexts = []
    
    # Check for quotations
    if '"' in text or '"' in text or '"' in text:
        contexts.append("quoted_text")
    
    # Check for URLs/domains
    if re.search(r'https?://[^\s]+|www\.[^\s]+|\.[a-z]{2,3}/', text):
        contexts.append("web_content")
    
    # Check for names/proper nouns (multiple capitalized words)
    if len(re.findall(r'\b[A-Z][a-z]+\s+[A-Z][a-z]+', text)) > 2:
        contexts.append("proper_nouns")
    
    # Check for mixed language (English + other)
    english_words = len(re.findall(r'\b(the|and|or|but|in|on|at|to|for|of|with|by)\b', text, re.IGNORECASE))
    if english_words > 5:
        contexts.append("mixed_language")
    
    return contexts

In [5]:
# Analyze samples
print("\nAnalyzing C4 samples for non-English content...")
non_english_examples = []
context_counts = Counter()
language_counts = Counter()

# Process samples
sample_count = 0
for example in dataset:
    if sample_count >= 1000:  # Limit for demonstration
        break
    
    text = example['text']
    languages = detect_languages_with_langdetect(text)
    
    if languages:
        contexts = analyze_context(text)
        
        # Store example
        non_english_examples.append({
            'text': text[:300] + "..." if len(text) > 300 else text,
            'languages': languages,
            'contexts': contexts,
            'url': example.get('url', 'N/A')
        })
        
        # Update counters
        for lang, score in languages:
            language_counts[lang] += 1
        for context in contexts:
            context_counts[context] += 1
    
    sample_count += 1
    if sample_count % 100 == 0:
        print(f"Processed {sample_count} samples...")

# Results
print(f"\n=== ANALYSIS RESULTS ===")
print(f"Processed {sample_count} samples")
print(f"Found {len(non_english_examples)} samples with non-English content")

# Language code mapping for better display
LANGUAGE_NAMES = {
    'es': 'Spanish', 'fr': 'French', 'de': 'German', 'it': 'Italian',
    'pt': 'Portuguese', 'ru': 'Russian', 'ja': 'Japanese', 'ko': 'Korean',
    'zh-cn': 'Chinese', 'ar': 'Arabic', 'hi': 'Hindi', 'nl': 'Dutch',
    'sv': 'Swedish', 'da': 'Danish', 'no': 'Norwegian', 'fi': 'Finnish',
    'pl': 'Polish', 'cs': 'Czech', 'hu': 'Hungarian', 'ro': 'Romanian',
    'tr': 'Turkish', 'el': 'Greek', 'he': 'Hebrew', 'th': 'Thai',
    'vi': 'Vietnamese', 'id': 'Indonesian', 'ms': 'Malay', 'tl': 'Filipino'
}

def get_language_name(code):
    return LANGUAGE_NAMES.get(code, code.upper())



Analyzing C4 samples for non-English content...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
Processed 800 samples...
Processed 900 samples...
Processed 1000 samples...

=== ANALYSIS RESULTS ===
Processed 1000 samples
Found 3 samples with non-English content


In [6]:
print(f"\n=== LANGUAGE DISTRIBUTION ===")
for lang, count in language_counts.most_common():
    lang_name = get_language_name(lang)
    print(f"{lang_name} ({lang}): {count} instances")



=== LANGUAGE DISTRIBUTION ===
Italian (it): 1 instances
AF (af): 1 instances
SW (sw): 1 instances


In [7]:
print(f"\n=== CONTEXT ANALYSIS ===")
for context, count in context_counts.most_common():
    print(f"{context.replace('_', ' ').title()}: {count} instances")


=== CONTEXT ANALYSIS ===
Mixed Language: 3 instances
Proper Nouns: 2 instances


In [8]:
print(f"\n=== EXAMPLE NON-ENGLISH FRAGMENTS ===")
# Show a few examples from each language
shown_languages = set()
for example in non_english_examples[:10]:
    for lang, score in example['languages']:
        if lang not in shown_languages:
            lang_name = get_language_name(lang)
            print(f"\n--- {lang_name.upper()} ({lang}) EXAMPLE ---")
            print(f"Contexts: {', '.join(example['contexts'])}")
            print(f"URL: {example['url']}")
            print(f"Text: {example['text']}")
            shown_languages.add(lang)
            break


=== EXAMPLE NON-ENGLISH FRAGMENTS ===

--- ITALIAN (it) EXAMPLE ---
Contexts: proper_nouns, mixed_language
URL: https://www.dcourier.com/news/2017/jan/01/births-announcements-part-ii-january-1-2016/
Text: Seren Rayne Frank Sutherland, a six lb., eight oz., girl, was born Saturday, Dec. 3, 2016, at Yavapai Regional Medical Center to Donell Sutherland and Adam Frank of Prescott.
Alexander Velasco, a six lb., 12 oz., boy, was born Wednesday, Dec. 7, 2016, at Yavapai Regional Medical Center to Erika Avit...

--- AF (af) EXAMPLE ---
Contexts: mixed_language
URL: https://independentjobs.independent.co.uk/job/9739075/audio-visual-project-manager-events-industry/
Text: A live events and bespoke creative technical solutions company based in Loughborough is seeking a technical project manager to join their growing production team.
You may currently be working be working as an audio-visual/AV project manager or technical production manager in live events or you may b...

--- SW (sw) EXAMPLE ---


In [None]:
# Analysis for LLM learning potential
print(f"\n=== LLM LEARNING POTENTIAL ANALYSIS ===")

def analyze_learning_potential(examples):
    
    fragment_lengths = []
    coherent_fragments = 0
    mixed_content = 0
    
    for example in examples:
        text = example['text']
        fragment_lengths.append(len(text))
        
        # Check if fragment is coherent (has sentence structure)
        sentences = re.split(r'[.!?]+', text)
        complete_sentences = [s for s in sentences if len(s.strip()) > 20]
        
        if len(complete_sentences) >= 2:
            coherent_fragments += 1
            
        if 'mixed_language' in example['contexts']:
            mixed_content += 1
    
    avg_length = sum(fragment_lengths) / len(fragment_lengths) if fragment_lengths else 0
    
    print(f"Average fragment length: {avg_length:.0f} characters")
    print(f"Coherent fragments (2+ sentences): {coherent_fragments}/{len(examples)} ({coherent_fragments/len(examples)*100:.1f}%)")
    print(f"Mixed language content: {mixed_content}/{len(examples)} ({mixed_content/len(examples)*100:.1f}%)")
    
    # Learning assessment
    print(f"\n=== LEARNING ASSESSMENT ===")
    if avg_length > 100 and coherent_fragments > len(examples) * 0.3:
        print("✓ GOOD: Fragments are long enough and coherent enough for learning")
    else:
        print("✗ LIMITED: Fragments may be too short or incoherent for effective learning")
        
    if mixed_content > len(examples) * 0.5:
        print("✓ GOOD: Mixed language content provides translation context")
    else:
        print("✗ LIMITED: Little mixed-language content for translation learning")
    
    return {
        'avg_length': avg_length,
        'coherent_ratio': coherent_fragments / len(examples),
        'mixed_ratio': mixed_content / len(examples)
    }

if non_english_examples:
    stats = analyze_learning_potential(non_english_examples)
    
    print(f"\n=== CONCLUSIONS ===")
    print("Based on this analysis of C4's 'English' dataset:")
    print("1. Non-English content DOES persist despite filtering")
    print("2. Common contexts: web content, quoted text, proper nouns, mixed language")
    print("3. Learning potential depends on:")
    print(f"   - Fragment coherence: {stats['coherent_ratio']:.1%}")
    print(f"   - Mixed language context: {stats['mixed_ratio']:.1%}")
    print(f"   - Average length: {stats['avg_length']:.0f} chars")
    
    if stats['coherent_ratio'] > 0.2 and stats['avg_length'] > 50:
        print("4. ✓ An LLM could potentially learn some vocabulary and patterns")
        print("5. ✓ But would likely have limited fluency due to fragmented exposure")
    else:
        print("4. ✗ Learning would be very limited due to fragmented, short content")
        
else:
    print("No non-English content found in the sampled data.")
    print("This could mean:")
    print("1. The langdetect filtering was very effective for this subset")
    print("2. The sample size was too small - try increasing from 1000")
    print("3. The confidence threshold (0.3) was too high")
    print("Try: increasing sample size, lowering confidence threshold, or using different C4 subsets")


=== LLM LEARNING POTENTIAL ANALYSIS ===
Average fragment length: 303 characters
Coherent fragments (2+ sentences): 3/3 (100.0%)
Mixed language content: 3/3 (100.0%)

=== LEARNING ASSESSMENT ===
✓ GOOD: Fragments are long enough and coherent enough for learning
✓ GOOD: Mixed language content provides translation context

=== CONCLUSIONS ===
Based on this analysis of C4's 'English' dataset:
1. Non-English content DOES persist despite filtering
2. Common contexts: web content, quoted text, proper nouns, mixed language
3. Learning potential depends on:
   - Fragment coherence: 100.0%
   - Mixed language context: 100.0%
   - Average length: 303 chars
4. ✓ An LLM could potentially learn some vocabulary and patterns
5. ✓ But would likely have limited fluency due to fragmented exposure
