In [1]:
from langdetect import detect

text = "This is an English sentence."
language = detect(text)  # Returns 'en' for English
print(language)

en


In [4]:
import os
from langdetect import detect
from langdetect import LangDetectException

def check_english_files(folder_path, limit=100):
    """
    Reads the first 50 text files in a folder and checks if they're in English.
    
    Args:
        folder_path (str): Path to the folder containing text files
        limit (int): Maximum number of files to check
        
    Returns:
        dict: Dictionary with filenames as keys and language detection results as values
    """
    results = {}
    count = 0
    
    # Check if the directory exists
    if not os.path.isdir(folder_path):
        print(f"Error: Directory '{folder_path}' does not exist.")
        return results
    
    # Get all files in the directory
    all_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    
    # Filter for text files (assuming files with .txt extension are text files)
    text_files = [f for f in all_files if f.lower().endswith('.txt')]
    
    # Process the files up to the limit
    for filename in text_files[:limit]:
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                # Read the content
                content = file.read().strip()
                
                # Skip empty files
                if not content:
                    results[filename] = {"is_english": False, "language": "unknown", "error": "Empty file"}
                    continue
                
                # Detect language
                detected_lang = detect(content)
                is_english = detected_lang == 'en'
                
                results[filename] = {
                    "is_english": is_english,
                    "language": detected_lang,
                    "error": None
                }
        
        except LangDetectException as e:
            results[filename] = {"is_english": False, "language": "unknown", "error": f"Language detection error: {str(e)}"}
        except Exception as e:
            results[filename] = {"is_english": False, "language": "unknown", "error": f"Error processing file: {str(e)}"}
        
        count += 1
        if count >= limit:
            break
    
    return results

def print_summary(results):
    """Print a summary of the language detection results"""
    total = len(results)
    english_count = sum(1 for file_result in results.values() if file_result["is_english"])
    non_english_count = total - english_count
    
    print(f"\nSummary:")
    print(f"Total files processed: {total}")
    print(f"English files: {english_count}")
    print(f"Non-English files: {non_english_count}")
    
    if non_english_count > 0:
        print("\nNon-English files:")
        for filename, result in results.items():
            if not result["is_english"]:
                lang = result["language"]
                error = result["error"] or "None"
                print(f"- {filename}: {lang}")

if __name__ == "__main__":
    # Replace this with your folder path
    folder_path = "path/to/your/text/files"
    
    print(f"Checking English language in the first 50 text files in '{folder_path}'...")
    results = check_english_files('03_bonus_pretraining_on_gutenberg/gutenberg/data/raw')
    
    print_summary(results)

Checking English language in the first 50 text files in 'path/to/your/text/files'...

Summary:
Total files processed: 100
English files: 79
Non-English files: 21

Non-English files:
- PG53693_raw.txt: fi
- PG55293_raw.txt: fi
- PG54353_raw.txt: de
- PG17996_raw.txt: el
- PG41116_raw.txt: fr
- PG38548_raw.txt: fr
- PG66035_raw.txt: fr
- PG51279_raw.txt: fi
- PG30703_raw.txt: fr
- PG51386_raw.txt: de
- PG52880_raw.txt: fr
- PG42321_raw.txt: es
- PG41287_raw.txt: nl
- PG48676_raw.txt: sv
- PG60578_raw.txt: fi
- PG44940_raw.txt: nl
- PG47101_raw.txt: fi
- PG44856_raw.txt: fi
- PG49149_raw.txt: es
- PG42116_raw.txt: fi
- PG65073_raw.txt: it
