In [2]:
#load the libraries
import json
import bz2
import pandas as pd
import orjson

def rel_freq(x, name, *, threshold=0.70, output_path="engtexts.txt",
             prefix_length=34):
    """Record *name* as an English text when enough of its pages are English.

    The share of ``'en'`` entries among the non-``None`` values of *x* is
    computed; when that share is strictly greater than *threshold*, *name*
    (with its first *prefix_length* characters removed) is appended as one
    line to *output_path*.

    Args:
        x: Iterable of per-page language codes; ``None`` entries are ignored.
        name: Path of the file the language codes came from.
        threshold: Minimum share (exclusive) of English pages required.
        output_path: File that qualifying names are appended to.
        prefix_length: Number of leading characters stripped from *name*
            before writing. The default of 34 is the length of the
            '../../Volumes/My Passport for Mac/' folder prefix.

    Returns:
        None. A message is printed when *x* holds no usable language data
        or when *output_path* cannot be written.
    """
    #filter out None values
    valid_languages = [value for value in x if value is not None]

    #nothing to measure, so report it and skip this file
    if not valid_languages:
        print(f"No valid language data found in {name}, skipping...")
        return

    #share of pages whose language is English
    english_share = valid_languages.count('en') / len(valid_languages)
    if english_share <= threshold:
        return

    #append mode so results from earlier files and earlier runs are kept
    try:
        with open(output_path, "a") as myfile:
            myfile.write(name[prefix_length:] + "\n")
    except OSError as e:
        print(f"Error writing to engtexts.txt: {e}")

#process one compressed JSON file and hand its page languages to rel_freq
def process_file(name):
    """Read the bz2-compressed JSON at *name* and check its page languages.

    The archive is decompressed, decoded as UTF-8 and parsed. When the
    parsed document has a ``features`` entry whose ``pages`` value is a
    list, every page's non-``None`` ``calculatedLanguage`` is collected and
    passed to ``rel_freq`` together with *name*. Any exception raised along
    the way is printed and swallowed, so a bad file does not stop the caller.
    """
    try:
        with bz2.BZ2File(name, 'rb') as compressed:
            #decompress, decode and parse in one go
            raw_text = compressed.read().decode('utf-8')
            document = orjson.loads(raw_text)

            #a missing 'features' entry is treated as having no pages
            features = document.get('features', {})

            #nothing to do unless 'pages' is present and is a list
            if 'pages' not in features or not isinstance(features['pages'], list):
                return

            #gather the language of every page that reports one
            detected = []
            for page in features['pages']:
                language = page.get('calculatedLanguage')
                if language is not None:
                    detected.append(language)

            #record the file if more than 70% of its pages are English
            rel_freq(detected, name)
    except Exception as e:
        print(f"Error processing {name}: {e}")

#pull in the list of filenames + paths
file_path = 'file_listing.txt'

#one path per line
with open(file_path, 'r') as file:
    path = file.read().splitlines()

#load all the ids we've already confirmed as being written in english;
#the file does not exist before the first run, which simply means
#nothing has been recorded yet
english_texts_path = 'engtexts.txt'
try:
    with open(english_texts_path, 'r') as file:
        english_texts = file.read().splitlines()
except FileNotFoundError:
    english_texts = []

#find the position of the last english text in the list of paths
#(-1 when nothing has been recorded, so the slice below keeps everything);
#path.index raises ValueError if the recorded id is not in the listing
index = path.index(english_texts[-1]) if english_texts else -1

#resume AFTER the last recorded text: starting at it would process it
#again and append a duplicate line to engtexts.txt
path = path[index + 1:]

#add the actual folder locations to the start of each id
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in path]

#for every json we have yet to load, extract the page languages and
#record the file if it is mostly English; process_file prints and skips
#files that cannot be read or parsed, so one bad archive does not end the run
for name in paths:
    process_file(name)

No valid language data found in ../../Volumes/My Passport for Mac/wu/8071/wu.89105874911.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8072/wu.89105874929.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8073/wu.89105874531.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8074/wu.89105874945.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8076/wu.89105874267.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8077/wu.89105874275.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8077/wu.89105874473.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8078/wu.89105874580.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/wu/8078/wu.89105874689.json.bz2, skipping...
N