In [1]:
# libraries we need
import json
import bz2
import pandas as pd
import orjson
from concurrent.futures import ProcessPoolExecutor

# function for calculating relative frequencies
def rel_freq(x, name, *, threshold=0.70, output_path="engtexts.txt",
             prefix_length=34):
    """Record a file as English when most of its pages are tagged 'en'.

    The share of 'en' entries among the non-null language codes in ``x`` is
    compared with ``threshold``; when it is strictly greater, ``name`` (minus
    its first ``prefix_length`` characters) is appended as one line to
    ``output_path``.

    Args:
        x: Iterable of per-page language codes. ``None`` entries are ignored.
        name: Path of the file the codes were read from. Used in the
            "skipping" message and, sliced, as the line written out.
        threshold: Minimum share of English pages, exclusive. A file sitting
            exactly on the threshold is not recorded.
        output_path: File that receives the recorded ids (opened in append
            mode, so it is created on first use).
        prefix_length: Number of leading characters removed from ``name``
            before writing. The default of 34 equals the length of the
            '../../Volumes/My Passport for Mac/' folder prefix that the
            script in this file prepends to every id.

    Returns:
        None. The function only prints and/or appends to ``output_path``.
    """
    # Drop pages that carry no language tag at all.
    valid_languages = [value for value in x if value is not None]

    # Nothing to measure: report it and leave the output file untouched.
    if not valid_languages:
        print(f"No valid language data found in {name}, skipping...")
        return

    en_count = valid_languages.count('en')

    # Strict comparison: exactly `threshold` does not qualify.
    if en_count / len(valid_languages) <= threshold:
        return

    try:
        with open(output_path, "a") as myfile:
            myfile.write(name[prefix_length:] + "\n")
    except IOError as e:
        # Best effort: a failed write is reported but must not stop the run.
        print(f"Error writing to {output_path}: {e}")

# process file function
def process_file(name):
    """Load one bz2-compressed JSON file and pass its page languages on.

    The file at ``name`` is decompressed, decoded as UTF-8 and parsed with
    orjson. When the parsed object has a ``features`` entry whose ``pages``
    value is a list, every non-null ``calculatedLanguage`` found on those
    pages is collected and handed to ``rel_freq`` together with ``name``.

    Any exception raised along the way is caught and reported with a printed
    message, so a single bad file does not propagate an error to the caller.

    Args:
        name: Path of the ``.json.bz2`` file to read.

    Returns:
        None.
    """
    try:
        with bz2.BZ2File(name, 'rb') as compressed:
            raw_bytes = compressed.read()
            document = orjson.loads(raw_bytes.decode('utf-8'))

            feature_block = document.get('features', {})

            # Nothing to do unless 'pages' is present and really is a list.
            if ('pages' not in feature_block
                    or not isinstance(feature_block['pages'], list)):
                return

            page_languages = []
            for page in feature_block['pages']:
                language = page.get('calculatedLanguage')
                if language is not None:
                    page_languages.append(language)

            # Decide whether this file counts as English.
            rel_freq(page_languages, name)

    except Exception as e:

        print(f"Error processing {name}: {e}")

#pull in list of file_names and paths
file_path = 'file_listing.txt'

# Read the listing: one file id (path relative to the data folder) per line.
with open(file_path, 'r') as file:
    path = file.read().splitlines()

# Load the ids already confirmed as English by an earlier run.
# rel_freq opens this file in append mode, so on a fresh run it does not
# exist yet; treat that the same as "nothing processed so far".
english_texts = 'engtexts.txt'
try:
    with open(english_texts, 'r') as file:
        english_texts = file.read().splitlines()
except FileNotFoundError:
    english_texts = []

# Position of the last recorded id in the listing, or -1 when there is
# nothing to resume from. path.index raises ValueError if the recorded id is
# not in the listing, which signals that the two files do not belong together.
index = path.index(english_texts[-1]) if english_texts else -1

# Resume strictly AFTER the last recorded id. Starting at `index` itself
# would process that file again and append a duplicate line to engtexts.txt.
path = path[index + 1:]

# Folder that holds the data; prepended to every id to get a usable path.
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in path]

# Defined up front so the name exists even when there is nothing left to do.
calculated_languages = []

# Handle every file that has not been looked at yet.
for name in paths:
    # Start from a clean list so languages never leak between files.
    calculated_languages = []

    # Decompress, decode and parse the JSON document.
    with bz2.BZ2File(name) as input_file:
        input_file_content = input_file.read()
        json_input = input_file_content.decode('utf-8')
        data = orjson.loads(json_input)

        # Collect the language of every page that has a non-empty one.
        features = data.get('features', {})
        if 'pages' in features and isinstance(features['pages'], list):
            for page in features['pages']:
                calculated_language = page.get('calculatedLanguage')
                if calculated_language:
                    calculated_languages.append(calculated_language)

    # Record the file in engtexts.txt if it is predominantly English.
    rel_freq(calculated_languages, name)

No valid language data found in ../../Volumes/My Passport for Mac/chi/066/chi.086651677.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/chi/079/chi.086713990.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/chi/082/chi.086828290.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/chi/083/chi.085827364.json.bz2, skipping...
No valid language data found in ../../Volumes/My Passport for Mac/chi/084/chi.084894473.json.bz2, skipping...


[]

['en',
 'zh',
 'de',
 'an',
 'gl',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',
 'es',

In [5]:
# Display the current contents of calculated_languages.
# NOTE(review): presumably this is the list built for the most recent file
# handled by the processing loop; the cells run in between are not visible
# here, so confirm before relying on it.
calculated_languages

['ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',
 'ar',