In [None]:
# %pip install pandas

In [None]:
import pandas as pd
from pathlib import Path

# Base directory of the dataset, resolved under the current user's home folder.
ROOT_PATH = Path.home().joinpath("SEED_DATA/impact_and_fiction/")

# Tab-separated table shipped with the dataset; loaded as-is and displayed.
work_isbn_genre = ROOT_PATH.joinpath("work_isbn_genre.tsv")
df = pd.read_csv(
    work_isbn_genre,
    sep="\t",
)
df

## Auxiliary Functions

In [None]:
import os
import gzip

def get_isbn_from_filename(filename: str):
    """Extract the ISBN part of a book filename.

    The ISBN is the second ``_``-separated field of ``filename``, cut off at
    its first ``-`` (the whole field is returned when it contains no ``-``).

    Raises:
        IndexError: if ``filename`` contains no ``_``.
    """
    second_field = filename.split('_')[1]
    isbn, _dash, _remainder = second_field.partition('-')
    return isbn

def get_isbn_from_filenames(filenames: list[str]):
    """Return the ISBN of every filename, in input order.

    Each element is passed through ``get_isbn_from_filename``; any error that
    helper raises for a malformed name propagates unchanged.
    """
    return list(map(get_isbn_from_filename, filenames))

def get_book_content(dir_name, file_name, idx) -> list[str]:
    """Read a gzip-compressed text file and return its tokens, lower-cased.

    Args:
        dir_name: Directory that contains the file.
        file_name: Name of the gzip file inside ``dir_name``.
        idx: Counter shown in the progress message; not used otherwise.

    Returns:
        Every whitespace-separated token of the file, lower-cased, in file
        order, as one flat list.
    """
    print(f"{idx} unzipping: {file_name}")
    all_words = []
    # The context manager closes the gzip handle even if decoding a line fails;
    # the previous bare gzip.open(...) in the for-header left it open.
    with gzip.open(os.path.join(dir_name, file_name), 'rt') as book_file:
        for line in book_file:
            all_words.extend(word.lower() for word in line.split())
    return all_words

## Get All Valid Files in novels_tokens folder

In [None]:
import json

novels_tokens = ROOT_PATH / "novels_tokens"

# Index of the last directory entry to process. -1 never equals an index, so
# every file is used; limit = N stops after N + 1 files.
limit = -1
token_data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and any `limit` subset) reproducible between runs.
for idx, file_name in enumerate(sorted(os.listdir(novels_tokens))):
    # Tokenisation is switched off, so token_count is always 0. To count real
    # tokens use: get_book_content(novels_tokens, file_name, idx)
    book_tokens = []
    token_data.append((get_isbn_from_filename(file_name), file_name, len(book_tokens)))
    if idx == limit:
        break

token_df = pd.DataFrame(token_data, columns=['isbn', 'book_filename', 'token_count'])
token_df = token_df.set_index('book_filename')

# Mapping: book filename -> {"isbn": ..., "token_count": ...}
filename2isbn = token_df.to_dict(orient='index')
# Write through a context manager so the JSON file is flushed and closed.
with open("data/filename2isbn.json", "w", encoding="utf-8") as json_file:
    json.dump(filename2isbn, json_file, indent=2)

token_df

## Get the definitive lists of unique ISBNs (partitioned into Fiction vs. Non-Fiction)

In [None]:
def get_unique_isbns_df(filename, skip_files = None):
    """Load a CSV with a ``book`` column and return it indexed by ISBN.

    The ISBN of each row is the second ``_``-separated field of its ``book``
    value. When ``skip_files`` is non-empty, the ISBN of each listed filename
    (second ``_`` field, cut at the first ``-``) is written, one per line, to
    ``data/non_fiction_isbns.txt`` and rows with those ISBNs are removed.

    Args:
        filename: Path of the CSV file to read.
        skip_files: Optional list of filenames whose ISBNs must be excluded.

    Returns:
        The remaining rows without the ``book`` column, indexed by ``isbn``.
    """
    table = pd.read_csv(filename)
    # NOTE(review): unlike the skip list below, `book` values are not cut at
    # '-'; presumably they carry no suffix -- confirm against the CSV.
    table['isbn'] = [book_name.split('_')[1] for book_name in table.book]

    if skip_files:
        excluded = []
        for skipped_name in skip_files:
            excluded.append(skipped_name.split('_')[1].split('-')[0])
        with open("data/non_fiction_isbns.txt", "w") as out_file:
            out_file.writelines(isbn + "\n" for isbn in excluded)
        keep_mask = ~table['isbn'].isin(excluded)
        table = table[keep_mask]

    return table.drop(columns=['book']).set_index('isbn')

# Filenames (one per line) of the books to exclude; read inside a context
# manager so the handle is closed instead of being left to garbage collection.
with open("non_fiction_filenames.txt") as f:
    skip_files = f.read().splitlines()

# Any volatility output file should work. The total number of unique books is 18,467;
# the completely non-fiction books number 5,257.
# Built from ROOT_PATH rather than a hard-coded /Users/<name>/... path, so the
# notebook runs for any user who keeps the dataset under their home directory.
filename = str(ROOT_PATH / "volatilities" / "book_moors_volatility_300_few_words_normal.csv")
df = get_unique_isbns_df(filename, skip_files)

# Write the remaining ISBNs, one per line.
isbns = list(df.index)
with open("data/fiction_isbns.txt", "w") as f:
    f.writelines(x + "\n" for x in isbns)