In [19]:
import csv
import re

import pandas as pd

def print_csv(file):
    """Debug helper: print every row of a CSV file as a list of strings.

    Args:
        file: Path of the CSV file to read.
    """
    # newline='' is what the csv module asks for: it stops Python's universal
    # newline handling from rewriting line breaks that sit inside quoted fields.
    with open(file, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            print(row)

In [20]:
# Names searched for in each chunk of text (see mentions_character).
# "Jeremiah" was previously misspelled "Jeremaih" and therefore never matched.
character_names = [
    "Aaron", "Abel", "Abigail", "Abraham", "Absalom", "Achan", "Adam", "Adonijah",
    "Amos", "Andrew", "Anna", "Apollos", "Aquila", "Asa", "Asher", "Barnabas",
    "Bartholomew", "Boaz", "Cain", "Cornelius", "Cyrus", "Daniel", "David", "Deborah",
    "Elijah", "Elisha", "Elizabeth", "Esau", "Esther", "Eve", "Ezekiel", "Felix",
    "Gideon", "Hannah", "Hiram", "Hosea", "Huldah", "Isaac", "Isaiah", "Jacob",
    "James", "Japheth", "Jared", "Jairus", "Jeremiah", "Jesus", "Job", "Joel",
    "John", "John the Baptist", "Jonah", "Jonathan", "Joseph", "Joshua", "Josiah",
    "Judah", "Jude", "Junia", "Lazarus", "Levi", "Lot", "Luke", "Lydia", "Manasseh",
    "Martha", "Mary", "Mary Magdalene", "Matthias", "Melchizedek", "Micah", "Miriam",
    "Mordecai", "Moses", "Nahum", "Nathan", "Nathanael", "Nehemiah", "Nicodemus",
    "Noah", "Obadiah", "Paul", "Peter", "Philip", "Priscilla", "Rachel", "Rahab",
    "Rhoda", "Ruth", "Samson", "Samuel", "Sarah", "Saul", "Seth", "Shem", "Silas",
    "Simon", "Solomon", "Thomas", "Zechariah"
]

# Translation code -> chunked CSV that generate_filtered_csv samples from.
bible_paths = dict(
    asv="../bibles_chunked/asv.csv",
    fbv="../bibles_chunked/fbv.csv",
    web="../bibles_chunked/web.csv",
    wmb="../bibles_chunked/wmb.csv",
    kjv="../bibles_chunked/kjv.csv",
)

# Translation code -> CSV read by filter_by_character / filter_by_char_no_kjv.
test_paths = dict(
    asv="../../manual_annotation/nick/asv.csv",
    fbv="../../manual_annotation/nick/fbv.csv",
    web="../../manual_annotation/nick/web.csv",
    wmb="../../manual_annotation/nick/wmb.csv",
    kjv="../../manual_annotation/nick/kjv.csv",
)

In [21]:
def mentions_character(text):
    """Report whether ``text`` mentions any name in ``character_names``.

    Names are matched case-sensitively as whole words, so "Eve" does not
    match "Evening" and "Lot" does not match "Lots". A possessive such as
    "David's" still matches "David".

    Args:
        text: The chunk to inspect. Anything that is not a ``str`` (for
            example the NaN pandas yields for an empty CSV cell) is treated
            as mentioning nobody.

    Returns:
        True if at least one name occurs in ``text``, otherwise False.
    """
    if not isinstance(text, str):
        return False
    # re.escape keeps each name literal; \b anchors it to word boundaries.
    return any(
        re.search(rf"\b{re.escape(name)}\b", text) is not None
        for name in character_names
    )

def filter_chunks(df):
    """Keep only the rows of ``df`` whose 'text' value passes mentions_character."""
    keep_row = df['text'].apply(mentions_character)
    return df[keep_row]

def generate_filtered_csv(character_names, num_chunks, random_state=None):
    """Write a random sample of character-mentioning chunks for each bible.

    For every entry in ``bible_paths`` the CSV is read, reduced with
    ``filter_chunks``, sampled, and written to ``<bible_name>.csv`` in the
    current working directory (without the index column).

    Args:
        character_names: Accepted for interface compatibility but not read
            here; filtering goes through ``filter_chunks``.
        num_chunks: Maximum number of chunks to write per bible. When fewer
            chunks match, all matching chunks are written instead of raising.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a run
            can be reproduced. ``None`` keeps the sampling unseeded.
    """
    for bible_name, path in bible_paths.items():
        chunks = pd.read_csv(path)
        matching = filter_chunks(chunks)
        # DataFrame.sample raises ValueError when asked for more rows than
        # exist, which would abort the loop after only some files were written.
        sample_size = min(num_chunks, len(matching))
        sampled = matching.sample(n=sample_size, random_state=random_state)
        sampled.to_csv(f"{bible_name}.csv", index=False)  # export

def _sentiment_table(skip=()):
    """Build the per-translation chunk/sentiment table from ``test_paths``.

    Args:
        skip: Translation keys of ``test_paths`` to leave out.

    Returns:
        A DataFrame whose columns are a two-level index of
        (translation, 'chunk' | 'sentiment'). Rows are aligned on the index
        each filtered frame carries. An empty DataFrame is returned when no
        translation is loaded.
    """
    per_bible = {}
    for bible_name, path in test_paths.items():
        if bible_name in skip:
            continue
        annotated = pd.read_csv(path)
        per_bible[bible_name] = filter_chunks(annotated)[['chunk', 'sentiment']]
    if not per_bible:
        # pd.concat refuses an empty mapping.
        return pd.DataFrame()
    # The DataFrame constructor does not accept a dict of DataFrames;
    # concat along axis=1 places them side by side, keyed by translation.
    return pd.concat(per_bible, axis=1)


def filter_by_character(character_names):
    """
    filters bible chunks mentioning any character and creates a table with columns
    for each bible translation containing chunk and respective sentiment values.

    ``character_names`` is accepted for interface compatibility but is not read
    here; filtering goes through ``filter_chunks``.
    """
    return _sentiment_table()


def filter_by_char_no_kjv(character_names):
    ''' same as filter_by_character but omitting KJV '''
    return _sentiment_table(skip=('kjv',))

In [22]:
# generate_filtered_csv(character_names, num_chunks=10)
# filter_by_character(character_names)
# print_csv("../../manual_annotation/nick/asv.csv")
