In [2]:
import os
import nltk
import pathlib
import re
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer  
from nltk.tokenize import sent_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from textblob import TextBlob
# Shared NLP helpers, built once and reused by the functions defined below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Raw string so the backslash in \w is passed to the regex engine untouched
# (a plain string literal makes "\w" an invalid escape sequence).
# A token is a run of word characters and straight or curly apostrophes,
# which keeps contractions such as "don't" in one piece.
tokenizer = RegexpTokenizer(r"[\w’'']+")
english_stops = stopwords.words('english')

#this should change based on the folder names for each corpus held in the "data" folder
corpora = ["lesbian-fanfic","lesbian-pulp"]

#populate with lists of words that are associated with each other;
#each key becomes the suffix of a "top 10 coll - <key>" results column
associated_wordlists = {
    "body": ["body", "breasts", "hips", "lipstick", "hair", "brow", "arms", "legs", "lips","waist"],
    "intimacy": ["intimacy","tenderness", "tender", "loving", "warm", "warmth", "ache", "touch","touching","love"],
    "identity": ["lesbian","homosexual","gay","queer","bisexual","self","herself"]
                        }

In [3]:
"""
The functions in this block are "utility functions," which is to say they perform a task in service
of some other core analytic function, e.g. removing punctuation, getting function words, or reformatting
results like ngrams.
"""
def tokenizers(text):
    """
    Tokenizes a string two ways and returns both results as a pair:
    first the list of word tokens (from the module-level regex tokenizer),
    then the list of sentence tokens (from NLTK's sent_tokenize), which are
    kept so that ngrams can later be gathered sentence by sentence.
    """
    return tokenizer.tokenize(text), sent_tokenize(text)

def get_function_words():
    """
    Reads "data/function_words.txt" and returns its lines as a list of strings,
    one entry per line, with surrounding whitespace (including the newline) stripped.
    The file is opened as utf-8, matching file_reader(), and is closed as soon as
    it has been read rather than being left open.
    """
    filepath = pathlib.Path("data/function_words.txt")
    # the with-block guarantees the handle is released even if reading fails
    with open(filepath, encoding="utf-8") as fin:
        return [line.strip() for line in fin]

def remove_punctuation(text):
    """
    Returns a copy of the given string with every character found in
    string.punctuation deleted. Characters outside that set (for example
    curly quotes) are left alone.
    """
    # a translation table that maps each punctuation character to None deletes it
    deletion_table = str.maketrans(dict.fromkeys(punctuation))
    return text.translate(deletion_table)

def filter_stops(w):
    """
    Predicate handed to NLTK's collocation finder via apply_word_filter(), which
    discards any ngram containing a word for which the predicate is truthy.
    Returns the word itself when it should be discarded (shorter than three
    characters, or present in english_stops) and None when it should be kept.
    """
    should_discard = len(w) < 3 or w in english_stops
    return w if should_discard else None

def _split_into_segments(text, parts=15):
    """
    Cuts text into `parts` consecutive, non-overlapping pieces of roughly equal
    length whose concatenation is exactly the original text. Each cut is moved
    left to just after the nearest newline inside the current window so words and
    sentences are not split; if the window holds no newline the cut stays at the
    nominal position.
    """
    window = len(text) // parts
    segments = []
    start = 0
    for part in range(1, parts):
        nominal_end = window * part
        newline_at = text.rfind("\n", start, nominal_end)
        # rfind returns -1 when there is no newline between start and nominal_end
        end = newline_at + 1 if newline_at != -1 else nominal_end
        segments.append(text[start:end])
        start = end
    # the final piece takes whatever is left, including any rounding remainder
    segments.append(text[start:])
    return segments


def chapter_segmenter(text):
    """
    Receives read txt file (or any string) and returns a list of strings.
    The list's items will each be a chapter in the book assuming they are marked as "chapter X\n"
    such that X is any mix of lowercase letters, numbers, periods, or colons. Every chapter but
    the last holds the text between its header line and the next header; the last chapter runs
    from its own header to the end of the text. If there are not more than two chapter
    markers, the text is instead broken into 15 roughly equal segments.

    Chapters are located by the position of each header match, so repeated header text,
    regex metacharacters in a header, and empty chapters no longer cause wrong slices or errors.
    """
    headers = list(re.finditer(r"chapter [a-z0-9:\.]+(?=\n)", text))

    # more than two headers are required, to avoid false positives from stray
    # mentions of "chapter ..." in running text
    if len(headers) <= 2:
        return _split_into_segments(text)

    chapters = list()
    for current, following in zip(headers, headers[1:]):
        # the match stops right before the newline that ends the header line,
        # so the chapter body begins one character after the match
        chapters.append(text[current.end() + 1:following.start()])
    chapters.append(text[headers[-1].start():])
    return chapters

def freq_result_rewriter(raw_result,column_head):
    """
    Turns one result into a comma-free string so it can be written as a single csv cell.
    Columns whose name starts with "top <number>" hold a list of (frequency, phrase) tuples;
    only the phrases are kept, joined by single spaces. When such a column name ends in
    "grams" or "collocations" each phrase is itself a tuple of words, so its words are joined
    by spaces and wrapped in single quotes to show where each ngram starts and ends.
    Any other column is simply converted with str().
    """
    # anything that is not a "top x" column is a plain value
    if re.search("^top [0-9]+", column_head) is None:
        return str(raw_result)

    holds_word_tuples = (re.search("grams$", column_head) is not None
                         or re.search("collocations$", column_head) is not None)
    if holds_word_tuples:
        phrases = ["'{}'".format(" ".join(entry[1])) for entry in raw_result]
    else:
        phrases = [entry[1] for entry in raw_result]

    # join() puts a space between phrases and none after the last one
    return " ".join(phrases)

def get_collocations(words,stops=None):
    """
    Builds an NLTK BigramCollocationFinder from a list of words and returns its bigram
    counts as a list of two-item tuples: a tuple of the two adjacent words, and the
    number of times that pair appears. Pairs that appear fewer than two times are dropped.
    Passing stops=False additionally drops every pair containing a word rejected by
    filter_stops(); any other value of stops (including the default) applies no word filter.
    """
    finder = BigramCollocationFinder.from_words(words)
    # only an explicit False switches the stop-word filter on
    if stops == False:
        finder.apply_word_filter(filter_stops)
    finder.apply_freq_filter(2)
    return list(finder.ngram_fd.items())

In [4]:
"""
The functions in this block are file-oriented, which is to say they are involved in converting
strings to file objects (e.g. a folder to a list of filepaths), or in writing out results to a
csv file.
"""

def folder_to_filepaths(folder):
    """
    Receives the name of a folder located inside the "data" folder and returns a list
    of Path objects, one per ".txt" file found directly in that folder.
    """
    corpus_dir = pathlib.Path("data/{}".format(folder))
    return list(corpus_dir.glob('*.txt'))

def file_reader(filepath):
    """
    Receives a Path object for a file, reads it as utf-8, and returns its text with
    leading/trailing whitespace removed and every character lowercased.
    """
    with open(filepath,encoding="utf-8") as source:
        raw_text = source.read()
    trimmed = raw_text.strip()
    return trimmed.lower()

def csv_creator(folder_name,header_items):
    """
    Creates (or overwrites) "results_<folder_name>.csv" in the working directory and
    writes the given column names into it as one comma-separated header row ending in a
    newline. With no column names the file is created empty.
    """
    target = pathlib.Path("results_{}.csv".format(folder_name))
    with open(target,"w") as csv_out:
        # nothing at all is written for an empty header, not even a newline
        if header_items:
            csv_out.write(",".join(header_items) + "\n")
                
def csv_creator_sentiments(folder_name,header_items):
    """
    Creates (or overwrites) "sentiments_<folder_name>.csv" in the working directory and
    writes the given column names into it as one comma-separated header row ending in a
    newline. Identical to csv_creator() apart from the file name. With no column names
    the file is created empty.
    """
    target = pathlib.Path("sentiments_{}.csv".format(folder_name))
    with open(target,"w") as csv_out:
        # nothing at all is written for an empty header, not even a newline
        if header_items:
            csv_out.write(",".join(header_items) + "\n")

def csv_writer(folder_name,results):
    """
    Appends one row to "results_<folder_name>.csv" for a single document or corpus.
    `results` maps each column name to its result; every value is passed through
    freq_result_rewriter() together with its column name so that "top x" lists become
    comma-free strings, and the formatted cells are written in the dictionary's order,
    separated by commas and terminated by a newline. An empty dictionary writes nothing.
    """
    target = pathlib.Path("results_{}.csv".format(folder_name))
    with open(target,"a") as csv_out:
        cells = [freq_result_rewriter(value, column) for column, value in results.items()]
        # join() leaves no trailing comma before the newline
        if cells:
            csv_out.write(",".join(cells) + "\n")
                
def csv_writer_sentiments(folder_name,results):
    """
    Receives the folder name for the corpus being analyzed and a dictionary mapping each
    filename to a dictionary that maps chapter numbers (1, 2, 3, ...) to sentiment scores.
    (Re)creates "sentiments_<folder_name>.csv" with a header of "filename" followed by the
    chapter numbers up to the largest chapter count, then writes one row per text: the
    filename followed by that text's score for each chapter.

    An empty `results` dictionary produces a file holding only the "filename" header, and
    a text with no chapters still gets its own newline-terminated row.
    """
    csv_path = pathlib.Path("sentiments_{}.csv".format(folder_name))

    # the header must reach the highest chapter number any text will use;
    # default=0 covers a corpus with no texts at all
    chapters_max = max((len(scores) for scores in results.values()), default=0)
    chapters_header = ["filename"]
    chapters_header.extend(str(number) for number in range(1, chapters_max + 1))
    csv_creator_sentiments(folder_name,chapters_header)

    with open(csv_path,"a") as csv_out:
        for text, scores in results.items():
            # chapter keys are expected to run from 1 to len(scores)
            row = [text]
            row.extend(str(scores[chapter]) for chapter in range(1, len(scores) + 1))
            # joining the whole row guarantees it ends with a newline, never a comma
            csv_out.write(",".join(row) + "\n")

In [5]:
"""
The functions in this block perform the analyses that will be
used in the file and folder processing (or top words/ngrams measures at the corpus level)
and ultimately written out to the results csvs. They focus on finding common words or sequences of words.
"""

def freq_words(words, num_words):
    """
    Receives a list of word tokens and the number of "top words" to find (let's call it x),
    and determines the x most frequently occurring words, ignoring any word listed by
    get_function_words().
    Returns a list of up to x tuples, each containing the count and then the word, sorted
    from highest to lowest; words with equal counts are ordered in reverse alphabetical order.
    """
    # a set makes each membership test constant-time instead of a scan of the whole list
    function_words = set(get_function_words())
    word_counts = dict()

    # tally every token that is not a function word
    for word in words:
        if word not in function_words:
            word_counts[word] = word_counts.get(word, 0) + 1

    # count goes first in each tuple so that sorting orders by frequency
    count_word_pairs = sorted(((count, word) for word, count in word_counts.items()),
                              reverse=True)
    return count_word_pairs[0:num_words]

def freq_ngrams(sentences, n, num_ngrams):
    """
    Receives a list of sentence tokens, the ngram size n, and how many "top ngrams" to
    return (x). Each sentence is tokenized on its own, so no ngram spans two sentences.
    Returns a list of at most x tuples sorted from most to least frequent, each holding
    the count followed by the tuple of words making up the ngram.
    """
    tallies = {}
    for sentence in sentences:
        for gram in nltk.ngrams(tokenizer.tokenize(sentence), n):
            tallies[gram] = tallies.get(gram, 0) + 1

    ranked = sorted(((total, gram) for gram, total in tallies.items()), reverse=True)

    # slicing past the end is harmless, so fewer than x ngrams simply returns them all
    return ranked[0:num_ngrams]

In [6]:
"""
The functions in this block perform the analyses that will be
used in the file and folder processing (or top words/ngrams measures at the corpus level)
and ultimately written out to the results csvs. They focus on analyses around pronouns.
"""

def top_pronoun_verb_pairs(words, num_words, pronoun):
    """
    Receives a list of word tokens and determines the top x number of verbs
    (in lemmatized form) most frequently associated with a given pronoun.
    For every occurrence of the pronoun, the pronoun and the next two tokens are
    part-of-speech tagged together; the token right after the pronoun counts if it is
    tagged VB, VBD or VBZ, and the token after that counts if it is tagged VBG or VBN.
    Returns a list of up to x tuples, each containing the count and then the verb lemma,
    sorted from highest to lowest.

    A pronoun sitting at the very end of the token list (with fewer than two tokens after
    it) is handled by checking only the positions that exist.
    """
    verb_counts = dict()

    for index, word in enumerate(words):
        if word != pronoun:
            continue

        # the pronoun plus up to two following tokens; shorter near the end of the list
        pos_tagged = nltk.pos_tag(words[index:index+3])

        # word directly after the pronoun, e.g. "she walked" -> "walk"
        if len(pos_tagged) > 1 and pos_tagged[1][1] in ("VB", "VBD", "VBZ"):
            lemma_form = lemmatizer.lemmatize(pos_tagged[1][0],"v")
            verb_counts[lemma_form] = verb_counts.get(lemma_form, 0) + 1

        # second word after the pronoun, so that "she is walking" also yields "walk"
        if len(pos_tagged) > 2 and pos_tagged[2][1] in ("VBG", "VBN"):
            lemma_form = lemmatizer.lemmatize(pos_tagged[2][0],"v")
            verb_counts[lemma_form] = verb_counts.get(lemma_form, 0) + 1

    # count goes first in each tuple so that sorting orders by frequency
    verb_count_list = sorted(((count, verb) for verb, count in verb_counts.items()),
                             reverse=True)
    return verb_count_list[0:num_words]

def pronoun_subj_ratio(words, pronoun_subj, pronoun_obj):
    """
    Receives a list of word tokens, the subj version of a pronoun (e.g. "she"),
    and a list of the object versions (e.g. ["her", "hers"]).
    This format does not predetermine the available pronouns.
    Returns a ratio in decimal form with the subject pronoun count as the numerator and
    the count of all given pronoun forms as the denominator. So, the higher the ratio,
    the higher the frequency of the subject pronoun.
    Returns 0.0 when none of the given pronoun forms occurs in the tokens, instead of
    failing on a division by zero.
    """
    subj_count = 0
    obj_count = 0
    for word in words:
        if word == pronoun_subj:
            subj_count += 1
        elif word in pronoun_obj:
            obj_count += 1

    total = subj_count + obj_count
    # a text without any form of this pronoun has no meaningful ratio
    if total == 0:
        return 0.0
    return subj_count / total

In [7]:
"""
The functions in this block perform the analyses that will be
used in the file and folder processing (or top words/ngrams measures at the corpus level)
and ultimately written out to the results csvs. They focus on analyses involving collocations and sentiment.
"""

def get_top_collocations(coll_list,num_colls):
    """
    Receives a list of (collocation, count) tuples and the number of top results to fetch.
    Returns the num_colls most frequent entries as (count, collocation) tuples, sorted
    from highest to lowest.
    """
    # the count is moved to the front of each tuple so that sorting orders by frequency
    by_count = sorted(((count, coll) for coll, count in coll_list), reverse=True)
    return by_count[0:num_colls]

def coll_associations(coll_list, associated_terms, num_results):
    """
    Receives a list of (collocation, count) tuples, a list of seed terms, and a number of
    desired top results. Seed terms and both words of every collocation are lemmatized;
    whenever one word of a collocation matches a seed lemma, the other word's lemma is
    credited with that collocation's count (if both words match, the second word is the
    one credited).
    Returns the top num_results (total count, lemma) tuples, sorted from highest to lowest.
    """
    seed_lemmas = {lemmatizer.lemmatize(term) for term in associated_terms}
    partner_totals = {}

    for terms, count in coll_list:
        first_lemma = lemmatizer.lemmatize(terms[0])
        second_lemma = lemmatizer.lemmatize(terms[1])

        # pick the word on the other side of the seed term; skip unrelated collocations
        if first_lemma in seed_lemmas:
            partner = second_lemma
        elif second_lemma in seed_lemmas:
            partner = first_lemma
        else:
            continue

        partner_totals[partner] = partner_totals.get(partner, 0) + count

    ranked = sorted(((total, partner) for partner, total in partner_totals.items()),
                    reverse=True)
    return ranked[0:num_results]

def sentiment_analysis(text):
    """
    Receives a read txt file (or any string) and returns two items:
    1) the TextBlob sentiment polarity of the entire text;
    2) a dictionary mapping each chapter number (starting at 1, in the order returned by
    chapter_segmenter()) to the TextBlob sentiment polarity of that chapter.
    """
    chapter_scores = {
        number: TextBlob(chapter).sentiment.polarity
        for number, chapter in enumerate(chapter_segmenter(text), start=1)
    }
    return TextBlob(text).sentiment.polarity, chapter_scores

In [12]:
"""
The functions in this block collate the file-level and corpus-level analytic functions,
as well as the file input-output functions, taking the code from start (folder names representing
corpora) to finish (writing out the results to csvs). In a sense, they provide the infrastructure
by which the more tool- and task-oriented functions come together to perform a set of analyses
and share the results with the user.
"""

def file_analysis(filepath):
    """
    Receives a Path object for a file, reads and tokenizes it, runs every document-level
    analysis, and returns four items:
    1) a dictionary associating the name of each measure with its result;
    2) a tuple of the word tokens and sentence tokens, for the corpus-level analysis;
    3) the document's list of collocations;
    4) a dictionary mapping chapter numbers to sentiment scores.
    """
    text = file_reader(filepath)
    word_tokens, sentence_tokens = tokenizers(text)
    collocations = get_collocations(word_tokens, stops = False)
    text_sentiment, chapter_sentiments = sentiment_analysis(text)

    # the order of the keys matters: it becomes the column order of the results csv
    doc_results = {
        "filename": filepath.name,
        "top 10 words": freq_words(word_tokens,10),
        "top 10 bigrams": freq_ngrams(sentence_tokens, 2, 10),
        "top 10 verbs_'she'": top_pronoun_verb_pairs(word_tokens, 10, "she"),
        "top 10 verbs_'he'": top_pronoun_verb_pairs(word_tokens, 10, "he"),
        "top 10 verbs_'they'": top_pronoun_verb_pairs(word_tokens, 10, "they"),
        "pronoun ratio_'she'": pronoun_subj_ratio(word_tokens, "she", ["her","hers"]),
        "pronoun ratio_'he'": pronoun_subj_ratio(word_tokens, "he", ["his","him"]),
        "pronoun ratio_'they'": pronoun_subj_ratio(word_tokens, "they", ["their","theirs"]),
        "top 10 collocations": get_top_collocations(collocations, 10),
        "sentiment polarity": text_sentiment,
    }

    # one extra column per list of associated terms, named after the list's key
    for wordlist_key, wordlist in associated_wordlists.items():
        doc_results["top 10 coll - {}".format(wordlist_key)] = coll_associations(collocations, wordlist, 10)

    return doc_results, (word_tokens, sentence_tokens), collocations, chapter_sentiments

def process_files(folder):
    """
    Receives a folder name for a corpus, acquires a list of all files within it,
    analyzes each using file_analysis(), and returns four items:
    1) a list containing the dictionary of results for each document;
    2) a tuple containing the combined word tokens and combined sentence tokens for the corpus;
    3) the combined list of collocations from every document;
    4) a dictionary mapping each file's name (without extension) to that file's dictionary
    of chapter numbers and sentiment polarities, as expected by csv_writer_sentiments().
    """
    per_file_results = []
    all_word_tokens = []
    all_sentence_tokens = []
    all_collocations = []
    sentiments_by_file = {}

    for path in folder_to_filepaths(folder):
        result, (word_tokens, sentence_tokens), colls, chapter_sents = file_analysis(path)
        per_file_results.append(result)
        all_word_tokens.extend(word_tokens)
        all_sentence_tokens.extend(sentence_tokens)
        all_collocations.extend(colls)
        sentiments_by_file[path.stem] = chapter_sents

    return (per_file_results, (all_word_tokens, all_sentence_tokens),
            all_collocations, sentiments_by_file)

def process_folder(folder):
    """
    Receives a folder name for a corpus, retrieves the per-document results and the
    corpus-level tokens and collocations using process_files(), computes the same measures
    for the corpus as a whole, and appends that corpus-level dictionary (with the folder
    name in the "filename" column) to the list of document results.
    Returns the completed list of results together with the per-file chapter sentiments,
    which are passed along untouched so they can be written out later.
    """
    documents_results, documents_tokens, documents_collocations, documents_sentiments = process_files(folder)
    print("file-level analysis done for corpus: {}".format(folder))
    corpus_words, corpus_sentences = documents_tokens

    # keys are listed in the same order as in the per-document dictionaries,
    # so the corpus row lines up with the csv header
    folder_result = {
        "filename": folder,
        "top 10 words": freq_words(corpus_words, 10),
        "top 10 bigrams": freq_ngrams(corpus_sentences, 2, 10),
        "top 10 verbs_'she'": top_pronoun_verb_pairs(corpus_words, 10, "she"),
        "top 10 verbs_'he'": top_pronoun_verb_pairs(corpus_words, 10, "he"),
        "top 10 verbs_'they'": top_pronoun_verb_pairs(corpus_words, 10, "they"),
        "pronoun ratio_'she'": pronoun_subj_ratio(corpus_words, "she", ["her","hers"]),
        "pronoun ratio_'he'": pronoun_subj_ratio(corpus_words, "he", ["his","him"]),
        "pronoun ratio_'they'": pronoun_subj_ratio(corpus_words, "they", ["their","theirs"]),
        "top 10 collocations": get_top_collocations(documents_collocations, 10),
        # a corpus-wide polarity is not computed; "na" keeps the cell from being blank
        "sentiment polarity": "na",
    }

    # one extra column per list of associated terms, named after the list's key
    for wordlist_key, wordlist in associated_wordlists.items():
        folder_result["top 10 coll - {}".format(wordlist_key)] = coll_associations(
            documents_collocations, wordlist, 10)

    print("corpus-level analysis done for corpus: {}".format(folder))
    documents_results.append(folder_result)
    return documents_results, documents_sentiments

def write_results(folder):
    """
    Receives a folder name and runs process_folder() on it to obtain the list of result
    dictionaries (documents plus corpus) and the per-file chapter sentiments. Creates the
    results csv with a header taken from the keys of the first result dictionary, appends
    every result dictionary as a row, then writes the sentiments csv.
    Returns the list of result dictionaries so it can be inspected afterwards.
    """
    corpus_results, corpus_sentiments = process_folder(folder)

    column_names = list(corpus_results[0])
    csv_creator(folder,column_names)
    for row in corpus_results:
        csv_writer(folder, row)

    csv_writer_sentiments(folder,corpus_sentiments)

    print("results written out to csvs for corpus: {}\n--------------------".format(folder))
    return corpus_results

def run():
    """
    Runs write_results() on every corpus named in the module-level `corpora` list (which
    should reflect the folder names in the "data" folder). The results of each corpus are
    stored, as soon as they are available, in the global dictionary `res_dict` under the
    corpus name; `res_dict` is reset at the start of every run.
    """
    global res_dict
    res_dict = {}
    for corpus_name in corpora:
        res_dict[corpus_name] = write_results(corpus_name)
    print("done!!")

In [13]:
# Executes the full pipeline for every corpus in `corpora`: writes the results and
# sentiments csvs and fills the global `res_dict` used by the cells below.
run()

file-level analysis done for corpus: lesbian-fanfic
corpus-level analysis done for corpus: lesbian-fanfic
results written out to csvs for corpus: lesbian-fanfic
--------------------
file-level analysis done for corpus: lesbian-pulp
corpus-level analysis done for corpus: lesbian-pulp
results written out to csvs for corpus: lesbian-pulp
--------------------
done!!


In [None]:
"""
The functions in this block run some basic analytics on results to help me
find inroads for my analysis. It's mostly just to help me and I wouldn't include it were I to share
this code, but I am temporarily including it here.
"""
# Collects the name of every result column found in the fanfic corpus results.
# Each entry of res_dict["lesbian-fanfic"] is one result dictionary, so iterating
# over it yields its keys (the column names); the values are left as None placeholders.
# Requires run() to have been executed first, since that is what defines res_dict.
commonalities = dict()

for text in res_dict["lesbian-fanfic"]:
    for result in text:
        commonalities[result] = None
        


def read_csv(corpus_name):
    """
    Reads "results_<corpus_name>.csv" from the working directory and returns its rows as
    a list of unparsed strings, header row first. The text is split on newlines and the
    final piece (whatever follows the last newline) is discarded.
    """
    source = pathlib.Path("results_{}.csv".format(corpus_name))
    with open(source,"r") as file_obj:
        pieces = file_obj.read().split("\n")
    # the file ends with a newline, so the last piece is an empty leftover
    return pieces[:-1]



# For each corpus: re-read its results csv, pool the space-separated entries of selected
# columns across all rows (every document plus the corpus row), and print the entries
# that occur at least three times in each column.
for corpus in corpora:
    csv_dict = dict()
    rows = read_csv(corpus)
    csv_dict["corpus"] = corpus
    column_heads = rows[0].split(",")
    # keep column 1, columns 3-5 and everything from column 9 onward; with the header
    # built in file_analysis() that skips the filename, the bigrams and the three
    # pronoun-ratio columns
    relevant_headers = column_heads[1:2]+column_heads[3:6]+column_heads[9:]
    for column_head in relevant_headers:
        csv_dict[column_head] = None
    for row in rows[1:]:
        split_row = row.split(",")
        split_row = split_row[1:6]+split_row[9:]
        # drops the second kept cell (original column 2) so the cells line up with
        # relevant_headers; note that remove() deletes the first cell EQUAL to that value
        split_row.remove(split_row[1])
        for column_ind in range(len(split_row)):
            # break the cell into whitespace-separated entries
            result = []
            for word in split_row[column_ind].split():
                result.append(word)
            # the first row seen for a column starts its list, later rows extend it
            if csv_dict[relevant_headers[column_ind]] == None:
                csv_dict[relevant_headers[column_ind]] = result
            else:
                csv_dict[relevant_headers[column_ind]].extend(result)
    print(corpus+"\n")
    for relevant_header in relevant_headers:
        # the general collocations column is left out of the printed summary
        if relevant_header != "top 10 collocations":
            print(relevant_header)
            counts = {}
            counts_bynum = []
            res_list = csv_dict[relevant_header]
            # tally how often each entry appears across the pooled rows
            for word in res_list:
                if word not in counts:
                    counts[word] = 1
                else:
                    counts[word] += 1
            # only entries appearing three or more times are reported, most frequent first
            for key in counts:
                if counts[key] >= 3:
                    counts_bynum.append((counts[key],key))
            counts_bynum.sort(reverse=True)
            print(counts_bynum)
            print("----------------------------------------")
    print("\n")