# Assignment 1

From the command line, first make sure to run the setup script to install requirements:

```shell 
bash setup.sh
```

#### Importing packages

In [61]:
import spacy
import os
import re
import pandas as pd

# Load the spaCy pipeline once up front; the same `nlp` object is reused for every file processed below.
nlp = spacy.load("en_core_web_md")

## Loop over each text file in the "in" folder

In [57]:

# Function to clean each text (NOTE: there might be an issue with this cleaning; it should be double-checked)
def clean_text(text):
    """
    Replace layout characters and angle-bracket spans in a text with spaces.

    Args:
        text (string): the raw text to clean

    Returns:
        text (string): the text where every newline, every tab and every
        "<...>" span has been replaced by a single space character
    """
    layout_chars = re.compile("[\n\t]")
    # non-greedy, so each "<...>" span is matched and replaced on its own
    bracketed = re.compile("<.*?>")
    # flatten newlines/tabs first: "." does not match a newline, so a span
    # broken across lines is only caught once it sits on a single line
    flattened = layout_chars.sub(" ", text)
    return bracketed.sub(" ", flattened)


## Find the relative frequency of Nouns, Verbs, Adjectives, and Adverbs per 10,000 words

In [58]:
def rel_freq(doc):
    """Find the relative frequencies of nouns, verbs, adjectives and adverbs.

    Every token's ``pos_`` tag is tallied; the tally for each of the four
    parts of speech is then scaled to a rate per 10,000 and rounded to
    2 decimal places.

    Args:
        doc (nlp object): an iterable of tokens that each expose ``pos_``;
            it must also support ``len()``, which is used as the denominator

    Returns:
        tuple of four floats: the relative frequencies in the order
        (NOUN, VERB, ADJ, ADV). An empty doc yields (0.0, 0.0, 0.0, 0.0)
        instead of raising ZeroDivisionError.
    """
    # NOTE(review): len(doc) presumably counts punctuation/whitespace tokens
    # too - confirm that this is the intended "per 10,000 words" denominator.
    total = len(doc)
    if total == 0:
        # e.g. an empty input file: there is nothing to count or divide by
        return 0.0, 0.0, 0.0, 0.0

    # insertion order fixes the order of the returned tuple
    counts = {"NOUN": 0, "VERB": 0, "ADJ": 0, "ADV": 0}
    for token in doc:
        if token.pos_ in counts:
            counts[token.pos_] += 1

    return tuple(round((count / total) * 10000, 2) for count in counts.values())


## Find the total number of *unique* PERSON, LOC, and ORG entities

In [59]:
def unique_ent(doc):
    """
    Count the distinct PERSON, LOC and ORG entity texts in an nlp object.

    Args:
        doc (nlp object): object whose ``ents`` are read; each entity's
            ``label_`` selects the category and its ``text`` is what is
            compared for uniqueness

    Returns:
        three integers: the number of unique PERSON texts, unique LOC texts
        and unique ORG texts, in that order
    """
    # one set per tracked label; a set keeps each entity text only once
    seen = {"PERSON": set(), "LOC": set(), "ORG": set()}
    for entity in doc.ents:
        bucket = seen.get(entity.label_)
        # entities with any other label are ignored
        if bucket is not None:
            bucket.add(entity.text)
    return len(seen["PERSON"]), len(seen["LOC"]), len(seen["ORG"])
 

In [60]:

# Root folder of the corpus; every sub-folder in it produces one CSV in ../out
path = os.path.join("..", "in", "USEcorpus")
out_dir = os.path.join("..", "out")
# make sure the output folder exists so to_csv does not fail on a fresh checkout
os.makedirs(out_dir, exist_ok=True)

# column layout shared by every per-folder CSV
columns = ["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV", "Unique PERSON", "Unique LOC", "Unique ORG"]

# sorted() keeps folder and row order reproducible (os.listdir order is arbitrary)
for folder in sorted(os.listdir(path)):
    # going into every folder within the USEcorpus directory
    folder_path = os.path.join(path, folder)
    # skip stray non-directory entries sitting next to the corpus folders
    if not os.path.isdir(folder_path):
        continue
    # collect one row per file and build the data frame once at the end,
    # instead of re-concatenating a growing data frame for every file
    rows = []
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        # only regular files can be opened and read as text
        if not os.path.isfile(file_path):
            continue
        # reading each file
        with open(file_path, "r", encoding="latin-1") as f:
            raw_text = f.read()
        # cleaning the text and making the nlp object
        doc = nlp(clean_text(raw_text))
        # filename, then the four relative frequencies, then the three unique entity counts
        rows.append((file, *rel_freq(doc), *unique_ent(doc)))
    df = pd.DataFrame(rows, columns=columns)
    # one CSV per corpus folder, named after that folder
    outpath = os.path.join(out_dir, f"{folder}.csv")
    df.to_csv(outpath)