# Language Analytics - Assignment 1 - Extracting linguistic features using spaCy


### Loading data

In [2]:
import os
import re
import spacy
import numpy as np
import pandas as pd

In [3]:
# Load the spaCy pipeline "en_core_web_md"; it is applied to every text below
# to obtain the part-of-speech tags and named entities used for the features.
nlp = spacy.load("en_core_web_md")

In [4]:
# Root folder of the corpus (relative to the working directory); its entries
# are treated as subfolders of text files further down.
data_path = "../input/USEcorpus/USEcorpus"

In [5]:
# Alphabetically sorted names of the entries in data_path; iterated below to
# build one table per subfolder.
dirs = sorted(os.listdir(data_path))

### Calculating the relative frequency of nouns, verbs, adjectives and adverbs per 10,000 words

In [6]:
def relative_frequencies(doc):
    """Return the relative frequency of nouns, verbs, adjectives and adverbs.

    Every token's coarse part-of-speech tag (``token.pos_``) is inspected.
    The counts for NOUN, VERB, ADJ and ADV are divided by ``len(doc)`` and
    scaled to a rate per 10,000 tokens, rounded to two decimals.

    Args:
        doc: An iterable of tokens exposing ``pos_`` that also supports
            ``len()`` (e.g. a spaCy ``Doc``).

    Returns:
        A tuple ``(noun, verb, adjective, adverb)`` of floats. All four
        values are ``0.0`` when ``doc`` contains no tokens.
    """
    tracked_tags = ("NOUN", "VERB", "ADJ", "ADV")
    counts = dict.fromkeys(tracked_tags, 0)

    # tally only the four tracked tags; every other tag is ignored
    for token in doc:
        tag = token.pos_
        if tag in counts:
            counts[tag] += 1

    total_tokens = len(doc)
    # an empty document would otherwise raise ZeroDivisionError below
    if total_tokens == 0:
        return (0.0, 0.0, 0.0, 0.0)

    # relative frequency per 10,000 tokens, in the fixed NOUN/VERB/ADJ/ADV order
    return tuple(
        round((counts[tag] / total_tokens) * 10000, 2) for tag in tracked_tags
    )

### Counting the total number of unique PER, LOC and ORG entities

In [7]:

def unique_entities(doc):
    """Count the distinct PERSON, LOC and ORG entity texts in a document.

    Entities are read from ``doc.ents``; an entity contributes its ``text``
    to the bucket matching its ``label_``. Entities with any other label
    are ignored. Duplicated texts within a bucket are counted once.

    Args:
        doc: An object exposing ``ents``, an iterable of entities with
            ``label_`` and ``text`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        A tuple ``(person_count, loc_count, org_count)`` of ints.
    """
    wanted_labels = ("PERSON", "LOC", "ORG")
    texts_by_label = {label: [] for label in wanted_labels}

    # collect the surface text of every entity whose label is tracked
    for entity in doc.ents:
        bucket = texts_by_label.get(entity.label_)
        if bucket is not None:
            bucket.append(entity.text)

    # number of unique texts per label, in PERSON/LOC/ORG order
    return tuple(len(np.unique(texts_by_label[label])) for label in wanted_labels)
    


### Loop for creating the 14 tables (one CSV per corpus subfolder)

In [8]:
# Build one feature table per corpus subfolder and save each as a CSV file.
output_dir = os.path.join("..", "output")
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

for directory in dirs:
    subfolder = os.path.join(data_path, directory)

    # skip stray files in the corpus root; os.listdir would raise on them
    if not os.path.isdir(subfolder):
        continue

    # one row per text file in this subfolder
    data = []

    for text_file in sorted(os.listdir(subfolder)):
        filepath = os.path.join(subfolder, text_file)

        # load file
        with open(filepath, encoding="latin-1") as f:
            text = f.read()

        # remove metadata: anything enclosed in angle brackets becomes a space
        text = re.sub(pattern=r'<.*?>', repl=" ", string=text)

        # create spaCy doc
        doc = nlp(text)

        # extract relative frequencies and unique entity counts
        rel_noun, rel_verb, rel_adj, rel_adv = relative_frequencies(doc)
        n_per, n_loc, n_org = unique_entities(doc)

        data.append(
            (text_file, rel_noun, rel_verb, rel_adj, rel_adv, n_per, n_loc, n_org)
        )

    dataframe = pd.DataFrame(
        data,
        columns=[
            "Filename",
            "RelFreq NOUN",
            "RelFreq VERB",
            "RelFreq ADJ",
            "RelFreq ADV",
            "No. Unique PER",
            "No. Unique LOC",
            "No. Unique ORG",
        ],
    )

    # save dataframe as <subfolder name>.csv in the output directory
    outpath = os.path.join(output_dir, f"{directory}.csv")
    dataframe.to_csv(outpath)