In [1]:
# Import spacy
import spacy

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

In [2]:
# Collect the contents and names of every .txt file in the 'eliot_poems' folder.
# `texts[i]` holds the full text of the file named `file_names[i]`.
texts = []
file_names = []

# Iterate through each entry in the folder (order is whatever os.listdir returns)
for _file_name in os.listdir('eliot_poems'):
    # Only plain-text files are poems; skip everything else
    if _file_name.endswith('.txt'):
        # Read inside a context manager so the file handle is always closed,
        # even if decoding fails part-way through
        with open(os.path.join('eliot_poems', _file_name), 'r', encoding='utf-8') as poem_file:
            texts.append(poem_file.read())
        # Record the file name at the same position as its text
        file_names.append(_file_name)

In [5]:
#import pandas
import pandas as pd

# Pair every file name with its text in a column-oriented dictionary
# (keys become the DataFrame column names)
df = dict(Filename=file_names, Text=texts)

In [6]:
# Build the poem DataFrame from the column dictionary
poem_df = pd.DataFrame(data=df)

In [7]:
# Preview the first five rows of the raw poem DataFrame
poem_df.head(n=5)

Unnamed: 0,Filename,Text
0,Macavity_The_Mystery_Cat.txt,Macavity’s a Mystery Cat: he’s called the Hidd...
1,Aunt_Helen.txt,"Miss Helen Slingsby was my maiden aunt,\nAnd l..."
2,The_Dry_Salvages.txt,I do not know much about gods; but I think tha...
3,La_Figlia_Che_Piange.txt,Stand on the highest pavement of the stair—\nL...
4,Whispers_of_Immortality.txt,Webster was much possessed by death\nAnd saw t...


In [8]:
# Collapse every run of whitespace (spaces, tabs, newlines) in each poem into a
# single space, then trim leading/trailing whitespace.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being treated as an (invalid) Python string escape.
poem_df['Text'] = poem_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
poem_df.head()

Unnamed: 0,Filename,Text
0,Macavity_The_Mystery_Cat.txt,Macavity’s a Mystery Cat: he’s called the Hidd...
1,Aunt_Helen.txt,"Miss Helen Slingsby was my maiden aunt, And li..."
2,The_Dry_Salvages.txt,I do not know much about gods; but I think tha...
3,La_Figlia_Che_Piange.txt,Stand on the highest pavement of the stair— Le...
4,Whispers_of_Immortality.txt,Webster was much possessed by death And saw th...


In [9]:
# Read the poem metadata from 'metadata.csv' into its own DataFrame
metadata_df = pd.read_csv(filepath_or_buffer='metadata.csv')
# Preview the first five metadata rows
metadata_df.head(n=5)

Unnamed: 0,Title,Year
0,The_Waste_Land,1922
1,The_Hollow_Men,1925
2,The_Love_Song_of_J.Alfred_Prufrock,1915
3,Burnt_Norton,1936
4,Ash_Wednesday,1930


In [10]:
# Strip the trailing ".txt" extension from each poem's file name.
# The dot is escaped and the pattern is anchored to the end of the string so
# that only a literal ".txt" suffix is removed; an unescaped '.txt' would match
# any character followed by "txt" anywhere in the name.
poem_df['Filename'] = poem_df['Filename'].str.replace(r'\.txt$', '', regex=True)

# Rename the metadata column "Title" to "Filename" so both DataFrames share
# the same key column for merging
metadata_df = metadata_df.rename(columns={"Title": "Filename"})

In [11]:
# Join the metadata with the poem texts on the shared 'Filename' column.
# merge() defaults to an inner join, so only names present in both frames are kept.
final_poem_df = metadata_df.merge(
    right=poem_df,
    on='Filename',
)

In [12]:
# Preview the first five rows of the merged DataFrame
final_poem_df.head(n=5)

Unnamed: 0,Filename,Year,Text
0,The_Waste_Land,1922,I. The Burial of the Dead April is the cruelle...
1,The_Hollow_Men,1925,I We are the hollow men We are the hollow men ...
2,The_Love_Song_of_J.Alfred_Prufrock,1915,"Let us go then, you and I, When the evening is..."
3,Burnt_Norton,1936,I Time present and time past Are both perhaps ...
4,Ash_Wednesday,1930,I Because I do not hope to turn again Because ...


### Creating Doc Objects ###

In [13]:
# Load the spaCy 'en_core_web_sm' pipeline; `nlp` is used by process_text below
nlp = spacy.load(name='en_core_web_sm')

In [14]:
# Helper that pushes a single text through the loaded nlp pipeline
def process_text(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and return its result."""
    doc = nlp(text)
    return doc

In [15]:
# Replace each poem's raw text in the "Text" column with the output of process_text
final_poem_df['Text'] = (
    final_poem_df['Text']
    .apply(process_text)
)

### Tokenization ###

In [16]:
# Helper that extracts the surface text of every token in a doc object
def get_token(doc):
    """Return a list with the ``text`` attribute of each token in ``doc``, in order."""
    token_texts = []
    for token in doc:
        token_texts.append(token.text)
    return token_texts

In [18]:
# Store each doc's token texts in a new "Tokens" column
final_poem_df['Tokens'] = (
    final_poem_df['Text']
    .apply(get_token)
)
# Preview the first five rows with the new column
final_poem_df.head(n=5)

Unnamed: 0,Filename,Year,Text,Tokens
0,The_Waste_Land,1922,"(I., The, Burial, of, the, Dead, April, is, th...","[I., The, Burial, of, the, Dead, April, is, th..."
1,The_Hollow_Men,1925,"(I, We, are, the, hollow, men, We, are, the, h...","[I, We, are, the, hollow, men, We, are, the, h..."
2,The_Love_Song_of_J.Alfred_Prufrock,1915,"(Let, us, go, then, ,, you, and, I, ,, When, t...","[Let, us, go, then, ,, you, and, I, ,, When, t..."
3,Burnt_Norton,1936,"(I, Time, present, and, time, past, Are, both,...","[I, Time, present, and, time, past, Are, both,..."
4,Ash_Wednesday,1930,"(I, Because, I, do, not, hope, to, turn, again...","[I, Because, I, do, not, hope, to, turn, again..."


### Lemmatization ###

In [20]:
# Helper that extracts the lemma of every token in a doc object
def get_lemma(doc):
    """Return a list with the ``lemma_`` attribute of each token in ``doc``, in order."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

# Store each doc's lemmas in a new "Lemmas" column
final_poem_df['Lemmas'] = (
    final_poem_df['Text']
    .apply(get_lemma)
)

### Part of Speech

In [22]:
# Helper that extracts part-of-speech information for every token in a doc object
def get_pos(doc):
    """Return a list of ``(pos_, tag_)`` tuples, one per token in ``doc``, in order.

    ``pos_`` is the coarse-grained and ``tag_`` the fine-grained part-of-speech label.
    """
    pos_pairs = []
    for token in doc:
        pos_pairs.append((token.pos_, token.tag_))
    return pos_pairs

# Store each doc's (coarse, fine) part-of-speech pairs in a new "POS" column
final_poem_df['POS'] = (
    final_poem_df['Text']
    .apply(get_pos)
)

### Download Dataset

In [23]:
# Write the annotated DataFrame to 'eliot_poems.csv' in the working directory
final_poem_df.to_csv(path_or_buf='eliot_poems.csv')