In [1]:
import spacy

# Load spaCy visualizer
from spacy import displacy

# Import os to upload documents and metadata
import os

# Import pandas DataFrame packages
import pandas as pd

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

# Import drive and files to facilitate file uploads
from google.colab import files

In [2]:
uploaded_files = files.upload()

Saving ENG.01.txt to ENG.01.txt
Saving ENG.02.txt to ENG.02.txt
Saving ENG.03.txt to ENG.03.txt


In [3]:
paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')
paper_df.head()

Unnamed: 0,0
ENG.01.txt,b'\n\n\n\n THE ADVENTUR...
ENG.02.txt,b'\n\n\n\n THE ADVENTURE ...
ENG.03.txt,b'\n\n\n\n THE ADVENTURE...


In [4]:
paper_df = paper_df.reset_index()
paper_df.columns = ["Filename", "Text"]

In [5]:
paper_df['Text'] = paper_df['Text'].str.decode('utf-8')
paper_df.head()

Unnamed: 0,Filename,Text
0,ENG.01.txt,\n\n\n\n THE ADVENTURE ...
1,ENG.02.txt,\n\n\n\n THE ADVENTURE OF...
2,ENG.03.txt,\n\n\n\n THE ADVENTURE O...


In [6]:
paper_df['Text'] = paper_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()

In [7]:
metadata = files.upload()

Saving SHERLOCK1-3.csv to SHERLOCK1-3.csv


In [9]:
metadata_df = pd.read_csv('SHERLOCK1-3.csv')

In [10]:
# check whether csv. read successfully or not
print(metadata_df)

  story_id                            story_name  discipline       type
0   ENG.01     THE ADVENTURE OF THE THREE GABLES  literature  detective
1   ENG.02  THE ADVENTURE OF THE THREE GARRIDEBS  literature  detective
2   ENG.03   THE ADVENTURE OF THE THREE STUDENTS  literature  detective


In [11]:
metadata_df = metadata_df.dropna(axis=1, how='all')

In [12]:
paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '')

# Rename column from paper ID to Title
metadata_df.rename(columns={"story_id": "Filename"}, inplace=True)

  paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '')


In [13]:
final_paper_df = metadata_df.merge(paper_df,on='Filename')

In [14]:
# check whether merge successfully or not
print(final_paper_df)

  Filename                            story_name  discipline       type  \
0   ENG.01     THE ADVENTURE OF THE THREE GABLES  literature  detective   
1   ENG.02  THE ADVENTURE OF THE THREE GARRIDEBS  literature  detective   
2   ENG.03   THE ADVENTURE OF THE THREE STUDENTS  literature  detective   

                                                Text  
0  THE ADVENTURE OF THE THREE GABLES Arthur Conan...  
1  THE ADVENTURE OF THE THREE GARRIDEBS Arthur Co...  
2  THE ADVENTURE OF THE THREE STUDENTS Arthur Con...  


In [15]:
nlp = spacy.load('en_core_web_sm')

print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [16]:
sentence = "This is 'an' example? sentence"

doc = nlp(sentence)

for token in doc:
    print(token.text, token.pos_)

This PRON
is AUX
' PUNCT
an DET
' PUNCT
example NOUN
? PUNCT
sentence NOUN


In [17]:
# Adjust the max_length limit
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000

def process_text(text):
    return nlp(text)
final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)

In [18]:
def get_token(doc):
    for token in doc:
        return token.text
final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)

In [19]:
print(final_paper_df)

  Filename                            story_name  discipline       type  \
0   ENG.01     THE ADVENTURE OF THE THREE GABLES  literature  detective   
1   ENG.02  THE ADVENTURE OF THE THREE GARRIDEBS  literature  detective   
2   ENG.03   THE ADVENTURE OF THE THREE STUDENTS  literature  detective   

                                                Text  \
0  THE ADVENTURE OF THE THREE GABLES Arthur Conan...   
1  THE ADVENTURE OF THE THREE GARRIDEBS Arthur Co...   
2  THE ADVENTURE OF THE THREE STUDENTS Arthur Con...   

                                                 Doc Tokens  
0  (THE, ADVENTURE, OF, THE, THREE, GABLES, Arthu...    THE  
1  (THE, ADVENTURE, OF, THE, THREE, GARRIDEBS, Ar...    THE  
2  (THE, ADVENTURE, OF, THE, THREE, STUDENTS, Art...    THE  


In [20]:
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)

In [22]:
# check the frequecny of word "murder" in text tokens and lemmas
print(f'"Murder" appears in the text tokens column ' + str(final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')
print(f'"Murder" appears in the lemmas column ' + str(final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')

"Murder" appears in the text tokens column 0 times.
"Murder" appears in the lemmas column 9 times.


In [23]:
def get_pos(doc):
    return [(token.pos_, token.tag_) for token in doc]

final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)

In [24]:
list(final_paper_df['POS'])

[[('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NUM', 'CD'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PRON', 'PRP'),
  ('AUX', 'VBP'),
  ('PART', 'RB'),
  ('VERB', 'VB'),
  ('SCONJ', 'IN'),
  ('PRON', 'DT'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NNS'),
  ('ADP', 'IN'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('VERB', 'VBD'),
  ('ADV', 'RB'),
  ('ADV', 'RB'),
  ('ADV', 'RB'),
  ('PUNCT', ','),
  ('CCONJ', 'CC'),
  ('ADV', 'RB'),
  ('ADV', 'RB'),
  ('PUNCT', ','),
  ('ADP', 'IN'),
  ('PRON', 'DT'),
  ('PRON', 'WDT'),
  ('PRON', 'PRP'),
  ('VERB', 'VBP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNPS'),
  ('PUNCT', '.'),
  ('PRON', 'PRP'),
  ('AUX', 'VBD'),
  ('PART', 'RB'),
  ('VERB', 'VBN'),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NNS'),
  ('CCONJ', 'CC'),
  ('VERB', 'VBD'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 

In [28]:
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)

list(final_paper_df.loc[[0, 2], 'Proper_Nouns'])

[['GABLES',
  'Arthur',
  'Conan',
  'Doyle',
  'Mr.',
  'Sherlock',
  'Holmes',
  'Three',
  'Gables',
  'Holmes',
  "gen'l'men",
  'Masser',
  'Holmes',
  'Holmes',
  'Masser',
  'Holmes',
  'Masser',
  'Holmes',
  'Holmes',
  'Masser',
  'Holmes',
  'Holmes',
  'Harrow',
  'Holmes',
  'Steve',
  'Dixie',
  'Masser',
  'Holmes',
  'Holmes',
  'Perkins',
  'Holborn',
  'ere',
  'Perkins',
  'Masser',
  'Holmes',
  'trainin',
  'Bull',
  'Ring',
  'Birmingham',
  'Steve',
  'Holmes',
  'Barney',
  'Lord',
  'Masser',
  'Good',
  'Masser',
  'Holmes',
  'Masser',
  'Holmes',
  "gen'l'man",
  'Masser',
  'Holmes',
  'Steve',
  'Mr.',
  'Holmes',
  'Harrow',
  'Holmes',
  'Watson',
  'Spencer',
  'John',
  'Barney',
  'Harrow',
  'Weald',
  'Mrs.',
  'Maberley',
  'Mr.',
  'Sherlock',
  'Holmes',
  'Weald',
  'Station',
  'Mortimer',
  'Maberley',
  'Mary',
  'Maberley',
  'Three',
  'Gables',
  'Harrow',
  'Weald',
  'Holmes',
  'Watson',
  'madam',
  'Holmes',
  'Douglas',
  'Holmes',
 

In [29]:
doc = nlp("This is 'an' example? sentence")

# Print counts of each part of speech in sentence
print(doc.count_by(spacy.attrs.POS))

{95: 1, 87: 1, 97: 3, 90: 1, 92: 2}


In [30]:
num_list = []

# Create new DataFrame for analysis purposes
pos_analysis_df = final_paper_df[['Filename','discipline', 'Doc']]

def get_pos_tags(doc):
    dictionary = {}
    num_pos = doc.count_by(spacy.attrs.POS)
    for k,v in sorted(num_pos.items()):
        dictionary[doc.vocab[k].text] = v
    num_list.append(dictionary)

pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)


In [31]:
pos_counts = pd.DataFrame(num_list)
columns = list(pos_counts.columns)
idx = 0
new_col = pos_analysis_df['discipline']
pos_counts.insert(loc=idx, column='discipline', value=new_col)
pos_counts.head()

Unnamed: 0,discipline,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,VERB,X
0,literature,380,594,325,550,173,543,58,898,49,189,1155,321,1518,188,849,2.0
1,literature,413,643,346,520,202,578,53,1017,66,159,1052,347,1331,185,812,
2,literature,465,653,356,575,181,672,63,1047,77,190,1045,242,1486,237,850,


In [43]:
# drop the column of "Doc" because they already in "Text"
final_paper_df.to_csv('SHERLOCK1-3_with_spaCy_tags.csv')
df = pd.read_csv('SHERLOCK1-3_with_spaCy_tags.csv')
df = df.drop('Doc', axis=1)
df.to_csv('SHERLOCK1-3_with_spaCy_tags.csv', index=False)


In [44]:
files.download('SHERLOCK1-3_with_spaCy_tags.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>