In [1]:
import os
import numpy as np
import pandas as pd
import string
import nltk
from nltk import word_tokenize, FreqDist, SnowballStemmer, pos_tag
from nltk.corpus import stopwords

In [2]:
# Load the tab-separated OSHA accident file into a DataFrame.
# NOTE(review): no `header=` argument is given, so pandas uses the first line of
# the file as column names (they are overwritten in the next cell). If osha.txt
# has no header row, its first record is silently lost -- len(data) is 9999
# below. Confirm against the file; if so, pass header=None.
data = pd.read_csv('osha.txt', delimiter = "\t")

In [3]:
# Give the three columns readable names: record id, short title, full narrative.
data.columns = ['ID', 'Title', 'Report']

In [4]:
# Preview the first five rows to sanity-check the load and the column names.
data.head()

Unnamed: 0,ID,Title,Report
0,202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...
1,200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...
2,200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...
3,201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...
4,202658258,Employee Suffers Burns While Moving Soup,On June 30 2013 Employee #1 was working in ...


In [5]:
# Tokenize every accident title: one token list per row, in row order.
tokens_t = [word_tokenize(title) for title in data.Title]

In [6]:
# Number of accident records loaded.
len(data)

9999

In [7]:
# Titles that tokenize to exactly one token (suspiciously short, e.g. truncated).
abnormal_title = list(filter(lambda title_tokens: len(title_tokens) == 1, tokens_t))
print(abnormal_title)

[['Fall'], ['Burn'], ['Go'], ['Non'], ['Burn'], ['Burn'], ['Explosion'], ['Explosion'], ['Fall'], ['Non'], ['Burn'], ['Burn'], ['O'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['77'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Fall'], ['Fall'], ['Co'], ['70'], ['Explosion'], ['Self'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Cave'], ['Suffocation'], ['Burn'], ['Twenty'], ['Explosion'], ['Fall'], ['Twenty'], ['Forty'], ['Burns'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn'], ['Burn']]


In [8]:
# Token count of the longest report (804 in the saved run).
# default=0 keeps the result at 0 when there are no reports at all.
longest_report = max(
    (len(word_tokenize(report)) for report in data.Report),
    default=0,
)
longest_report


804

In [9]:
# Tokenize every report narrative: one token list per row, in row order.
tokens_report = [word_tokenize(report) for report in data.Report]

In [10]:
# Reports that tokenize to exactly three tokens; print the first five.
abnormal_report = []
for report_tokens in tokens_report:
    if len(report_tokens) == 3:
        abnormal_report.append(report_tokens)
print(abnormal_report[:5])

[['InspectionOpen', 'DateSICEstablishment', 'Name'], ['InspectionOpen', 'DateSICEstablishment', 'Name'], ['InspectionOpen', 'DateSICEstablishment', 'Name'], ['InspectionOpen', 'DateSICEstablishment', 'Name'], ['InspectionOpen', 'DateSICEstablishment', 'Name']]


In [11]:
stop = stopwords.words('english')
snowball = SnowballStemmer('english')

# Set copies used by myPrep for O(1) membership tests. `stop` itself stays a
# list so any other cell that uses it as a list keeps working.
_STOP_SET = frozenset(stop)
_PUNCT_CHARS = frozenset(string.punctuation)

def myPrep(tt):
    """Tokenize, filter and stem one piece of text.

    Pipeline: lowercase -> nltk.word_tokenize -> drop punctuation-only tokens
    -> drop English stopwords -> drop numeric tokens -> Snowball stemming.

    Args:
        tt: the text to preprocess (str).

    Returns:
        list[str]: the surviving tokens, stemmed, in their original order.
    """
    toks = nltk.word_tokenize(tt.lower())
    # Drop tokens consisting solely of punctuation characters. A plain
    # `t in string.punctuation` is a substring test against one long string,
    # which lets multi-character tokens such as "...", "--", "``" and "''"
    # through; checking character by character removes those as well.
    toks = [t for t in toks if not all(ch in _PUNCT_CHARS for ch in t)]
    toks = [t for t in toks if t not in _STOP_SET]
    toks = [t for t in toks if not t.isnumeric()]
    toks = [snowball.stem(t) for t in toks]
    return toks

In [12]:
# Run myPrep over every title (one cleaned token list per title).
title_clean = list(map(myPrep, data.Title))

# Concatenate the per-title lists into one long token list for FreqDist;
# document boundaries are lost at this point.
title_flat = []
for title_tokens in title_clean:
    title_flat.extend(title_tokens)

fd_title = FreqDist(title_flat)

In [13]:
# Ten most frequent stems across all titles, as (stem, count) pairs.
fd_title.most_common(10)

[('employe', 7834),
 ('kill', 2139),
 ('fall', 2114),
 ('injur', 1724),
 ('struck', 1151),
 ('die', 1093),
 ('burn', 981),
 ("'s", 963),
 ('amput', 843),
 ('worker', 829)]

In [14]:
# Run myPrep over every report narrative.
report_clean = []
for report_text in data.Report:
    report_clean.append(myPrep(report_text))

# Flatten the per-report token lists into a single list for FreqDist;
# document boundaries are lost at this point.
report_flat = [stem for report_tokens in report_clean for stem in report_tokens]

fd_report = FreqDist(report_flat)

In [15]:
# Ten most frequent stems across all report narratives, as (stem, count) pairs.
fd_report.most_common(10)

[('employe', 38339),
 ('hospit', 6640),
 ('approxim', 6114),
 ('work', 5721),
 ('cowork', 4262),
 ("'s", 4237),
 ('oper', 3974),
 ('use', 3624),
 ('truck', 3180),
 ('left', 2891)]

In [16]:
# pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-macosx_10_9_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 3.9 MB/s 
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 3.9 MB/s 
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.0.1 smart-open-5.1.0
Note: you may need to restart the kernel to use updated packages.


<font size="3"> 

<b>1.	What are the major types of accidents reflected in the reports?
<br>
['burn_accident' 'fell_accident' 'line_accident' 'machine_accident'
 'truck_accident']
<br>
[2799 2422 1681 1468 1629]
<br>
<br>
2.	Which type of accident has the largest number of occurrences?
<br>
'burn_accident'</b>

</font>

In [7]:
# Domain words treated as uninformative for topic modelling.
# NOTE(review): 'hospitalized' is listed twice, and 'a.m.' is excluded while
# 'p.m.' is not (it survives in the tokens printed below) -- confirm intent.
irrelavant_word =  ['approximately','left','right','hospital','hospitalized', 'coworker', 'a.m.', 'hospitalized', 'employer','work', 'working', 'day']
mystopwords=stopwords.words("english") + irrelavant_word
# frozenset copy for O(1) membership tests inside pre_process; `mystopwords`
# stays a list for any other cell that uses it.
_mystopword_set = frozenset(mystopwords)
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    """Turn one report into a list of content lemmas.

    Each token from nltk.word_tokenize is lowercased and lemmatized with the
    WordNet lemmatizer (default part of speech). A lemma is kept only if it is
    not in `mystopwords` and has at least three characters.

    Args:
        text: the report text (str).

    Returns:
        list[str]: the kept lemmas, in their original order.
    """
    kept = []
    for raw_token in nltk.word_tokenize(text):
        lemma = WNlemma.lemmatize(raw_token.lower())
        if len(lemma) >= 3 and lemma not in _mystopword_set:
            kept.append(lemma)
    return kept


text = data.Report
toks = text.apply(pre_process)

print(toks)

0       [august, 2013, employee, flatbed, trailer, wor...
1       [august, 2013, employee, templar, inc., constr...
2       [august, 2013, employee, lee, iron, metal, com...
3       [july, 2013, employee, vacuum, pump, truck, dr...
4       [june, 2013, employee, food, taqueria, superma...
                              ...                        
9994    [june, 2004, employee, remotely, controlling, ...
9995    [9:00, p.m., april, 2004, employee, operating,...
9996    [employee, operating, remote, controlled, rail...
9997    [september, 2002, employee, riding, fireman, e...
9998    [6:00, p.m., march, 2001, employee, contract, ...
Name: Report, Length: 9999, dtype: object


In [8]:
# Use dictionary (built from corpus) to prepare a DTM (using frequency)
import logging
#pip install gensim
import gensim 
from gensim import corpora

# Show INFO-level log records as "timestamp : level : message"; this is what
# produces the progress lines in the outputs of the cells below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# pip install python-Levenshtein

In [150]:
# Peek at the preprocessed token lists for rows 1-4.
toks[1:5]

1    [august, 2013, employee, templar, inc., constr...
2    [august, 2013, employee, lee, iron, metal, com...
3    [july, 2013, employee, vacuum, pump, truck, dr...
4    [june, 2013, employee, food, taqueria, superma...
Name: Report, dtype: object

In [151]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# preprocessed reports. Rare / very common tokens are filtered in the next cell.
dictionary = corpora.Dictionary(toks)
print(dictionary)

2021-08-12 21:08:15,251 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-08-12 21:08:15,958 : INFO : built Dictionary(24429 unique tokens: ['2013', 'abdomen', 'august', 'caused', 'death']...) from 9999 documents (total 590249 corpus positions)
2021-08-12 21:08:15,959 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(24429 unique tokens: ['2013', 'abdomen', 'august', 'caused', 'death']...) from 9999 documents (total 590249 corpus positions)", 'datetime': '2021-08-12T21:08:15.958950', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


Dictionary(24429 unique tokens: ['2013', 'abdomen', 'august', 'caused', 'death']...)


In [152]:

# Filter off tokens that occur in fewer than 2 documents or in more than 80% of
# documents. This modifies `dictionary` in place (see "resulting dictionary" in
# the log), so rebuild the dictionary first if different thresholds are tried.
dictionary.filter_extremes(no_below=2, no_above=0.8)

2021-08-12 21:08:18,376 : INFO : discarding 12938 tokens: [('employee', 9576), ('templar', 1), ('travel/through', 1), ('948', 1), ('co-axial', 1), ('hump', 1), ('impeded', 1), ('taqueria', 1), ('placer', 1), ('19-millimeter-thick', 1)]...
2021-08-12 21:08:18,378 : INFO : keeping 11491 tokens which were in no less than 2 and no more than 7999 (=80.0%) documents
2021-08-12 21:08:18,406 : INFO : resulting dictionary: Dictionary(11491 unique tokens: ['2013', 'abdomen', 'august', 'caused', 'death']...)


In [49]:
# Show the dictionary after filtering.
# NOTE(review): the saved output (11502 tokens, execution count 49) predates the
# filter_extremes run above, which logs 11491 tokens -- re-run to refresh it.
print(dictionary)

Dictionary(11502 unique tokens: ['2013', 'abdomen', 'august', 'caused', 'death']...)


In [153]:
# Bag-of-words corpus: for each report, a list of (token_id, count) pairs.
# Together the rows form a sparse document-term matrix.
dtm = []
for doc_tokens in toks:
    dtm.append(dictionary.doc2bow(doc_tokens))
print(dtm[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 3), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 3), (17, 1), (18, 1)], [(0, 1), (2, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 3), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 2), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 2), (63, 1)]]


In [154]:
# Fit a 5-topic LDA model on the bag-of-words corpus, with a fixed seed.
lda = gensim.models.ldamodel.LdaModel(
    corpus=dtm,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    chunksize=128,
    random_state=10,
)

# The 10 here is the maximum number of topics to show; words per topic are left
# at the method's default. If topics look mixed, try a larger num_topics above.
lda.show_topics(num_topics=10)

2021-08-12 21:08:38,932 : INFO : using symmetric alpha at 0.2
2021-08-12 21:08:38,942 : INFO : using symmetric eta at 0.2
2021-08-12 21:08:38,976 : INFO : using serial LDA version on this node
2021-08-12 21:08:39,029 : INFO : running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 9999 documents, updating model once every 128 documents, evaluating perplexity every 1280 documents, iterating 50x with a convergence threshold of 0.001000
2021-08-12 21:08:39,038 : INFO : PROGRESS: pass 0, at document #128/9999
2021-08-12 21:08:39,217 : INFO : merging changes from 128 documents into a model of 9999 documents
2021-08-12 21:08:39,253 : INFO : topic #0 (0.200): 0.011*"abrasive" + 0.009*"wheel" + 0.008*"2013" + 0.007*"machine" + 0.007*"area" + 0.007*"valve" + 0.006*"piece" + 0.006*"hand" + 0.006*"leg" + 0.006*"sustained"
2021-08-12 21:08:39,256 : INFO : topic #1 (0.200): 0.009*"abrasive" + 0.008*"injury" + 0.007*"sustained" + 0.007*"struck" + 0.007*"vehicle" + 0

[(0,
  '0.038*"machine" + 0.032*"hand" + 0.026*"finger" + 0.017*"press" + 0.012*"caught" + 0.009*"amputated" + 0.009*"number" + 0.009*"metal" + 0.008*"operating" + 0.007*"conveyor"'),
 (1,
  '0.026*"line" + 0.015*"train" + 0.014*"locomotive" + 0.013*"power" + 0.011*"ground" + 0.011*"tree" + 0.011*"lift" + 0.011*"bucket" + 0.010*"crane" + 0.010*"crew"'),
 (2,
  '0.020*"fell" + 0.016*"foot" + 0.012*"ladder" + 0.011*"floor" + 0.010*"transported" + 0.010*"injury" + 0.009*"sustained" + 0.009*"fall" + 0.008*"fracture" + 0.007*"roof"'),
 (3,
  '0.025*"truck" + 0.017*"car" + 0.010*"door" + 0.009*"driver" + 0.009*"forklift" + 0.009*"side" + 0.008*"trailer" + 0.007*"struck" + 0.007*"front" + 0.007*"number"'),
 (4,
  '0.012*"burn" + 0.011*"water" + 0.010*"tank" + 0.010*"fire" + 0.008*"hot" + 0.006*"area" + 0.006*"gas" + 0.006*"building" + 0.005*"room" + 0.005*"degree"')]

In [158]:
# Evaluate the coherence of the fitted LDA model.
#   u_mass: prefer the model whose score is closer to 0
#   c_v:    range [0, 1], prefer the larger value
# Coherence is only a guide -- do not rely on it alone.
from gensim.models.coherencemodel import CoherenceModel

cm_umass = CoherenceModel(model=lda, corpus=dtm, dictionary=dictionary, coherence='u_mass')
cm_cv = CoherenceModel(model=lda, texts=toks, dictionary=dictionary, coherence='c_v')

lda_umass = cm_umass.get_coherence()
lda_cv = cm_cv.get_coherence()

print(lda_umass, lda_cv, sep="\n")

2021-08-12 21:40:29,581 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-08-12 21:40:29,599 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2021-08-12 21:40:29,616 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2021-08-12 21:40:29,637 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2021-08-12 21:40:29,654 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2021-08-12 21:40:29,675 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2021-08-12 21:40:29,733 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2021-08-12 21:40:29,757 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2021-08-12 21:40:29,779 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2021-08-12 21:40:29,827 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2021-08-12 21:40:41,261 : INFO : 3 accumulators ret

-2.2677975351347284
0.4774837353146696


[(0,
  '0.038*"machine" + 0.032*"hand" + 0.026*"finger" + 0.017*"press" + 0.012*"caught" + 0.009*"amputated" + 0.009*"number" + 0.009*"metal" + 0.008*"operating" + 0.007*"conveyor"'),
  <br>

 (1,
  '0.026*"line" + 0.015*"train" + 0.014*"locomotive" + 0.013*"power" + 0.011*"ground" + 0.011*"tree" + 0.011*"lift" + 0.011*"bucket" + 0.010*"crane" + 0.010*"crew"'),
  <br>

 (2,
  '0.020*"fell" + 0.016*"foot" + 0.012*"ladder" + 0.011*"floor" + 0.010*"transported" + 0.010*"injury" + 0.009*"sustained" + 0.009*"fall" + 0.008*"fracture" + 0.007*"roof"'),
  <br>

 (3,
  '0.025*"truck" + 0.017*"car" + 0.010*"door" + 0.009*"driver" + 0.009*"forklift" + 0.009*"side" + 0.008*"trailer" + 0.007*"struck" + 0.007*"front" + 0.007*"number"'),
  <br>

 (4,
  '0.012*"burn" + 0.011*"water" + 0.010*"tank" + 0.010*"fire" + 0.008*"hot" + 0.006*"area" + 0.006*"gas" + 0.006*"building" + 0.005*"room" + 0.005*"degree"')]

In [159]:
# Hand-assigned label for each LDA topic id, chosen from the topics' top words.
# NOTE(review): this name shadows the builtin `dict` for the rest of the session;
# renaming it requires updating the later cell that indexes it.
dict = {0: 'machine_accident', 1: 'line_accident', 2: 'fell_accident', 3: 'truck_accident', 4:'burn_accident'}

In [160]:
# Get the topic distribution of every document in the corpus. Each entry is a
# list of (topic_id, probability) pairs, as the next cell's output shows.
doc_topics = lda.get_document_topics(dtm)

In [161]:
from operator import itemgetter
# For the first 5 documents print the full topic distribution (soft
# classification) and then the id of the most probable topic (hard classification).
# NOTE(review): doc_topics[i] is evaluated twice per iteration; if each lookup
# re-runs inference, the two printed lines may come from slightly different
# evaluations -- TODO confirm.
for i in range(0, 5):
    print(doc_topics[i])
    print(max(doc_topics[i], key=itemgetter(1))[0]) 

[(2, 0.5784922), (3, 0.31404883), (4, 0.09117104)]
2
[(1, 0.103120945), (2, 0.25486925), (3, 0.6348356)]
3
[(0, 0.012362811), (1, 0.12007058), (2, 0.32655978), (3, 0.38380855), (4, 0.15719824)]
3
[(0, 0.028271794), (1, 0.0471907), (3, 0.23253421), (4, 0.6904063)]
4
[(0, 0.21200877), (2, 0.13087101), (3, 0.039739784), (4, 0.6117228)]
4


In [162]:
# Hard assignment: keep, for every document, the id of its highest-probability topic.
top_topic = []
for topic_dist in doc_topics:
    best_pair = max(topic_dist, key=itemgetter(1))
    top_topic.append(best_pair[0])
print(top_topic[:20])

[2, 3, 3, 4, 4, 0, 3, 3, 2, 3, 2, 4, 0, 1, 4, 1, 3, 2, 2, 2]


In [163]:
# Translate each winning topic id into its label. `dict` here is the
# id -> label mapping defined in an earlier cell, not the builtin type.
topics_perDoc = []
for topic_id in top_topic:
    topics_perDoc.append(dict[topic_id])
print(topics_perDoc[:20])

['fell_accident', 'truck_accident', 'truck_accident', 'burn_accident', 'burn_accident', 'machine_accident', 'truck_accident', 'truck_accident', 'fell_accident', 'truck_accident', 'fell_accident', 'burn_accident', 'machine_accident', 'line_accident', 'burn_accident', 'line_accident', 'truck_accident', 'fell_accident', 'fell_accident', 'fell_accident']


In [164]:
# How many docs fall under each topic label? np.unique returns the labels in
# sorted order together with their counts.
labels, counts = np.unique(topics_perDoc, return_counts=True)
print(labels, counts, sep="\n")

['burn_accident' 'fell_accident' 'line_accident' 'machine_accident'
 'truck_accident']
[2799 2422 1681 1468 1629]


In [None]:
###########################Save and load pre-trained model
from gensim.test.utils import datapath  # kept in case other cells still call datapath
# Save model to disk.
# datapath() resolves to a file inside gensim's installed test-data directory,
# which is no place for user artifacts and may be read-only. Save beside the
# notebook instead; `temp_file` is still the path the load cell expects.
temp_file = os.path.abspath("LDA_model")
lda.save(temp_file)

In [None]:
# Load a previously saved model from the path stored in `temp_file`
# (set by the save cell above, which must have been run in this session).
lda = gensim.models.ldamodel.LdaModel.load(temp_file)

<font size="3"> 
<b> 4. The hand is the most frequently injured body part, with a document frequency of 1712. </b>
</font>

In [9]:
# Fetch the NLTK resources used by this notebook: tokenizer model, POS tagger,
# universal tagset mapping and WordNet. Already-present packages are reported
# as up to date in the output.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/bailan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bailan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/bailan/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/bailan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# POS-tag every report with the simplified "universal" tagset
# (https://universaldependencies.org/u/pos/all.html). The tagger takes tokens,
# so each report is tokenized first.
# Despite its name, lemma_list holds one list of (token, tag) pairs per report.
lemma_list = [
    pos_tag(word_tokenize(report), tagset='universal')
    for report in data.Report
]

In [88]:
# Tagged tokens of the first report, as (token, universal POS tag) pairs.
lemma_list[0]

[('On', 'ADP'),
 ('August', 'NOUN'),
 ('30', 'NUM'),
 ('2013', 'NUM'),
 ('Employee', 'NOUN'),
 ('#', '.'),
 ('1', 'NUM'),
 ('was', 'VERB'),
 ('working', 'VERB'),
 ('from', 'ADP'),
 ('a', 'DET'),
 ('flatbed', 'ADJ'),
 ('trailer', 'NOUN'),
 ('.', '.'),
 ('As', 'ADP'),
 ('he', 'PRON'),
 ('worked', 'VERB'),
 ('he', 'PRON'),
 ('fell', 'VERB'),
 ('from', 'ADP'),
 ('the', 'DET'),
 ('flatbed', 'ADJ'),
 ('trailer', 'NOUN'),
 ('onto', 'ADP'),
 ('the', 'DET'),
 ('ground', 'NOUN'),
 ('striking', 'VERB'),
 ('his', 'PRON'),
 ('abdomen', 'NOUN'),
 ('.', '.'),
 ('The', 'DET'),
 ('fall', 'NOUN'),
 ('height', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('flatbed', 'ADJ'),
 ('trailer', 'NOUN'),
 ('was', 'VERB'),
 ('57', 'NUM'),
 ('inches', 'NOUN'),
 ('.', '.'),
 ('Employee', 'NOUN'),
 ('#', '.'),
 ('1', 'NUM'),
 ('sustained', 'VERB'),
 ('unspecified', 'ADJ'),
 ('injuries', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('fall', 'NOUN'),
 ('that', 'DET'),
 ('later', 'ADV'),
 ('on', 'ADP'),
 ('caused', 'VERB'),

In [86]:
# First element of a pair is the token text.
lemma_list[0][0][0]

'On'

In [11]:
# Second element of a pair is the POS tag.
lemma_list[0][0][1]

'ADP'

In [12]:
# POS information can be very helpful.
# For example, the wordnet lemmatizer works properly with the pos given
# (the commented call below shows how a POS is passed).
wnl = nltk.WordNetLemmatizer()
# wnl.lemmatize('born', pos = 'v')

In [13]:
# For every report keep only the tokens tagged NOUN, lowercased and lemmatized.
# Input: lemma_list, one list of (token, tag) tuples per report.
# Output: lemmas_list_noun, one list of noun lemmas per report (same order).
lemmas_list_noun = [
    [wnl.lemmatize(token.lower()) for token, tag in tagged_report if tag == 'NOUN']
    for tagged_report in lemma_list
]

In [14]:
# 2. Document frequency: collapse each report's noun lemmas into a set so a
# lemma is counted at most once per report when fed to FreqDist.
lemmas_list_set = [set(noun_lemmas) for noun_lemmas in lemmas_list_noun]

In [105]:
# One set per report, so this should equal the number of records.
len(lemmas_list_set)

9999

In [15]:
# Concatenate the per-report sets into one list. Each lemma appears once per
# report, so the FreqDist counts are document frequencies.
lemmas_flat = []
for noun_set in lemmas_list_set:
    lemmas_flat.extend(noun_set)

fd_lemmas = FreqDist(lemmas_flat)

In [17]:
# Thirty noun lemmas with the highest document frequency, as (lemma, n_reports).
fd_lemmas.most_common(30)

[('employee', 9575),
 ('hospital', 2639),
 ('injury', 2042),
 ('p.m.', 1803),
 ('hand', 1712),
 ('foot', 1624),
 ('a.m.', 1578),
 ('area', 1566),
 ('day', 1566),
 ('coworker', 1542),
 ('ground', 1379),
 ('side', 1274),
 ('center', 1264),
 ('line', 1218),
 ('truck', 1193),
 ('accident', 1161),
 ('burn', 1153),
 ('time', 1116),
 ('head', 1108),
 ('work', 1103),
 ('floor', 1102),
 ('emergency', 1087),
 ('machine', 1079),
 ('employer', 1068),
 ('ft', 1052),
 ('company', 1036),
 ('finger', 1029),
 ('service', 1003),
 ('treatment', 980),
 ('operator', 980)]

In [18]:

#3.	Get the list of body terms from wordnet
# To look up a word
from nltk.corpus import wordnet as wn

In [109]:
# All WordNet synsets (any part of speech) for the word 'body'.
body_synsets = wn.synsets('body')
print(body_synsets)

[Synset('body.n.01'), Synset('body.n.02'), Synset('body.n.03'), Synset('body.n.04'), Synset('torso.n.01'), Synset('body.n.06'), Synset('consistency.n.01'), Synset('body.n.08'), Synset('body.n.09'), Synset('soundbox.n.01'), Synset('body.n.11'), Synset('body.v.01')]


In [111]:
# Look up with specified POS - NOUN, VERB, ADJ, ADV.
# Restricting to nouns drops the verb sense body.v.01 seen in the previous output.
print (wn.synsets('body', pos = wn.NOUN))

[Synset('body.n.01'), Synset('body.n.02'), Synset('body.n.03'), Synset('body.n.04'), Synset('torso.n.01'), Synset('body.n.06'), Synset('consistency.n.01'), Synset('body.n.08'), Synset('body.n.09'), Synset('soundbox.n.01'), Synset('body.n.11')]


In [119]:
# Take the first noun synset of 'body_part' as the root concept for body-part
# terms, and show its definition and usage examples.
ss = wn.synsets('body_part', pos = wn.NOUN)[0]
print(ss.definition())
print(ss.examples())

any part of an organism such as an organ or extremity
[]


In [19]:
# Take the first noun synset of 'occupation' as the root concept for
# occupation terms, and show its definition and usage examples.
oc = wn.synsets('occupation', pos = wn.NOUN)[0]
print(oc.definition())
print(oc.examples())

the principal activity in your life that you do to earn money
["he's not in my line of business"]


In [20]:
# Direct hyponyms (more specific concepts) of the occupation synset.
oc.hyponyms()

[Synset('accountancy.n.01'),
 Synset('appointment.n.05'),
 Synset('career.n.01'),
 Synset('catering.n.01'),
 Synset('confectionery.n.03'),
 Synset('employment.n.02'),
 Synset('farming.n.02'),
 Synset('game.n.10'),
 Synset('metier.n.02'),
 Synset('photography.n.03'),
 Synset('position.n.06'),
 Synset('profession.n.02'),
 Synset('sport.n.02'),
 Synset('trade.n.02'),
 Synset('treadmill.n.03')]

In [120]:
# Direct hyponyms (more specific concepts) of the body-part synset.
ss.hyponyms()

[Synset('abdomen.n.01'),
 Synset('adnexa.n.01'),
 Synset('ambulacrum.n.01'),
 Synset('ampulla.n.01'),
 Synset('apparatus.n.02'),
 Synset('area.n.03'),
 Synset('back.n.01'),
 Synset('buttock.n.01'),
 Synset('buttocks.n.01'),
 Synset('cannon.n.05'),
 Synset('dilator.n.01'),
 Synset('dock.n.06'),
 Synset('dorsum.n.02'),
 Synset('energid.n.01'),
 Synset('external_body_part.n.01'),
 Synset('feature.n.02'),
 Synset('flank.n.04'),
 Synset('fornix.n.01'),
 Synset('gaskin.n.01'),
 Synset('groove.n.03'),
 Synset('haunch.n.01'),
 Synset('hindquarters.n.02'),
 Synset('hip.n.01'),
 Synset('horseback.n.01'),
 Synset('joint.n.01'),
 Synset('lobe.n.01'),
 Synset('loin.n.02'),
 Synset('loins.n.02'),
 Synset('mentum.n.03'),
 Synset('organ.n.01'),
 Synset('partition.n.03'),
 Synset('process.n.05'),
 Synset('rectum.n.01'),
 Synset('rudiment.n.02'),
 Synset('saddle.n.06'),
 Synset('shank.n.02'),
 Synset('shin.n.01'),
 Synset('shoulder.n.01'),
 Synset('small.n.01'),
 Synset('structure.n.04'),
 Synset('stump

In [121]:
# Lemma names (synonymous word forms) of the first body-part hyponym.
ss.hyponyms()[0].lemma_names()

['abdomen', 'venter', 'stomach', 'belly']

In [23]:
# Lemma names (synonymous word forms) of the first occupation hyponym.
oc.hyponyms()[0].lemma_names()

['accountancy', 'accounting']

In [122]:
# Collect the lemma names of every synset below 'body_part' (children,
# grandchildren, ...) by following the hyponym relation transitively.
hyps = list({
    lemma_name
    for synset in ss.closure(lambda node: node.hyponyms())
    for lemma_name in synset.lemma_names()
})
sorted(hyps)

  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):


['ANS',
 'Achilles_tendon',
 "Adam's_apple",
 "Bartholin's_gland",
 "Bowman's_capsule",
 "Broca's_area",
 "Broca's_center",
 "Broca's_convolution",
 "Broca's_gyrus",
 "Brodmann's_area",
 "Brodmann's_area_17",
 'CNS',
 "Cowper's_gland",
 'Eustachian_tube',
 'Fallopian_tube',
 'GI_tract',
 'Golgi_apparatus',
 'Golgi_body',
 'Golgi_complex',
 'Graafian_follicle',
 'Haversian_canal',
 "Luschka's_tonsil",
 'MPS',
 'Meibomian_gland',
 "Montgomery's_tubercle",
 'OD',
 'OS',
 "Peter's_gland",
 "Peyer's_patch",
 "Purkinje's_system",
 "Purkinje's_tissue",
 'Purkinje_fiber',
 'Purkinje_network',
 'RES',
 "Rolando's_area",
 "Rolando's_fissure",
 'Roman_nose',
 'SA_node',
 "Schlemm's_canal",
 'Sylvian_aqueduct',
 'Sylvian_fissure',
 "Wernicke's_area",
 "Wernicke's_center",
 'Wormian_bone',
 'ab',
 'abdomen',
 'abdominal',
 'abdominal_aorta',
 'abdominal_cavity',
 'abdominal_external_oblique_muscle',
 'abdominal_muscle',
 'abdominal_nerve_plexus',
 'abducens',
 'abducens_muscle',
 'abducens_nerve',


In [31]:
# Collect the lemma names of every synset below 'occupation' (children,
# grandchildren, ...) by following the hyponym relation transitively.
oc_hyps = list({
    lemma_name
    for synset in oc.closure(lambda node: node.hyponyms())
    for lemma_name in synset.lemma_names()
})
sorted(oc_hyps)

['Agriculture_Secretary',
 'Attorney_General',
 'Attorney_General_of_the_United_States',
 'Chief_Executive',
 'Commerce_Secretary',
 'Defense_Secretary',
 'Education_Secretary',
 'Energy_Secretary',
 'FIFO',
 'French_lesson',
 'German_lesson',
 'Hebrew_lesson',
 'Interior_Secretary',
 'LIFO',
 'Labor_Secretary',
 'Navy_Secretary',
 'President',
 'President_of_the_United_States',
 'Secretary_of_Agriculture',
 'Secretary_of_Commerce',
 'Secretary_of_Commerce_and_Labor',
 'Secretary_of_Defense',
 'Secretary_of_Education',
 'Secretary_of_Energy',
 'Secretary_of_Health_Education_and_Welfare',
 'Secretary_of_Health_and_Human_Services',
 'Secretary_of_Housing_and_Urban_Development',
 'Secretary_of_Labor',
 'Secretary_of_State',
 'Secretary_of_Transportation',
 'Secretary_of_Veterans_Affairs',
 'Secretary_of_War',
 'Secretary_of_the_Interior',
 'Secretary_of_the_Navy',
 'Secretary_of_the_Treasury',
 'Transportation_Secretary',
 'Treasury_Secretary',
 'War_Secretary',
 'academicianship',
 'acco

In [25]:
# Number of distinct lemma names collected in `hyps`.
len(hyps)

382

In [26]:
# Five noun lemmas with the highest document frequency.
fd_lemmas.most_common(5)

[('employee', 9575),
 ('hospital', 2639),
 ('injury', 2042),
 ('p.m.', 1803),
 ('hand', 1712)]

In [146]:
# Document frequency of three candidate body-part lemmas.
print(fd_lemmas['hand'])
print(fd_lemmas['foot'])
print(fd_lemmas['area'])

1712
1624
1566


In [136]:
# Set form of the body-part lemma names, for fast membership tests.
body_part_set = set(hyps)

In [32]:
# Set form of the occupation lemma names, for fast membership tests.
occupation_set = set(oc_hyps)

In [147]:
# Print the first lemma yielded by fd_lemmas that is also a WordNet body-part
# term. In the saved run this is 'hand' (document frequency 1712).
# NOTE(review): assumes iterating fd_lemmas yields lemmas in descending
# frequency, which the saved outputs are consistent with -- confirm.
top_body_part = next((lemma for lemma in fd_lemmas if lemma in body_part_set), None)
if top_body_part is not None:
    print(top_body_part)


hand


In [30]:
# Print the first nine lemmas yielded by fd_lemmas that are also WordNet
# occupation terms. The original note singles out 'engineering'.
# NOTE(review): several hits in the saved output (work, service, position,
# place, ...) are generic word senses rather than occupations -- read with care.
target = 0
for lemma in fd_lemmas:
    if lemma not in occupation_set:
        continue
    print(lemma)
    target += 1
    if target >= 9:
        break
    


work
service
position
place
office
post
engineering
instruction
roofing
