# Assignment 2: How many words in BERT's Vocabulary

Carolina Chavez Ruelas

In [1]:
import re 
import pandas as pd

In [2]:
# Open the file
myfile = open('BERT-vocab.txt', 'r')
dta_bert = myfile.read()
list_bert = dta_bert.split('\n')
df_bert = pd.DataFrame(list_bert, columns=['tokens'])
print("Number of tokens in the vocabulary: ",len(list_bert))


Number of tokens in the vocabulary:  30523


### Not words

In [3]:
# Preview
df_bert.head()

Unnamed: 0,tokens
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]


#### Reserved entries

In [4]:
# Count the reserved entries
df_bert[df_bert['tokens'].str.match('^\[')==True]


Unnamed: 0,tokens
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]
...,...
995,[unused990]
996,[unused991]
997,[unused992]
998,[unused993]


In [5]:
df_bert[df_bert['tokens'].str.match('^\]')==True]

Unnamed: 0,tokens
1033,]


#### Single characters and punctuation

In [6]:
df_bert[df_bert['tokens'].str.match('^.$')==True]

Unnamed: 0,tokens
999,!
1000,""""
1001,#
1002,$
1003,%
...,...
1991,．
1992,／
1993,：
1994,？


#### Subwords

In [7]:
df_bert[df_bert['tokens'].str.match('^##')==True]

Unnamed: 0,tokens
2015,##s
2050,##a
2063,##e
2072,##i
2075,##ing
...,...
30517,##．
30518,##／
30519,##：
30520,##？


#### Numbers

In [8]:
df_bert[df_bert['tokens'].str.match('[0-9]+')==True]

Unnamed: 0,tokens
1014,0
1015,1
1016,2
1017,3
1018,4
...,...
29478,1682
29541,555
29562,334
29567,329


### Remove all cases from the vocabulary

In [9]:
df_bert2 = df_bert[(df_bert['tokens'].str.match('^\[')!=True) &
                   (df_bert['tokens'].str.match('^\]')!=True) &
                   (df_bert['tokens'].str.match('^.$')!=True) &
                   (df_bert['tokens'].str.match('^##')!=True) &
                   (df_bert['tokens'].str.match('[0-9]+')!=True)]
                   

### Morphology

In [10]:
# Replace for each regular verb:
# look, want, use, ask, work, seem, and, call.
df_bert2[df_bert2['tokens'].str.match('call')==True]

Unnamed: 0,tokens
2170,called
2655,call
4214,calling
4455,calls
15229,callum
20072,callie
20587,caller
25668,callahan


### Dropping the regular verbs with the inflections: ‘ed’, ‘ing’, ‘s’, ‘ful’, ‘less’, and ‘ingly’!

### (a) The case of 'ed'

In [11]:
# We can see many regular verbs that finish with 'ed'
df_bert2[df_bert2['tokens'].str.match('[a-z]+ed$')==True]

Unnamed: 0,tokens
2109,used
2142,united
2170,called
2207,released
2209,played
...,...
29561,inflated
29579,bobbed
29580,dismounted
29592,infused


In [12]:
# Words that finish with 'ed'
words_w_ed = df_bert2[df_bert2['tokens'].str.match('[a-z]+ed$')==True]

# Removing the 'ed'
words_noed_check = list(words_w_ed['tokens'].replace('ed$','', regex=True)) 

In [13]:
list_infl_ed = list()
for i in range(len(words_noed_check)):
    words_noed_check[i]
    test0 = words_noed_check[i]
    
    # Check if word without 'ed' exists in the BERT tokens
    status = (df_bert2['tokens'].eq(test0)).any()
    
    # Create a record for the word and the result of the search
    temp_list = [words_noed_check[i], status]
    
    list_infl_ed.append(temp_list)





In [14]:
df_infl_ed = pd.DataFrame(list_infl_ed, columns=['word_no_ed', 'in_df'])
df_infl_ed.head

<bound method NDFrame.head of       word_no_ed  in_df
0             us   True
1           unit   True
2           call   True
3         releas  False
4           play   True
...          ...    ...
1986      inflat  False
1987        bobb  False
1988    dismount  False
1989       infus  False
1990  necessitat  False

[1991 rows x 2 columns]>

In [15]:
# Total count of words without 'ed' found in the BERT tokens
# 'True' if found
df_infl_ed['in_df'].value_counts()

False    1102
True      889
Name: in_df, dtype: int64

### (b) The case of 'ing'

In [16]:
# We can see many regular verbs that finish with 'ing'
df_bert2[df_bert2['tokens'].str.match('[a-z]+ing$')==True]

Unnamed: 0,tokens
2076,during
2108,being
2164,including
2183,going
2206,following
...,...
29479,yearning
29494,accelerating
29508,inspecting
29571,flourishing


In [17]:
# Words that finish with 'ing'
words_w_ing = df_bert2[df_bert2['tokens'].str.match('[a-z]+ing$')==True]

# Removing the 'ing'
words_noing_check = list(words_w_ing['tokens'].replace('ing$','', regex=True)) 

In [18]:
list_infl_ing = list()
for i in range(len(words_noing_check)):
    words_noing_check[i]
    test0 = words_noing_check[i]
    
    # Check if word without 'ing' exists in the BERT tokens
    status = (df_bert2['tokens'].eq(test0)).any()
    
    # Create a record for the word and the result of the search
    temp_list = [words_noing_check[i], status]
    
    list_infl_ing.append(temp_list)

    



In [19]:
df_infl_ing = pd.DataFrame(list_infl_ing, columns=['word_no_ing', 'in_df'])
df_infl_ing.head

<bound method NDFrame.head of      word_no_ing  in_df
0            dur  False
1             be   True
2         includ  False
3             go   True
4         follow   True
...          ...    ...
1330       yearn  False
1331   accelerat  False
1332     inspect   True
1333    flourish   True
1334        pudd  False

[1335 rows x 2 columns]>

In [20]:
# Total count of words without 'ing' found in the BERT tokens
# 'True' if found
df_infl_ing['in_df'].value_counts()

True     741
False    594
Name: in_df, dtype: int64

### (c) The case of 's'

In [21]:
# We can see many regular verbs that finish with 's'
df_bert2[df_bert2['tokens'].str.match('[a-z]+s$')==True]

Unnamed: 0,tokens
2001,was
2003,is
2004,as
2010,his
2023,this
...,...
29584,tbs
29589,inspections
29594,stalks
29597,leases


In [22]:
# Words that finish with 's'
words_w_s = df_bert2[df_bert2['tokens'].str.match('[a-z]+s$')==True]

# Removing the 'ing'
words_nos_check = list(words_w_s['tokens'].replace('s$','', regex=True)) 

In [23]:
list_infl_s = list()
for i in range(len(words_nos_check)):
    words_nos_check[i]
    test0 = words_nos_check[i]
    
    # Check if word without 's' exists in the BERT tokens
    status = (df_bert2['tokens'].eq(test0)).any()
    
    # Create a record for the word and the result of the search
    temp_list = [words_nos_check[i], status]
    
    list_infl_s.append(temp_list)

    


In [24]:
df_infl_s = pd.DataFrame(list_infl_s, columns=['word_no_s', 'in_f'])
df_infl_s.head

<bound method NDFrame.head of        word_no_s   in_f
0             wa   True
1              i  False
2              a  False
3             hi   True
4            thi  False
...          ...    ...
4033          tb   True
4034  inspection   True
4035       stalk   True
4036       lease   True
4037     scandal   True

[4038 rows x 2 columns]>

In [25]:
# Total count of words without 's' found in the BERT tokens
# 'True' if found
df_infl_s['in_f'].value_counts()

True     2975
False    1063
Name: in_f, dtype: int64

### (d) The case of 'd'

This case was reinforced with case (a) 'ed'

In [26]:
# We can see many regular verbs that finish with 'd'
df_bert2[df_bert2['tokens'].str.match('[a-z]+d$')==True]

Unnamed: 0,tokens
1998,and
2018,had
2052,would
2056,said
2071,could
...,...
29579,bobbed
29580,dismounted
29592,infused
29610,thyroid


In [27]:
# Words that finish with 'd'
words_w_d = df_bert2[df_bert2['tokens'].str.match('[a-z]+d$')==True]

# Removing the 'ing'
words_nod_check = list(words_w_d['tokens'].replace('d$','', regex=True)) 

In [28]:
list_infl_d = list()
for i in range(len(words_nod_check)):
    words_nod_check[i]
    test0 = words_nod_check[i]
    
    # Check if word without 'd' exists in the BERT tokens
    status = (df_bert2['tokens'].eq(test0)).any()
    
    # Create a record for the word and the result of the search
    temp_list = [words_nod_check[i], status]
    
    list_infl_d.append(temp_list)


In [29]:
df_infl_d = pd.DataFrame(list_infl_d, columns=['word_no_d', 'in_f'])
df_infl_d.head

<bound method NDFrame.head of         word_no_d   in_f
0              an   True
1              ha   True
2            woul  False
3             sai   True
4            coul  False
...           ...    ...
2591        bobbe  False
2592    dismounte  False
2593       infuse  False
2594       thyroi  False
2595  necessitate  False

[2596 rows x 2 columns]>

In [30]:
# Total count of words without 'd' found in the BERT tokens
# 'True' if found
df_infl_d['in_f'].value_counts()

False    1969
True      627
Name: in_f, dtype: int64