In [10]:
import re
import pickle
import os
import string
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [11]:
#Creating a list of document ids
doc_no=[]
#Creating a list of words in the documents
words=[]
#Creating a list of words in the document zones i.e headings
zone_words=[]

#Stores the document id and it's corresponding zone i.e heading
zone={}

#Stores the document id and corresponding tokenised words of the document
tokenised={}

#Stores the document id and corresponding tokenised words of the document zone
zone_tokenised={}


#Opening the corpus and reading the file
f=open('./Text_corpus/wiki_00', 'r' , encoding='utf8')
content = f.read()
content=str(content)

#Removing <a>...</a> tags
pattern = re.compile("<(/)?a[^>]*>")
content_new = re.sub(pattern,"", content)

#Creating a folder to hold the seperated documents
if not os.path.exists("./Documents") :
    os.mkdir ("./Documents")

#Creating a soup using a html parser and iterating through each 'doc'
soup=BeautifulSoup(content_new,'html.parser')
for doc in soup.findAll('doc'):
    #Opening a file to write the contents of the doc
    o=open('./Documents/'+str(doc['id'])+".txt",'w', encoding='utf8')

    #Adding the document id to doc_no and extracting the text in that doc
    doc_no=doc_no+[(int(doc['id']))]
    text=doc.get_text()

    #Writing the text and closing the file
    o.write(doc.get_text())
    o.close()

    #Storing the heading of the document in the dictionary called 'zone'
    zone[int(doc['id'])]=str(text).partition('\n\n')[0][1:]

    #Extracting the heading of the document
    zone_text=zone[int(doc['id'])]

    #Making all the text lowercase
    text=text.lower()
    zone_text=zone_text.lower()

    #Replaces punctuations with spaces
    text=text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    zone_text=zone_text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    #Removes weird punctuations. Add a sapce and symbol you want to replace respectively
    text=text.translate(str.maketrans("‘’’–——−",'       '))
    zone_text=zone_text.translate(str.maketrans("‘’’–——−",'       '))

    #Tokeinzing word from the doc and adding it to 'words' dictionary 
    words=words+word_tokenize(text)
    zone_words=zone_words+word_tokenize(zone_text)

    #Adding the token stream to a dictionary indexed by doc_id
    tokenised[int(doc['id'])]=word_tokenize(text)
    zone_tokenised[int(doc['id'])]=word_tokenize(zone_text)
    
    #Eliminating the duplicate words
    words=list(set(words))
    zone_words=list(set(zone_words))

    #Printing progress of processing documents
    print("Progress: Document_id = "+doc['id']+" : "+ zone[int(doc['id'])])
f.close()

zone_file = open('./Storage/zone.pkl', 'wb') 
pickle.dump(zone, zone_file) 
zone_file.close()

Progress: Document_id = 12 : Anarchism
Progress: Document_id = 25 : Autism
Progress: Document_id = 39 : Albedo
Progress: Document_id = 290 : A
Progress: Document_id = 303 : Alabama
Progress: Document_id = 305 : Achilles
Progress: Document_id = 307 : Abraham Lincoln
Progress: Document_id = 308 : Aristotle
Progress: Document_id = 309 : An American in Paris
Progress: Document_id = 316 : Academy Award for Best Production Design
Progress: Document_id = 324 : Academy Awards
Progress: Document_id = 330 : Actrius
Progress: Document_id = 332 : Animalia (book)
Progress: Document_id = 334 : International Atomic Time
Progress: Document_id = 336 : Altruism
Progress: Document_id = 339 : Ayn Rand
Progress: Document_id = 340 : Alain Connes
Progress: Document_id = 344 : Allan Dwan
Progress: Document_id = 358 : Algeria
Progress: Document_id = 359 : List of Atlas Shrugged characters
Progress: Document_id = 569 : Anthropology
Progress: Document_id = 572 : Agricultural science
Progress: Document_id = 573 :

In [12]:
#Checking the zone_tokeinsed file
print(zone_tokenised)

{12: ['anarchism'], 25: ['autism'], 39: ['albedo'], 290: ['a'], 303: ['alabama'], 305: ['achilles'], 307: ['abraham', 'lincoln'], 308: ['aristotle'], 309: ['an', 'american', 'in', 'paris'], 316: ['academy', 'award', 'for', 'best', 'production', 'design'], 324: ['academy', 'awards'], 330: ['actrius'], 332: ['animalia', 'book'], 334: ['international', 'atomic', 'time'], 336: ['altruism'], 339: ['ayn', 'rand'], 340: ['alain', 'connes'], 344: ['allan', 'dwan'], 358: ['algeria'], 359: ['list', 'of', 'atlas', 'shrugged', 'characters'], 569: ['anthropology'], 572: ['agricultural', 'science'], 573: ['alchemy'], 580: ['astronomer'], 586: ['ascii'], 593: ['animation'], 594: ['apollo'], 595: ['andre', 'agassi'], 597: ['austroasiatic', 'languages'], 599: ['afroasiatic', 'languages'], 600: ['andorra'], 612: ['arithmetic', 'mean'], 615: ['american', 'football', 'conference'], 620: ['animal', 'farm'], 621: ['amphibian'], 624: ['alaska'], 627: ['agriculture'], 628: ['aldous', 'huxley'], 630: ['ada'], 

In [13]:
#Creating empty dataframe
df=pd.DataFrame(0,index=doc_no,columns=words)
zone_df=pd.DataFrame(0,index=doc_no,columns=zone_words)
df

Unnamed: 0,eulogist,diethanolamine,perfumery,unbeknownst,harrowing,ganef,bornless,stowage,accused,yemen,...,pealed,decided,subsists,hjerson,misrepresentation,tantalus,amaryllidacea,danube,bobbs,issyk
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
303,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Populating Document-Term Frequency Table
for doc_id,tokenstream in tokenised.items():
    print("Populating Document-Term Frequency Table with doc "+str(doc_id))
    for token in tokenstream:
        df[token].loc[doc_id]+=1

#Populating Zone-Term Frequency Table
for doc_id,tokenstream in zone_tokenised.items():
    print("Populating Zone-Term Frequency Table with doc "+str(doc_id))
    for token in tokenstream:
        zone_df[token].loc[doc_id]+=1

 656
Populating Zone-Term Frequency Table with doc 657
Populating Zone-Term Frequency Table with doc 659
Populating Zone-Term Frequency Table with doc 662
Populating Zone-Term Frequency Table with doc 663
Populating Zone-Term Frequency Table with doc 664
Populating Zone-Term Frequency Table with doc 665
Populating Zone-Term Frequency Table with doc 666
Populating Zone-Term Frequency Table with doc 670
Populating Zone-Term Frequency Table with doc 673
Populating Zone-Term Frequency Table with doc 674
Populating Zone-Term Frequency Table with doc 675
Populating Zone-Term Frequency Table with doc 676
Populating Zone-Term Frequency Table with doc 677
Populating Zone-Term Frequency Table with doc 679
Populating Zone-Term Frequency Table with doc 680
Populating Zone-Term Frequency Table with doc 681
Populating Zone-Term Frequency Table with doc 682
Populating Zone-Term Frequency Table with doc 683
Populating Zone-Term Frequency Table with doc 689
Populating Zone-Term Frequency Table with doc

In [15]:
df['anarchism']
#zone_df['anarchism']

12      103
25        0
39        0
290       0
303       0
       ... 
1519      0
1520      0
1523      0
1525      0
1526      0
Name: anarchism, Length: 445, dtype: int64

In [16]:
#Constructing a dictionary cotaining the term and it's inverse document frequency. Formula: idf=log(N/tf)
inv_doc_freq={}
no_of_docs=len(doc_no)
for word in words:
    inv_doc_freq[word]=np.log10(no_of_docs/sum(df[word]>0))

In [17]:
#Testing out Document Frequency
print(inv_doc_freq['a'])
print(inv_doc_freq['ner'])
print(inv_doc_freq['has'])
print(sum(df['ner']>0))

0.040904987766263036
2.6483600109809315
0.10181734750280057
1


In [18]:
#Creating and population a dictionary containg the vector of the documents
doc_vec={}
for doc_id in doc_no:
    #Creating a vector for each document
    vec=(1+np.log10(np.array(df.loc[doc_id])))#*list(doc_freq.values())
    #Replacing all -inf values with zeros. -inf reached when we take log of 0
    vec[vec==-np.inf]=0
    #Normalizing the vector
    vec=vec/(np.sqrt(sum(vec**2)))
    #Storing the vector
    doc_vec[doc_id]=vec

#Creating and population a dictionary containg the vector of the documents
zone_vec={}
for doc_id in doc_no:
    #Creating a vector for each document
    vec=(1+np.log10(np.array(zone_df.loc[doc_id])))#*list(doc_freq.values())
    #Replacing all -inf values with zeros. -inf reached when we take log of 0
    vec[vec==-np.inf]=0
    #Normalizing the vector
    vec=vec/(np.sqrt(sum(vec**2)))
    #Storing the vector
    doc_vec[doc_id]=vec
#print(doc_vec)

In [19]:
#Storing the dictionaries in pickle files
if not os.path.exists("./Storage") :
    os.mkdir ("./Storage")

doc_vec_file = open('./Storage/doc_vec.pkl', 'wb') 
pickle.dump(doc_vec, doc_vec_file) 
doc_vec_file.close()

zone_vec_file = open('./Storage/zone_vec.pkl', 'wb') 
pickle.dump(zone_vec, zone_vec_file) 
zone_vec_file.close()

inv_doc_freq_file = open('./Storage/inv_doc_freq.pkl', 'wb') 
pickle.dump(inv_doc_freq, inv_doc_freq_file) 
inv_doc_freq_file.close()

doc_no_file = open('./Storage/doc_no.pkl', 'wb') 
pickle.dump(doc_no, doc_no_file) 
doc_no_file.close()

words_file = open('./Storage/words.pkl', 'wb') 
pickle.dump(words, words_file) 
words_file.close()

zone_words_file = open('./Storage/zone_words.pkl', 'wb') 
pickle.dump(zone_words, zone_words_file) 
zone_words_file.close()

zone_file = open('./Storage/zone.pkl', 'wb') 
pickle.dump(zone, zone_file) 
zone_file.close()

tokeinsed_file = open('./Storage/tokeinsed.pkl', 'wb') 
pickle.dump(tokenised, tokeinsed_file) 
tokeinsed_file.close()

zone_tokeinsed_file = open('./Storage/zone_tokeinsed.pkl', 'wb') 
pickle.dump(zone_tokenised, zone_tokeinsed_file) 
zone_tokeinsed_file.close()

df.to_pickle('./Storage/df.pkl','bz2')
zone_df.to_pickle('./Storage/zone_df.pkl','bz2')

In [50]:
print(len(words))

61903
