In [None]:
import re
import pickle
import os
import string
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [None]:
# Parse the corpus into per-document token streams and a vocabulary.
#
# Produces the notebook-level names used by later cells:
#   doc_no  - list of integer document ids, in the order they appear in the corpus
#   words   - list of the distinct tokens seen across all documents (sorted, so the
#             column order of the term table is the same on every run)
#   diction - dict mapping document id -> list of tokens of that document
# Side effect: the raw text of every document is written to ./Documents/<id>.txt

doc_no = []
diction = {}
# A set gives O(1) de-duplication; it is turned into the `words` list once at the end
vocabulary = set()

# Opening the corpus and reading the file; `with` closes it even if reading fails
with open('./Text_corpus/wiki_00', 'r', encoding='utf8') as f:
    content = f.read()

# Removing <a ...> and </a> tags (the text between them is kept)
pattern = re.compile("<(/)?a[^>]*>")
content_new = re.sub(pattern, "", content)

# Creating a folder to hold the separated documents
os.makedirs("./Documents", exist_ok=True)

# The translation tables do not depend on the document, so build them once.
# First table: every ASCII punctuation character becomes a space.
ascii_punct_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# Second table: non-ASCII quotes/dashes. Add any further symbol to this string;
# the matching run of spaces is generated automatically.
extra_punct = "‘’’–——−"
extra_punct_table = str.maketrans(extra_punct, ' ' * len(extra_punct))

# Creating a soup using an html parser and iterating through each 'doc'
soup = BeautifulSoup(content_new, 'html.parser')
for doc in soup.find_all('doc'):
    doc_id = int(doc['id'])
    raw_text = doc.get_text()

    # Writing the untouched text of the doc to its own file
    with open('./Documents/' + str(doc['id']) + ".txt", 'w', encoding='utf8') as o:
        o.write(raw_text)

    # Lowercase, then replace punctuation with spaces before tokenizing
    text = raw_text.lower().translate(ascii_punct_table).translate(extra_punct_table)

    # Tokenize once and reuse the result for both the index and the vocabulary
    tokens = word_tokenize(text)
    doc_no.append(doc_id)
    diction[doc_id] = tokens
    vocabulary.update(tokens)

# Distinct words across the whole corpus
words = sorted(vocabulary)

In [None]:
# Document-term table skeleton: one row per document id, one column per
# vocabulary word, every cell starting at a count of zero.
df = pd.DataFrame(data=0, columns=words, index=doc_no)
df


In [None]:
# Populating the Document-Term Frequency Table.
# For every document, count how often each token occurs and store the counts in
# that document's row of `df`.
#
# The counts are written with a single `df.loc[row, columns] = values` per
# document. The chained form `df[token].loc[doc_id] += 1` assigns into a
# temporary Series, which pandas warns about and which leaves `df` unchanged
# when Copy-on-Write is active. Assigning (rather than incrementing) also makes
# this cell safe to re-run without double counting.
for doc_id, tokenstream in diction.items():
    print("Populating Document-Term Frequency Table with doc "+str(doc_id))
    if not tokenstream:
        # No tokens in this document; its row keeps the initial zeros
        continue
    # index = distinct tokens of the document, values = occurrences of each
    token_counts = pd.Series(tokenstream).value_counts()
    df.loc[doc_id, token_counts.index] = token_counts.to_numpy()

In [None]:
# Show the per-document counts of a single term as a quick sanity check
df.loc[:, 'anarchism']

In [None]:
# Creating the Document Frequency dictionary.
# Despite its name, `doc_freq` maps each word to its *inverse* document
# frequency: log10(number of documents / number of documents containing the word).
no_of_docs = len(doc_no)

# Count, for every column at once, how many documents have a non-zero count.
# This replaces a Python-level `sum()` over each column with one vectorized
# reduction; `reindex(words)` keeps the values in the same order as `words`.
docs_containing = (df > 0).sum(axis=0).reindex(words)
idf = np.log10(no_of_docs / docs_containing)

# Keys are inserted in `words` order, values stay numpy floats
doc_freq = dict(zip(words, idf.to_numpy()))

In [None]:
# Spot-check the stored document-frequency values for two sample terms
for term in ('a', 'ner'):
    print(doc_freq[term])
# Number of documents in which 'ner' occurs at least once
print((df['ner'] > 0).sum())

In [None]:
# Creating and populating a dictionary containing the vector of each document.
# Each vector has one entry per column of `df`:
#     (1 + log10(term count)) * doc_freq[term]   where the term occurs in the document
#     0                                          where it does not
# and is then scaled to unit Euclidean length.
doc_vec = {}

# Weights aligned explicitly with the columns of `df` (built once, not per
# document), so the result does not depend on the insertion order of `doc_freq`.
idf = np.array([doc_freq[word] for word in df.columns], dtype=float)

for doc_id in doc_no:
    counts = df.loc[doc_id].to_numpy(dtype=float)

    # Only take the logarithm where the term is present; absent terms stay 0.
    # This avoids computing log10(0) = -inf and patching it afterwards.
    present = counts > 0
    vec = np.zeros(counts.shape, dtype=float)
    vec[present] = (1 + np.log10(counts[present])) * idf[present]

    # Normalizing the vector; an all-zero vector is left as is, because
    # dividing it by its zero length would turn every entry into NaN
    norm = np.sqrt(np.sum(vec ** 2))
    if norm > 0:
        vec = vec / norm

    # Storing the vector
    doc_vec[doc_id] = vec

# Report a summary instead of dumping every vector into the cell output
print("Built " + str(len(doc_vec)) + " document vectors")

In [None]:
# Storing the dictionaries and lists in pickle files under ./Storage,
# one file per object, named <variable name>.pkl
os.makedirs("./Storage", exist_ok=True)

# Each object is written exactly once (doc_vec used to be dumped twice)
artifacts = {
    'doc_vec': doc_vec,
    'doc_freq': doc_freq,
    'doc_no': doc_no,
    'words': words,
    'diction': diction,
}
for artifact_name, artifact in artifacts.items():
    # `with` guarantees the file is flushed and closed even if pickling fails
    with open('./Storage/' + artifact_name + '.pkl', 'wb') as pkl_file:
        pickle.dump(artifact, pkl_file)

# The term table is stored through pandas' own pickle writer
df.to_pickle('./Storage/df.pkl')