## Dataset Preprocessing  
---
The index.html files are removed and 15 files from the SRE folder are moved to the stories folder. The resulting 467 files are preprocessed before creating the Unigram Inverted Index. The necessary steps are taken to clean the document text. The documents' names, original texts, and cleaned texts obtained after preprocessing are stored in a pickle file, **docs.pkl**.

In [1]:
# Importing relevant modules for preprocessing

import re
import string
import codecs
import pickle

from tqdm import tqdm
from pathlib import Path
from substitutions import appos
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Reading the 467 documents and storing the document name and document text

Data = []        # one {'doc_name': ..., 'text': ...} record per file that was read
Unresolved = []  # string paths of entries that could not be opened or read

for path in Path("../stories").glob("*"):
    try:
        # codecs.open is kept on purpose: it reads the file in binary mode
        # underneath (no newline translation), and errors='ignore' silently
        # drops bytes that are not valid UTF-8.
        with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as f:
            # Each line keeps its own line ending and the lines are
            # additionally joined by a single space.
            text = ' '.join(f.readlines())
    except (OSError, UnicodeError):
        # Only I/O and decoding failures are treated as "unresolved";
        # KeyboardInterrupt and programming errors are allowed to propagate.
        Unresolved.append(str(path))
    else:
        # Path.name is the final path component on every platform, unlike
        # splitting the string form of the path on '/'.
        Data.append({'doc_name': path.name, 'text': text})

print("Total Documents :", len(Data))
print("Unresolved Documents :", Unresolved)

Total Documents : 467
Unresolved Documents : []


In [3]:
# Function to clean document text

def clean(text):
    """Normalise raw document text into a space-separated string of lemmas.

    The steps run in this order: lowercase the text; replace every token
    that is a key of ``appos`` with its mapped phrase; delete every
    character that is not ``a``-``z`` or whitespace; drop English stop
    words; lemmatize the remaining tokens.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Converting all text to lowercase
    text = text.lower()

    # Substituting words with apostrophe with their appropriate phrases
    # NOTE(review): word_tokenize typically splits contractions into separate
    # tokens (e.g. "don't" -> "do", "n't"); confirm that the keys of `appos`
    # are written in the form the tokenizer actually produces.
    text = ' '.join([appos[word] if word in appos else word for word in word_tokenize(text)])

    # Removing all punctuation and unnecessary characters from text
    # (characters are deleted, not replaced by a space)
    text = re.sub(r'[^a-z\s]', '', text)

    # Removing stopwords from text
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in word_tokenize(text) if w not in stop_words]

    # Lemmatizing the surviving tokens directly: they are already tokenized
    # and stop-word filtered, so joining them only to tokenize and filter a
    # second time would repeat the same work.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w) for w in tokens)

In [4]:
# Cleaning every document

# Store the preprocessed text next to the raw text on each record;
# tqdm only wraps the iteration to display a progress bar.
for record in tqdm(Data):
    raw_text = record['text']
    record['cleaned_text'] = clean(raw_text)

100%|██████████| 467/467 [00:38<00:00, 12.07it/s]


In [5]:
# Ordering the document records alphabetically by file name (in place)

def _by_doc_name(record):
    """Sort key: the record's document name."""
    return record['doc_name']

Data.sort(key=_by_doc_name)

In [6]:
# Dumping in Pickle File

# The context manager guarantees the file is flushed and closed once the
# dump finishes (or fails), instead of leaving the handle open.
with open('../Dumps/docs.pkl', 'wb') as dump_file:
    pickle.dump(Data, dump_file)