**Run these cells for training**

In [4]:
# Install the PyTorch nightly wheel (looked up via the cu92 nightly index URL) and fastai.
# NOTE(review): neither install pins a version, so a fresh run may resolve
# different releases than the ones this notebook was written against.
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html


In [5]:
# System packages installed with apt-get before the Python packages below.
# Presumably these are the native tools textract relies on to read the
# different document/audio/image formats — confirm against textract's install notes.
!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig libpulse-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libjpeg-dev is already the newest version (8c-2ubuntu8).
python-dev is already the newest version (2.7.15~rc1-1).
antiword is already the newest version (0.37-11build1).
flac is already the newest version (1.3.2-1).
lame is already the newest version (3.100-2).
pstotext is already the newest version (1.9-6build1).
swig is already the newest version (3.0.12-1).
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
unrtf is already the newest version (0.21.9-clean-3).
libpulse-dev is already the newest version (1:11.1-1ubuntu7.11).
libxml2-dev is already the newest version (2.9.4+dfsg1-6.1ubuntu1.3).
libxslt1-dev is already the newest version (1.1.29-5ubuntu0.2).
poppler-utils is already the newest version (0.62.0-2ubuntu2.12).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
libmad0 is already the newest version (0.15.1b-9ubuntu18.04.1).
libsox-fmt-mp3 is already th

In [6]:
# text-preprocessing provides preprocess_text and textract provides the file
# text extraction; both are imported further down.
# NOTE(review): azure-storage-blob is not imported by any cell shown here —
# confirm it is still required before keeping this install.
!pip install text-preprocessing
!pip install textract
!pip install azure-storage-blob



In [7]:
import nltk
# Fetch the NLTK resources used by process(): the stop-word list, the 'punkt'
# data (presumably what word_tokenize needs — confirm), and the English word list.
# quiet=True suppresses the download progress messages.
nltk.download('stopwords', quiet = True)
nltk.download('punkt', quiet = True)
nltk.download('words',quiet = True)

True

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [9]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from text_preprocessing import preprocess_text
import textract
from functools import partial
import re
import io
import os

**TRAINING**

In [None]:
# Root directory that process() walks recursively. process() converts the name
# of every folder that directly contains files to an integer label, so the
# documents are expected to sit in numerically named sub-folders (e.g. <rootdir>/0, <rootdir>/1).
rootdir = 'Specify the path of root directory here'

In [10]:
def unique_list(l):
    """Return the items of ``l`` with duplicates removed, keeping first-seen order.

    Args:
        l: Any iterable of items.

    Returns:
        A new list holding each distinct item once, in order of first appearance.
    """
    items = list(l)
    try:
        # dict keys are unique and keep insertion order, so this de-duplicates
        # in a single pass instead of rescanning the result list for every item.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; fall back to an
        # equality-based scan so such inputs keep working.
        ulist = []
        for x in items:
            if x not in ulist:
                ulist.append(x)
        return ulist

In [11]:
def _clean_text(text, stop_words, words):
  """Reduce raw extracted text to a space-separated string of unique English words.

  Args:
      text: Decoded text of one document.
      stop_words: Set of stop words to discard.
      words: Set of accepted English words (compared against the lower-cased token).

  Returns:
      The cleaned text; an empty string when no token survives the filters.
  """
  # Run the text_preprocessing pipeline, then split the result into tokens.
  tokens = word_tokenize(preprocess_text(text))

  # Discard stop words and very short tokens (three characters or fewer).
  tokens = [w for w in tokens if w not in stop_words and len(w) > 3]

  # Strip every non-alphabetic character from the remaining tokens. A token
  # made only of such characters becomes '' here and disappears in the split below.
  tokens = [''.join(ch for ch in w if ch.isalpha()) for w in tokens]

  # Collapse the runs of spaces left behind by emptied tokens.
  joined = re.sub(' +', ' ', " ".join(tokens))

  # Keep only the first occurrence of each word, preserving order.
  deduped = ' '.join(unique_list(joined.split()))

  # Keep tokens found in the English word list (non-alphabetic tokens pass through).
  return " ".join(w for w in nltk.wordpunct_tokenize(deduped)
                  if w.lower() in words or not w.isalpha())


def process(rootdir, train = True):
  """Walk ``rootdir``, extract the text of every file and clean it for training.

  The label of a file is the name of the folder that directly contains it,
  converted to ``int`` (this may need changing for other folder structures).

  Args:
      rootdir: Directory to walk recursively.
      train: When True the labels are returned as a numpy array, otherwise
          as a plain list.

  Returns:
      A dict of equal-length sequences with the keys "FileName", "FilePath",
      "Text" (cleaned text), "Label" and "Length" (character count of the
      cleaned text).

  Raises:
      ValueError: If a folder that contains files does not have an integer name.
  """
  paths, fname, descr, label, length = [], [], [], [], []

  # Build both lookup sets once; membership tests run for every token.
  stop_words = set(stopwords.words('english'))
  words = set(nltk.corpus.words.words())

  for subdir, _dirs, files in os.walk(rootdir):
    if not files:
      # Folders without files (typically the root itself) carry no label.
      continue

    # os.path handles the platform's separator and a trailing slash, which
    # splitting on '/' did not.
    folder = os.path.basename(os.path.normpath(subdir))
    try:
      folder_label = int(folder)
    except ValueError as exc:
      raise ValueError(
        "Cannot derive an integer label from folder %r (path %r); files must "
        "be placed in numerically named folders." % (folder, subdir)
      ) from exc

    for file in files:
      file_path = os.path.join(subdir, file)

      # Extract the text from the file and decode the byte string.
      text = textract.process(file_path).decode("utf-8")
      cleaned = _clean_text(text, stop_words, words)

      paths.append(file_path)
      fname.append(str(file))
      label.append(folder_label)
      descr.append(cleaned)
      length.append(len(cleaned))

  if train:
    # Convert the target variable to a numpy array.
    label = np.array(label)

  return {"FileName" : fname, "FilePath" : paths, "Text" : descr ,"Label" : label, "Length" : length}

def saveandsplit(Data_Frame, csv_path = 'Dataset.csv', test_size = 0.2, random_state = 100):
  """Save the dataset to CSV and split it into stratified train/validation frames.

  The input frame is not modified, so the cell calling this function can be
  re-run on the same ``Data_Frame``.

  Args:
      Data_Frame: DataFrame with the columns 'FileName', 'FilePath', 'Text',
          'Label' and 'Length'.
      csv_path: Where the full (NaN-free) dataset is written for future use.
      test_size: Fraction of rows placed in the validation frame.
      random_state: Seed for the split, kept fixed for reproducibility.

  Returns:
      Tuple ``(DataFrame_Train, DataFrame_Val)`` holding only the 'Text' and
      'Label' columns, each with a fresh 0..n-1 index.
  """
  # Drop rows containing NaN values (returns a new frame).
  cleaned = Data_Frame.dropna()

  # Save the dataset to a csv for future use, file name/path columns included.
  cleaned.to_csv(csv_path, encoding='utf-8', index = False)

  # Only the text and its label are needed for training.
  features = cleaned.drop(['FileName', 'FilePath', 'Length'], axis = 1)

  # Stratify on the label so both frames keep the same class proportions.
  DataFrame_Train, DataFrame_Val = train_test_split(
    features, stratify = features['Label'], test_size = test_size, random_state = random_state)

  # Reset the indices without in-place edits on the split results.
  DataFrame_Train = DataFrame_Train.reset_index(drop=True)
  DataFrame_Val = DataFrame_Val.reset_index(drop=True)

  return DataFrame_Train, DataFrame_Val

def prepareData(DataFrame_Train, DataFrame_Val):
  """Build the fastai data bunches for language-model and classifier training.

  Args:
      DataFrame_Train: Training frame with 'Text' and 'Label' columns.
      DataFrame_Val: Validation frame with the same columns.

  Returns:
      Tuple ``(data_lm, data_clas)``: the language-model data bunch and the
      classifier data bunch, the latter built with the language model's vocab.
  """
  # Language model data, assembled step by step from the training frame only.
  # NOTE(review): cols lists 'Label' next to 'Text', so the label column is
  # handed to the language model as well — confirm this is intended.
  lm_items = TextList.from_df(df=DataFrame_Train, cols=['Text', 'Label'])
  lm_split = lm_items.split_by_rand_pct(0.2)
  lm_labelled = lm_split.label_for_lm()
  data_lm = lm_labelled.databunch(bs=4, bptt=80, num_workers=0)

  # Classifier model data, reusing the language model's training vocab.
  lm_vocab = data_lm.train_ds.vocab
  data_clas = TextClasDataBunch.from_df(
    ".",
    train_df=DataFrame_Train,
    valid_df=DataFrame_Val,
    vocab=lm_vocab,
    text_cols='Text',
    label_cols='Label',
    bs=2,
  )

  return data_lm, data_clas

In [None]:
# Extract and clean every document under rootdir, tabulate the result, then
# split it and build the language-model / classifier data bunches.
Data        = process(rootdir)
Data_Frame  = pd.DataFrame(Data, columns = ['FileName', 'FilePath', 'Text' ,'Label', 'Length'])
DataFrame_Train, DataFrame_Val = saveandsplit(Data_Frame)
data_lm, data_clas = prepareData(DataFrame_Train, DataFrame_Val)

# Stage 1: fine-tune a pretrained AWD_LSTM language model on the training text.
learn = language_model_learner(data_lm, arch = AWD_LSTM, pretrained = True, drop_mult=0.4)
learn.fit_one_cycle(1, 1e-2)
# Optional checkpoint of the language model (disabled):
#learn.save('stage-1')

# Save this encoder to use it for classification later
learn.save_encoder('ft_enc')

# Stage 2: build the classifier and load the encoder saved above.
# Note that `learn` is rebound here; the language-model learner is no longer reachable.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.4)
learn.load_encoder('ft_enc')

# Plot the learning-rate finder curve.
# NOTE(review): the learning rate passed to fit_one_cycle below is hard-coded,
# it is not read from this plot.
learn.lr_find()
learn.recorder.plot()

# Train the classifier.
learn.fit_one_cycle(2, 1e-1)
# Optional checkpoint of the classifier (disabled):
#learn.save('stage-2a')

# Export pkl file.
learn.export()