In [None]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

In [None]:
!apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig libpulse-dev

In [None]:
!pip install text-preprocessing
!pip install textract
!pip install azure-storage-blob

In [None]:
import nltk
# NLTK data used further down: the stop-word list, the Punkt tokenizer models
# (needed by word_tokenize) and the English word list.
# NLTK >= 3.8.2 loads Punkt from the 'punkt_tab' package instead of the pickled
# 'punkt' one, so both are fetched to keep word_tokenize working on old and new releases.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'words'):
    nltk.download(resource, quiet = True)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [None]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from text_preprocessing import preprocess_text
import textract
from functools import partial
import re
import io
import os

**MODEL 1**

In [None]:
def unique_list(l):
    """Return the items of ``l`` without duplicates, keeping first-seen order.

    Args:
        l: Any iterable. Items are compared by equality.

    Returns:
        A new list holding the first occurrence of every distinct item.
    """
    items = list(l)
    try:
        # Fast path: hashable items are de-duplicated in linear time.
        seen = set()
        ulist = []
        for x in items:
            if x not in seen:
                seen.add(x)
                ulist.append(x)
        return ulist
    except TypeError:
        # At least one item is unhashable (e.g. a list): fall back to the
        # equality scan, which works for any item type.
        ulist = []
        for x in items:
            if x not in ulist:
                ulist.append(x)
        return ulist

def process(rootdir, train = True):
    """Walk ``rootdir``, extract and clean the text of every file found.

    Every file is read with ``textract``, cleaned with ``preprocess_text``,
    tokenized, stripped of stop words / short tokens / non-alphabetic
    characters, de-duplicated (first occurrence kept) and finally reduced to
    the tokens present in the NLTK English word list.

    The label of a file is the name of the folder that directly contains it,
    converted to ``int`` (adapt this if your folder layout differs).

    Args:
        rootdir: Directory to walk recursively.
        train: When true, the labels are returned as a numpy array instead of
            a plain list.

    Returns:
        dict with the parallel sequences ``FileName``, ``FilePath``, ``Text``,
        ``Label`` and ``Length`` (number of characters of the cleaned text).

    Raises:
        ValueError: if a folder that contains files does not have an integer name.
    """
    # File path.
    paths = []
    # File name.
    fname = []
    # Textracted content from file.
    descr = []
    # Labels of file.
    label = []
    # Length of each document.
    length = []
    # Store the list of stop words in english language.
    stop_words = set(stopwords.words('english'))
    # Store the list of words common in english language.
    words = set(nltk.corpus.words.words())

    # Walking through folders & subfolders in the root directory.
    for subdir, dirs, files in os.walk(rootdir):
        if not files:
            continue

        # The label is the name of the containing folder. normpath + basename
        # copes with a trailing separator on rootdir and with Windows paths,
        # where splitting on '/' returned an empty or a wrong component.
        folder_name = os.path.basename(os.path.normpath(subdir))
        try:
            folder_label = int(folder_name)
        except ValueError:
            raise ValueError(
                "Cannot derive an integer label from folder {!r}; files are "
                "expected to live in folders named after their class id.".format(subdir)
            ) from None

        for file in files:
            file_path = os.path.join(subdir, file)

            paths.append(file_path)
            fname.append(str(file))
            label.append(folder_label)

            # Extract the text from the file and decode the byte string to text.
            text = textract.process(file_path).decode("utf-8")

            # Preprocess the text with a custom library.
            t = preprocess_text(text)

            # Tokenize the text.
            word_tokens = word_tokenize(t)

            # Drop stop words and tokens too short to be meaningful.
            filtered_sentence = [w for w in word_tokens
                                 if w not in stop_words and len(w) > 3]

            # Remove any non-alphabetic characters that got included in the tokens.
            filtered_sentence = [''.join(x for x in i if x.isalpha()) for i in filtered_sentence]

            # Join the tokens with space(' ') as delimiter and collapse the
            # runs of spaces left behind by tokens that became empty.
            res = re.sub(' +', ' ', " ".join(filtered_sentence))

            # Keep only the first occurrence of every token.
            a = ' '.join(unique_list(res.split()))

            # Remove the words that are not present in english language.
            a = " ".join(w for w in nltk.wordpunct_tokenize(a)
                         if w.lower() in words or not w.isalpha())

            # Append the text to description variable.
            descr.append(a)

            # Append the length of text to length variable.
            length.append(len(a))

    if train:
        # Converting the target variable to a numpy array.
        label = np.array(label)

    return {"FileName" : fname, "FilePath" : paths, "Text" : descr ,"Label" : label, "Length" : length}


def preprocess(Data_Frame):
    """Prepare the frame built from ``process`` for prediction.

    Rows containing a missing value in any column are dropped, the 'Length'
    and 'Label' columns are removed and the index is renumbered from 0 so
    callers can address rows positionally with ``.loc``.

    The frame is modified in place; the same object is returned.

    Args:
        Data_Frame: pandas DataFrame with at least the 'Length' and 'Label' columns.

    Returns:
        The same DataFrame, cleaned.

    Raises:
        KeyError: if 'Length' or 'Label' is not a column of the frame.
    """
    # Dropping NaN values.
    Data_Frame.dropna(inplace = True)

    # Remove the columns 'Length' & 'Label' from the Dataframe; they are not needed for prediction.
    Data_Frame.drop(['Length', 'Label'], axis = 1, inplace = True)

    # dropna leaves gaps in the index; renumber so that .loc[i] is valid for every i in range(len).
    Data_Frame.reset_index(drop = True, inplace = True)

    return Data_Frame

def model_1(rootdir, search, labels, rlabels, model_path = '/home/yaswant/code/console/python/m1/'):
    """Classify every document under ``rootdir`` with model 1.

    Args:
        rootdir: Directory whose files are classified (see ``process``).
        search: Label name to keep in the result, or "*" to keep every
            document (in which case the frame is also written to 'result.csv').
        labels: Mapping class id -> label name.
        rlabels: Mapping label name -> class id (the inverse of ``labels``).
        model_path: Directory holding the exported learner. Defaults to the
            path the notebook was developed with.

    Returns:
        DataFrame with 'FileName', 'FilePath', 'Text' and the predicted
        'target' label name, restricted to ``search`` unless it is "*".

    Raises:
        KeyError: if ``search`` is neither "*" nor a key of ``rlabels``.
    """
    learn      = load_learner(model_path)

    Data        =  process(rootdir)
    Data_Frame  =  pd.DataFrame(Data, columns = ['FileName', 'FilePath', 'Text' ,'Label', 'Length'])

    Data_Frame = preprocess(Data_Frame)

    result = []
    # Iterate over the column itself rather than .loc[i] so the loop does not
    # depend on the frame having a gap-free 0..n-1 index.
    for text in Data_Frame["Text"]:
        category = learn.predict(str(text))[0]
        # The predicted category is rendered as text; its digits are the class id.
        class_id = int("".join(re.findall(r'\d+', str(category))))
        result.append(labels[class_id])

    Data_Frame["target"] = result

    if search == "*":
        Data_Frame.to_csv('result.csv', encoding='utf-8', index = False)
    else:
        # 'target' holds label names while rlabels[search] is a class id, so
        # the id has to be mapped back to its name before comparing.
        wanted = labels[rlabels[search]]
        Data_Frame = Data_Frame.loc[Data_Frame['target'] == wanted]
        # Resetting index of the filtered result.
        Data_Frame = Data_Frame.reset_index(drop=True)

    return Data_Frame

**MODEL 2**

In [None]:
def model_2(rootdir, search, labels, rlabels, model_path = '/home/yaswant/code/console/python/model2/'):
    """Classify every document under ``rootdir`` with model 2.

    Args:
        rootdir: Directory whose files are classified (see ``process``).
        search: Label name to keep in the result, or "*" to keep every
            document (in which case the frame is also written to 'result.csv').
        labels: Mapping class id -> label name.
        rlabels: Mapping label name -> class id (the inverse of ``labels``).
        model_path: Directory holding the exported learner. Defaults to the
            path the notebook was developed with.

    Returns:
        DataFrame with 'FileName', 'FilePath', 'Text' and the predicted
        'target' label name, restricted to ``search`` unless it is "*".

    Raises:
        KeyError: if ``search`` is neither "*" nor a key of ``rlabels``.
    """
    learn = load_learner(model_path)

    Data        =  process(rootdir)
    Data_Frame  =  pd.DataFrame(Data, columns = ['FileName', 'FilePath', 'Text' ,'Label', 'Length'])

    Data_Frame = preprocess(Data_Frame)

    result = []
    # Iterate over the column itself rather than .loc[i] so the loop does not
    # depend on the frame having a gap-free 0..n-1 index.
    for text in Data_Frame["Text"]:
        category = learn.predict(str(text))[0]
        # The predicted category is rendered as text; its digits are the class id.
        class_id = int("".join(re.findall(r'\d+', str(category))))
        result.append(labels[class_id])

    Data_Frame["target"] = result

    if search == "*":
        Data_Frame.to_csv('result.csv', encoding='utf-8', index = False)
    else:
        # 'target' holds label names while rlabels[search] is a class id, so
        # the id has to be mapped back to its name before comparing.
        wanted = labels[rlabels[search]]
        Data_Frame = Data_Frame.loc[Data_Frame['target'] == wanted]
        # Resetting index of the filtered result.
        Data_Frame = Data_Frame.reset_index(drop=True)

    return Data_Frame

**ACTIVE LEARNING**

In [None]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
import pandas as pd
import json
import requests
import os
import time
import urllib.parse

In [None]:
def _skillset_payload(skillset_name):
    """Build the skillset definition: organizations, language, page split, key phrases."""
    return {
        "name": skillset_name,
        "description":
        "Extract entities, detect language and extract key-phrases",
        "skills":
        [
            {
                "@odata.type": "#Microsoft.Skills.Text.EntityRecognitionSkill",
                "categories": ["Organization"],
                "defaultLanguageCode": "en",
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/content"
                    }
                ],
                "outputs": [
                    {
                        "name": "organizations",
                        "targetName": "organizations"
                    }
                ]
            },
            {
                "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill",
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/content"
                    }
                ],
                "outputs": [
                    {
                        "name": "languageCode",
                        "targetName": "languageCode"
                    }
                ]
            },
            {
                "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
                "textSplitMode": "pages",
                "maximumPageLength": 4000,
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/content"
                    },
                    {
                        "name": "languageCode",
                        "source": "/document/languageCode"
                    }
                ],
                "outputs": [
                    {
                        "name": "textItems",
                        "targetName": "pages"
                    }
                ]
            },
            {
                "@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill",
                "context": "/document/pages/*",
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/pages/*"
                    },
                    {
                        "name": "languageCode",
                        "source": "/document/languageCode"
                    }
                ],
                "outputs": [
                    {
                        "name": "keyPhrases",
                        "targetName": "keyPhrases"
                    }
                ]
            }
        ]}


def _index_payload(index_name):
    """Build the search index definition (id, file name, content and enrichment fields)."""
    return {
        "name": index_name,
        "fields": [
            {
                "name": "id",
                "type": "Edm.String",
                "key": "true",
                "searchable": "true",
                "filterable": "false",
                "facetable": "false",
                "sortable": "true"
            },
            {
                "name": "metadata_storage_name",
                "type": "Edm.String",
                "facetable": "false",
                "filterable": "false",
                "key": "false",
                "retrievable": "true",
                "searchable": "true",
                "sortable": "false",
                "analyzer": "standard.lucene",
            },
            {
                "name": "content",
                "type": "Edm.String",
                "sortable": "false",
                "searchable": "true",
                "filterable": "false",
                "facetable": "false"
            },
            {
                "name": "languageCode",
                "type": "Edm.String",
                "searchable": "true",
                "filterable": "false",
                "facetable": "false"
            },
            {
                "name": "keyPhrases",
                "type": "Collection(Edm.String)",
                "searchable": "true",
                "filterable": "false",
                "facetable": "false"
            },
            {
                "name": "organizations",
                "type": "Collection(Edm.String)",
                "searchable": "true",
                "sortable": "false",
                "filterable": "false",
                "facetable": "false"
            }
        ]}


def _indexer_payload(indexer_name, datasource_name, index_name, skillset_name):
    """Build the indexer definition wiring the data source, skillset and index together."""
    return {
        "name": indexer_name,
        "dataSourceName": datasource_name,
        "targetIndexName": index_name,
        "skillsetName": skillset_name,
        "fieldMappings": [
            {
                "sourceFieldName": "metadata_storage_path",
                "targetFieldName": "id",
                "mappingFunction":
                {"name": "base64Encode"}
            },
            {
                "sourceFieldName": "content",
                "targetFieldName": "content"
            }
        ],
        "outputFieldMappings":
        [
            {
                "sourceFieldName": "/document/organizations",
                "targetFieldName": "organizations"
            },
            {
                "sourceFieldName": "/document/pages/*/keyPhrases/*",
                "targetFieldName": "keyPhrases"
            },
            {
                "sourceFieldName": "/document/languageCode",
                "targetFieldName": "languageCode"
            }
        ],
        "parameters":
        {
            "maxFailedItems": 0,
            "maxFailedItemsPerBatch": 0,
            "configuration":
            {
                "dataToExtract": "contentAndMetadata",
                "imageAction": "generateNormalizedImages"
            }
        }}


def _put_search_resource(endpoint, collection, name, payload, headers, params):
    """Create or update one search-service resource and raise on an HTTP error status."""
    response = requests.put(endpoint + "/" + collection + "/" + name,
                            data=json.dumps(payload), headers=headers, params=params)
    response.raise_for_status()
    return response


def _wait_for_indexer(status_url, headers, params, expected_docs, timeout,
                      initial_delay=60, poll_interval=20):
    """Poll the indexer status until its run is finished, failed or ``timeout`` seconds passed."""
    deadline = time.monotonic() + timeout
    # Give the freshly created indexer time to start before the first poll.
    time.sleep(initial_delay)
    while True:
        response = requests.get(status_url, headers=headers, params=params)
        response.raise_for_status()
        # 'lastResult' is null until the indexer has started its first run.
        last_result = response.json().get("lastResult") or {}
        state = last_result.get("status")
        if last_result.get("itemsProcessed") == expected_docs or state == "success":
            return
        if state in ("transientFailure", "persistentFailure"):
            raise RuntimeError("Indexer run failed: {}".format(last_result.get("errorMessage")))
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "Indexer did not finish within {} seconds (last status: {})".format(timeout, state))
        time.sleep(poll_interval)


def act_learn(rootdir, search, cont_name, folder, index_timeout=900):
    """Upload the files under ``rootdir`` to blob storage, index them and rank them for ``search``.

    The files are uploaded to the container ``cont_name`` under the virtual
    folder ``folder``; a data source, skillset, index and indexer named after
    ``folder`` are created on the search service; once the indexer has run,
    the index is queried with ``search`` and the hits are joined back to the
    local files.

    Credentials are read from the environment so that they never live in the
    notebook:

    * ``AZURE_STORAGE_CONNECTION_STRING`` - storage account connection string.
    * ``AZURE_SEARCH_API_KEY`` - admin key of the search service.
    * ``AZURE_SEARCH_ENDPOINT`` - optional, search service URL.

    Args:
        rootdir: Local directory walked recursively for files to upload.
        search: Search term; also written to the 'Label' column of the result.
        cont_name: Name of the blob container to upload to.
        folder: Virtual folder inside the container, also used as the suffix
            of the search resources' names.
        index_timeout: Maximum number of seconds to wait for the indexer.

    Returns:
        DataFrame with 'FileName', 'FilePath', 'Label' and 'SearchScore' for
        every file returned by the query, best score first.

    Raises:
        RuntimeError: if a credential is missing or the indexer run fails.
        TimeoutError: if the indexer does not finish within ``index_timeout``.
        requests.HTTPError: if the search service rejects a request.
    """
    connect_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    api_key = os.environ.get("AZURE_SEARCH_API_KEY")
    if not connect_str or not api_key:
        raise RuntimeError(
            "Set AZURE_STORAGE_CONNECTION_STRING and AZURE_SEARCH_API_KEY in the "
            "environment before calling act_learn.")

    paths = []
    fname = []
    search_term = urllib.parse.quote(str(search))
    print("File is being downloaded it may take upto 5 minutes")
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            paths.append(os.path.join(subdir, file))
            fname.append(str(file))
    no_of_docs = len(fname)

    data1 = {"FileName" : fname, "FilePath" : paths, "Label" : [None] * len(paths)}
    df1 = pd.DataFrame(data1, columns = ['FileName', 'FilePath', 'Label'])

    # Create the BlobServiceClient object which will be used to create the blob clients.
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)

    for upload_file_path, file_name in zip(paths, fname):
        # The blob is named after the local file, inside the caller's folder.
        blob_name = str(folder) + "/" + file_name
        blob_client = blob_service_client.get_blob_client(container=cont_name, blob=blob_name)

        # overwrite=True keeps a re-run from failing on blobs uploaded earlier.
        with open(upload_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

    # Define the names for the data source, skillset, index and indexer
    datasource_name = "cogsrch-py-datasource" + str(folder)
    skillset_name = "cogsrch-py-skillset" + str(folder)
    index_name = "cogsrch-py-index" + str(folder)
    indexer_name = "cogsrch-py-indexer" + str(folder)

    # Setup the endpoint
    endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT", 'https://eandysearch.search.windows.net')
    headers = {'Content-Type': 'application/json',
               'api-key': api_key}
    params = {'api-version': '2020-06-30'}

    # Create a data source restricted to the caller's folder of the container.
    datasource_payload = {
        "name": datasource_name,
        "description": "Demo files to demonstrate cognitive search capabilities.",
        "type": "azureblob",
        "credentials": {
            "connectionString": connect_str
        },
        "container": {
            "name": cont_name,
            "query" : str(folder) + "/"
        }}
    _put_search_resource(endpoint, "datasources", datasource_name, datasource_payload, headers, params)

    # Create the skillset, the index and the indexer (creating the indexer starts it).
    _put_search_resource(endpoint, "skillsets", skillset_name,
                         _skillset_payload(skillset_name), headers, params)
    _put_search_resource(endpoint, "indexes", index_name,
                         _index_payload(index_name), headers, params)
    _put_search_resource(endpoint, "indexers", indexer_name,
                         _indexer_payload(indexer_name, datasource_name, index_name, skillset_name),
                         headers, params)

    # Wait until the indexer has processed the uploaded documents.
    _wait_for_indexer(endpoint + "/indexers/" + indexer_name + "/status",
                      headers, params, no_of_docs, index_timeout)

    # Query the index for the search term.
    r = requests.get(endpoint + "/indexes/" + index_name +
                "/docs?&search="+search_term, headers=headers, params=params)
    r.raise_for_status()
    s = r.json()

    # One record per hit keeps the file name and its score aligned even if a
    # hit lacks one of the two fields.
    records = []
    for hit in s.get("value", []):
        score = hit.get("@search.score")
        name = hit.get("metadata_storage_name")
        if score is None or name is None:
            continue
        print("Search score is {}".format(score))
        print("Document name is {}".format(name))
        print("*" * (15))
        records.append({"FileName" : name, "SearchScore" : score})
    df2 = pd.DataFrame(records, columns = ['FileName', 'SearchScore'])

    df3 = pd.merge(left=df1, right=df2, left_on='FileName', right_on='FileName')

    df3 = df3.sort_values(by = ['SearchScore'], ascending=False, ignore_index=True)

    df3["Label"] = str(search)

    # Delete the skillset (best effort; the result is already computed).
    requests.delete(endpoint + "/skillsets/" + skillset_name,
                    headers=headers, params=params)

    return df3

**TEST_PROTOTYPE**

In [None]:
if __name__ == "__main__":
    # Inputs, one per line: department name, user id, root directory, search term.
    Dep_name = input()
    User_id = input()
    rootdir = input()
    search = input()

    # Class id -> label name for each model; the reverse mappings are derived
    # from them so the two directions can never drift apart.
    labels_m1 = {0 : "religion", 1 : "computers", 2 : "sale", 3 : "recreation", 4 : "science", 5 : "politics"}
    rlabels_m1 = {name : idx for idx, name in labels_m1.items()}
    labels_m2 = {0:"bills", 1:"case study", 2:"coding guidelines", 3:"product management toolkit", 4 : "rules and regulations", 5 : "faq"}
    rlabels_m2 = {name : idx for idx, name in labels_m2.items()}

    # Route on membership, not on the looked-up class id: id 0 is falsy and an
    # unknown search term must fall through to active learning instead of
    # raising KeyError. "*" (every label) is handled by the models themselves.
    if Dep_name == "Department 1" and (search == "*" or search in rlabels_m1):
        # Model 1
        Data_Frame = model_1(rootdir, search, labels_m1, rlabels_m1)
    elif Dep_name == "Department 2" and (search == "*" or search in rlabels_m2):
        # Model 2
        Data_Frame = model_2(rootdir, search, labels_m2, rlabels_m2)
    else:
        # Active Learning
        Data_Frame = act_learn(rootdir, search, Dep_name, User_id)

    Data_Frame.to_csv(User_id + 'result.csv', encoding='utf-8', index = False)