In [1]:
# Setup cell: imports, NLTK resource downloads, and loading of the persisted
# artifacts that the functions defined below read as module-level globals.
import re
from sklearn.ensemble import RandomForestClassifier
import joblib
import pandas as pd
import nltk
# Fetch the NLTK resources; each call prints a status line and reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): `stop_words` is not referenced by any function visible in this
# notebook; predict() passes `lst_stopwords` (loaded below) instead.
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Passed to CountVectorizer(max_features=...) inside predict().
max_features = 6000
# SECURITY: joblib.load unpickles its input, which can execute arbitrary code.
# Only load artifact files that come from a trusted source.
# Stop-word collection handed to utils_preprocess_text() by predict().
lst_stopwords = joblib.load('stopwords.joblib')
# Fixed vocabulary handed to CountVectorizer(vocabulary=...) by predict().
accepted_words = joblib.load('accepted_words.joblib')
# Lookup indexed by ICMS code; predict() uses it to fetch the description.
icms_dct = joblib.load('icms1_dictionnary.joblib') 
# Lookup indexed by the classifier's predicted label; yields the ICMS code.
Cat = joblib.load('Categories_ML.joblib')
# Trained classifier; predict() calls clf.predict() on the count matrix.
clf = joblib.load('Random_Forests_trained_model.joblib.gz');

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise free text into a single space-separated string of tokens.

    Processing order:
      1. cast to ``str``, lowercase, strip, and delete every character that is
         neither a word character nor whitespace;
      2. delete words of one or two characters;
      3. split on whitespace;
      4. optionally drop tokens found in ``lst_stopwords``;
      5. optionally Porter-stem each token (``flg_stemm``);
      6. optionally WordNet-lemmatise each token (``flg_lemm``).

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        flg_stemm: When equal to ``True``, apply the Porter stemmer.
        flg_lemm: When equal to ``True``, apply the WordNet lemmatiser.
        lst_stopwords: Container of tokens to remove, or ``None`` to keep all.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Case-fold and trim, then strip punctuation and other symbols.
    normalised = str(text).lower().strip()
    normalised = re.sub(r'[^\w\s]', '', normalised)
    # Remove one- and two-character words together with the non-word run
    # that precedes them.
    normalised = re.sub(r'\W*\b\w{1,2}\b', '', normalised)

    tokens = normalised.split()

    if lst_stopwords is not None:
        tokens = [tok for tok in tokens if tok not in lst_stopwords]

    # Stemming (strips suffixes such as -ing, -ly, ...).
    if flg_stemm == True:
        stemmer = nltk.stem.porter.PorterStemmer()
        tokens = list(map(stemmer.stem, tokens))

    # Lemmatisation (maps each word to its root form).
    if flg_lemm == True:
        lemmatizer = WordNetLemmatizer()
        tokens = list(map(lemmatizer.lemmatize, tokens))

    return " ".join(tokens)

def clean_text_data(df,col):
    """Clean the text column ``col`` of ``df`` and return it as a new Series.

    For every value the following steps run in order:
      1. split on ``"|"``; when there is more than one segment keep the
         *second* segment only (later segments are discarded), otherwise keep
         the whole value;
      2. delete digits and any word containing a digit;
      3. delete every character that is neither a word character nor
         whitespace;
      4. collapse runs of spaces into a single space (leading/trailing single
         spaces are left in place).

    Args:
        df: DataFrame holding the text.
        col: Name of the string column to clean.

    Returns:
        A Series of cleaned strings with the same index as ``df[col]``.
    """
    def _clean_segments(segments):
        # Keep the text after the first "|" when there is one.
        kept = segments[1] if len(segments) > 1 else segments[0]
        # Raw-string patterns: "\w" and "\d" in a plain string literal are
        # invalid escape sequences and warn on recent Python versions.
        kept = re.sub(r"\w*\d\w*", "", kept)
        kept = re.sub(r"[^\w\s]", "", kept)
        return re.sub(r" +", " ", kept)

    # Split with the pandas string accessor, then clean each row in one pass.
    return df[col].str.split("|").apply(_clean_segments)

def goodCode(name,code,desc):
    """Build the result record for one classified item.

    Args:
        name: Original text of the item; stored under ``"Message"``.
        code: Dot-separated code string; its first three parts are stored
            under ``"R2"``, ``"R3"`` and ``"R4"``.
        desc: Backslash-separated description; its first three parts, each
            stripped of surrounding whitespace, are stored under ``"Desc2"``,
            ``"Desc3"`` and ``"Desc4"``.

    Returns:
        A dict with the keys ``success``, ``Message``, ``ICMS``,
        ``Description``, ``R2``-``R4`` and ``Desc2``-``Desc4``.

    Raises:
        IndexError: If ``code`` or ``desc`` has fewer than three parts.
    """
    code_parts = code.split('.')
    desc_parts = [segment.strip() for segment in desc.split('\\')]

    record = {'success' : 'true' , "Message" : name ,'ICMS' : code ,"Description" : desc}

    # Explicit indexing (not unpacking) so that extra parts are ignored and
    # missing parts raise IndexError.
    level_2, level_3, level_4 = code_parts[0], code_parts[1], code_parts[2]
    record['R2'] = level_2
    record['R3'] = level_3
    record['R4'] = level_4

    label_2, label_3, label_4 = desc_parts[0], desc_parts[1], desc_parts[2]
    record['Desc2'] = label_2
    record['Desc3'] = label_3
    record['Desc4'] = label_4
    return record

def predict(names):
    """Predict an ICMS code and description for one or more text items.

    Each item is preprocessed with ``utils_preprocess_text`` (lemmatisation
    on, stemming off, using the module-level ``lst_stopwords``), cleaned with
    ``clean_text_data``, turned into word counts over the fixed
    ``accepted_words`` vocabulary, and classified by ``clf``. The predicted
    label is mapped to a code through ``Cat`` and the code to a description
    through ``icms_dct``.

    Args:
        names: A single string, or any iterable of items to classify.

    Returns:
        A list with one ``goodCode`` dict per input item, in input order.
        An empty input yields an empty list.
    """
    # A lone string (including str subclasses) is one item, not an iterable
    # of characters.
    if isinstance(names, str):
        names = [names]
    # Materialise once so generators, tuples and Series are all handled the
    # same way and can be paired with the predictions by position.
    names = list(names)
    if not names:
        # Nothing to classify; avoid handing zero samples to the classifier.
        return []

    frame = pd.DataFrame({'comment_list': names})
    frame['comment_list'] = frame['comment_list'].apply(
        lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
    frame['comment_list_new'] = clean_text_data(frame, 'comment_list')

    # The vocabulary is fixed, so fit_transform() only counts occurrences of
    # the accepted words. NOTE(review): per the scikit-learn documentation
    # `max_features` is ignored when `vocabulary` is supplied - confirm and
    # drop the argument if so.
    vectorizer = CountVectorizer(max_features=max_features,strip_accents='ascii',vocabulary=accepted_words)
    X = vectorizer.fit_transform(frame['comment_list_new']).toarray()

    y_pred = clf.predict(X)
    codes = [Cat[x] for x in y_pred]
    descs = [icms_dct[c] for c in codes]
    # "Message" carries the caller's original item, not the cleaned text.
    return [goodCode(item, code, desc) for item, code, desc in zip(names, codes, descs)]

[nltk_data] Downloading package punkt to /Users/ji-deza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ji-deza/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ji-deza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ji-deza/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ji-
[nltk_data]     deza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [2]:
# Smoke test: classify a single description. predict() wraps a lone string in
# a list, so element [0] is the result dict for this one item.
predict('Clear site of all signs and equipment (including diversion routes)')[0]


{'success': 'true',
 'Message': 'Clear site of all signs and equipment (including diversion routes)',
 'ICMS': '1.08.060',
 'Description': "Capital Construction Costs \\Preliminaries | Constructor's site overheads | general requirements\\ Other temporary facilities and services",
 'R2': '1',
 'R3': '08',
 'R4': '060',
 'Desc2': 'Capital Construction Costs',
 'Desc3': "Preliminaries | Constructor's site overheads | general requirements",
 'Desc4': 'Other temporary facilities and services'}

In [5]:
# Run the classifier over every row of the sample file, using its
# `Description` column as the input text.
test = pd.read_csv('renewals_sample1.csv')
A = predict(test.Description.to_list())
# Attach the predicted code and its three description levels to each row.
test['ICMS_ML']=[x['ICMS'] for x in A]
test['A']=[x['Desc2'] for x in A]
test['B']=[x['Desc3'] for x in A]
test['C']=[x['Desc4'] for x in A]

# Fraction of rows whose existing `ICMS` value differs from the prediction.
# NOTE(review): presumably `ICMS` is the reference label in the CSV - confirm.
test[test['ICMS'] != test['ICMS_ML']].shape[0] / test.shape[0]


0.27

In [7]:
# Write the current `test` frame to disk. The default to_csv() settings also
# write the index as the first column.
test.to_csv('discrepancy.csv')

In [None]:
# Fix an error in the icms dict
# One-off maintenance cell: rewrites the middle description segment of the
# "Preliminaries | Cons..." entries and saves the dictionary back in place.
import joblib
import pandas as pd
# SECURITY: joblib.load unpickles its input; only load trusted files.
tmp = pd.DataFrame({'Description': joblib.load('icms1_dictionnary.joblib')})
# Keep a CSV snapshot of the descriptions as loaded, before any change.
tmp['Description'].to_csv('Desc.csv')
tmp[['A','B','C']] = tmp.Description.str.split('\\',expand=True)
# Raw-string pattern: "\|" in a plain string literal is an invalid escape.
# na=False keeps the mask boolean when an entry has no second segment.
needs_fix = tmp.B.str.contains(r"Preliminaries \| Cons", na=False)
tmp.loc[needs_fix, 'B'] = "Preliminaries | Constructor's site overheads | general requirements"
# Refuse to write NaN descriptions back into the dictionary.
assert tmp.loc[needs_fix, ['A', 'B', 'C']].notna().all().all(), "matched entry is missing a segment"
# Rebuild only the corrected rows; every other description stays exactly as
# loaded, so entries with missing segments are not turned into NaN.
tmp.loc[needs_fix, 'Description'] = tmp.A + '\\' + tmp.B + '\\' + tmp.C
joblib.dump(tmp['Description'].to_dict(),'icms1_dictionnary.joblib')