In [1]:
import pandas as pd
import pyodbc
import numpy as np

In [2]:
# Import data from SQL Server database
sql_conn = pyodbc.connect('DRIVER={SQL Server};\
                            SERVER=sql1944-fm1-in.amr.corp.intel.com,3181;\
                            DATABASE=GSMDW_TST;\
                            Trusted_Connection=yes;\
                            integrated security=true')

query= """SELECT distinct itm_id,line_net_prc_USD_amt
FROM [gsmdw_tst].[JUSTIN].[CDM_OA]
UNION
SELECT distinct [itm_id],[new_buy_prc_usd_amt]
FROM [gsmdw_tst].[JUSTIN].[CDM_WIINGS]"""
df_System = pd.read_sql(query, sql_conn)
sql_conn.close()

In [3]:
# Import data from SQL Server database
sql_conn = pyodbc.connect('DRIVER={SQL Server};\
                            SERVER=azscia01.amr.corp.intel.com,1433;\
                            DATABASE=CDM;\
                            Trusted_Connection=yes;\
                            integrated security=true')

query= """SELECT T.[DocumentId]
      ,T.[TableNumber]
      ,T.[RowNumber]
      ,T.[ColumnNumber]
      ,T.[ColumnValue]
      ,T.[TableConfidence]
  FROM [CDM].[dbo].[tablesunpivoted] AS T
  JOIN [dbo].[draftpublishedcontracts] AS C
  ON C.[Document Id] = T.DocumentId
  WHERE C.[Doc Status] IN ('Published','Draft')"""
df_Contract = pd.read_sql(query, sql_conn)
sql_conn.close()

In [4]:
df_System['itm_id'] = df_System['itm_id'].astype(int).astype(str)
df_Merge = pd.merge(df_System,
                    df_Contract,
                    left_on='itm_id',
                    right_on='ColumnValue',
                    how='inner')

In [5]:
df_ContractTable = pd.merge(df_Contract,
                            df_Merge,
                            left_on=['DocumentId','TableNumber','RowNumber'],
                            right_on=['DocumentId','TableNumber','RowNumber'],how='left')

In [6]:
def is_number(nbr):
    try:
        float(nbr)
        return True
    except ValueError:
        return False

def clean(unclean):
    unclean = unclean.replace('$', '')
    unclean = unclean.replace(',','')
    unclean = unclean.strip()
    if is_number(unclean):
        unclean = float(unclean)
    return unclean

In [7]:
df_ContractTable['Value_clean'] = df_ContractTable.ColumnValue_x.apply(lambda x : clean(x))

In [8]:
df_ContractTable['Label'] = np.where(df_ContractTable['line_net_prc_USD_amt'].astype(str) == df_ContractTable['Value_clean'].astype(str), 'Price', 'NotPrice')
df_training = df_ContractTable[['DocumentId','TableNumber','ColumnNumber_x','ColumnValue_x','Label']]
df_training.rename(columns={'ColumnNumber_x':'ColumnNumber','ColumnValue_x':'ColumnValue'}, inplace=True)
df_training

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,Label
0,Doc1000015244,0,0,BY:,NotPrice
1,Doc1000015244,0,1,BY:,NotPrice
2,Doc1000015244,0,0,NAME:,NotPrice
3,Doc1000015244,0,1,NAME:,NotPrice
4,Doc1000015244,0,0,TITLE:,NotPrice
...,...,...,...,...,...
9864162,Doc999948248,16,3,,NotPrice
9864163,Doc999948248,16,0,Retention Mgt to Cert Records Mgr Upgrade,NotPrice
9864164,Doc999948248,16,1,457-100-452,NotPrice
9864165,Doc999948248,16,2,SEAT,NotPrice


In [9]:
df_filtered = df_training[df_training['Label'] == 'Price']
df_grouped = df_filtered.groupby(['DocumentId','TableNumber','ColumnNumber'])['ColumnNumber'].count().reset_index(name="count")
df_grouped

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,count
0,Doc1003512767,3,3,1
1,Doc1003512767,3,4,9
2,Doc1005740068,3,2,1
3,Doc1008326585,46,2,11
4,Doc1008326585,47,2,3
...,...,...,...,...
1870,Doc977436468,6,7,11
1871,Doc979301475,6,7,11
1872,Doc981223975,7,7,11
1873,Doc996552509,0,3,1


In [10]:
df_grouped = df_grouped.loc[df_grouped.groupby(["DocumentId", "TableNumber"])["count"].idxmax()]
df_grouped['DocNumTableNum'] = df_grouped['DocumentId'].apply(str) + '/' + df_grouped['TableNumber'].apply(str)
df_grouped['DocNumTableNumColumnNumber'] = df_grouped['DocumentId'].apply(str) + '/' + df_grouped['TableNumber'].apply(str) + '/' + df_grouped['ColumnNumber'].apply(str)
df_grouped

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,count,DocNumTableNum,DocNumTableNumColumnNumber
1,Doc1003512767,3,4,9,Doc1003512767/3,Doc1003512767/3/4
2,Doc1005740068,3,2,1,Doc1005740068/3,Doc1005740068/3/2
3,Doc1008326585,46,2,11,Doc1008326585/46,Doc1008326585/46/2
4,Doc1008326585,47,2,3,Doc1008326585/47,Doc1008326585/47/2
5,Doc1008326585,48,3,2,Doc1008326585/48,Doc1008326585/48/3
...,...,...,...,...,...,...
1866,Doc975084460,2,16,10,Doc975084460/2,Doc975084460/2/16
1870,Doc977436468,6,7,11,Doc977436468/6,Doc977436468/6/7
1871,Doc979301475,6,7,11,Doc979301475/6,Doc979301475/6/7
1872,Doc981223975,7,7,11,Doc981223975/7,Doc981223975/7/7


In [11]:
df_t = df_training.groupby(['DocumentId','TableNumber','ColumnNumber']).ColumnValue.unique().reset_index()
df_t

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue
0,Doc1000015244,0,0,"[BY:, NAME:, TITLE:]"
1,Doc1000015244,0,1,"[BY:, NAME:, TITLE:]"
2,Doc1000015244,1,0,"[Equipment\nModel, Synergis MX, Notes]"
3,Doc1000015244,1,1,"[Material\nMaster\nNumber, TBD, - Ozone Genera..."
4,Doc1000015244,1,2,"[Material\nMaster\nDescription, TBD]"
...,...,...,...,...
763270,Doc999948248,15,3,"[, Yes, Hardware Costs, yes]"
763271,Doc999948248,16,0,"[German Language Pack, Spanish Language Pack, ..."
763272,Doc999948248,16,1,"[456-101-188, 456-101-190, 456-101-189, 457-10..."
763273,Doc999948248,16,2,"[Per Discovered\nCustodian, Per Reviewed\nCust..."


In [12]:
df_t['DocNumTableNum'] = df_t['DocumentId'] + '/' + df_t['TableNumber'].apply(str)
df_t['DocNumTableNumColumnNumber'] = df_t['DocumentId'] + '/' + df_t['TableNumber'].apply(str) + '/' + df_t['ColumnNumber'].apply(str)
df_t['Label'] = np.where(df_t['DocNumTableNumColumnNumber'].isin(df_grouped.DocNumTableNumColumnNumber.tolist()), 'Price', 'Other')
df_t['PriceInTable'] = np.where(df_t['DocNumTableNum'].isin(df_grouped.DocNumTableNum.tolist()), 'Yes','No')
df_t['Label'] = df_t.apply(lambda row: 'NotPrice' if (row['Label']=='Other')&(row['PriceInTable'] == 'Yes' ) else row['Label'],axis=1)
#df_t[df_t['Label']=='Other']
#df_t[(df_t['Label']=='Other')&(df_t['test']=='NotPrice')]
df_t

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable
0,Doc1000015244,0,0,"[BY:, NAME:, TITLE:]",Doc1000015244/0,Doc1000015244/0/0,Other,No
1,Doc1000015244,0,1,"[BY:, NAME:, TITLE:]",Doc1000015244/0,Doc1000015244/0/1,Other,No
2,Doc1000015244,1,0,"[Equipment\nModel, Synergis MX, Notes]",Doc1000015244/1,Doc1000015244/1/0,Other,No
3,Doc1000015244,1,1,"[Material\nMaster\nNumber, TBD, - Ozone Genera...",Doc1000015244/1,Doc1000015244/1/1,Other,No
4,Doc1000015244,1,2,"[Material\nMaster\nDescription, TBD]",Doc1000015244/1,Doc1000015244/1/2,Other,No
...,...,...,...,...,...,...,...,...
763270,Doc999948248,15,3,"[, Yes, Hardware Costs, yes]",Doc999948248/15,Doc999948248/15/3,Other,No
763271,Doc999948248,16,0,"[German Language Pack, Spanish Language Pack, ...",Doc999948248/16,Doc999948248/16/0,Other,No
763272,Doc999948248,16,1,"[456-101-188, 456-101-190, 456-101-189, 457-10...",Doc999948248/16,Doc999948248/16/1,Other,No
763273,Doc999948248,16,2,"[Per Discovered\nCustodian, Per Reviewed\nCust...",Doc999948248/16,Doc999948248/16/2,Other,No


In [13]:
import re
import nltk
#nltk.download('stopwords')

def listToString(s):  
    str1 = " "  
    return str1.join(s)

def remove_numbers(wordtoken):
    return re.sub(r'\d',r'_',wordtoken)

def tokenize_string(strng):
    #if not strng:
    #    print('The text to be tokenized is a None type. Defaulting to blank string.')
    #    strng = ''
    return nltk.word_tokenize(strng)

punct= list('"\'()*`{|}~/')
def remove_punct(wordtokens):
    return [wordtoken for wordtoken in wordtokens if wordtoken not in punct]

df_t['Col_Tokens'] = df_t.ColumnValue.apply(listToString)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_numbers)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(tokenize_string)

In [14]:
stopwords= set(nltk.corpus.stopwords.words('english'))
def remove_stopwords(wordtokens):
    return [wordtoken for wordtoken in wordtokens if wordtoken not in stopwords]

# Porter Stem Tokens
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_tokens(wordtokens):
    return [ps.stem(word) for word in wordtokens]

df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_stopwords)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_punct)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(stem_tokens)

In [15]:
df_tr = df_t[(df_t['Label']=='Price') | (df_t['Label']=='NotPrice')]
df_tr[(df_tr['Label']=='Price')]

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable,Col_Tokens
1686,Doc1003512767,3,4,"[Price\n(US$/pc), 119.42, 114.42, 153.00, for ...",Doc1003512767/3,Doc1003512767/3/4,Price,Yes,"[price, US, $, /pc, ___.__, ___.__, ___.__, de..."
3423,Doc1005740068,3,2,"[Price, $31.00, $1.09, $1.07]",Doc1005740068/3,Doc1005740068/3/2,Price,Yes,"[price, $, __.__, $, _.__, $, _.__]"
4929,Doc1008326585,46,2,"[Price/Unit, $ 4,289.25, $ 1,500.00, Intel Dir...",Doc1008326585/46,Doc1008326585/46/2,Price,Yes,"[price/unit, $, _, ,, ___.__, $, _, ,, ___.__,..."
4934,Doc1008326585,47,2,"[Price/Unit, $ 6,943.05, Intel Direct Source, ...",Doc1008326585/47,Doc1008326585/47/2,Price,Yes,"[price/unit, $, _, ,, ___.__, intel, direct, s..."
4942,Doc1008326585,48,3,"[IPN, 6,943.05, 6,536.00, 500241007, 500282670...",Doc1008326585/48,Doc1008326585/48/3,Price,Yes,"[ipn, _, ,, ___.__, _, ,, ___.__, _________, _..."
...,...,...,...,...,...,...,...,...,...
749834,Doc975084460,2,16,"[PRICE\n$/UM\nChina to\nTMM, 10.20, 17.55, , 8...",Doc975084460/2,Doc975084460/2/16,Price,Yes,"[price, $, /um, china, tmm, __.__, __.__, __._..."
750940,Doc977436468,6,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc977436468/6,Doc977436468/6/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]"
751486,Doc979301475,6,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc979301475/6,Doc979301475/6/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]"
752181,Doc981223975,7,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc981223975/7,Doc981223975/7/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]"


In [16]:
df_tr_final = df_t[df_t['DocNumTableNum'].isin(df_tr.DocNumTableNum.tolist())]
df_tr_final

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable,Col_Tokens
1682,Doc1003512767,3,0,"[Product, RHB66P, RHB66P-MA, RHB66P-MAH, IND22...",Doc1003512767/3,Doc1003512767/3/0,NotPrice,Yes,"[product, rhb__p, rhb__p-ma, rhb__p-mah, ind__..."
1683,Doc1003512767,3,1,"[Size\n(mm), cp762, tp762, (p762, (p742, cp742...",Doc1003512767/3,Doc1003512767/3/1,NotPrice,Yes,"[size, mm, cp___, tp___, p___, p___, cp___, cp..."
1684,Doc1003512767,3,2,"[IPN, 500181753, 500181869, 500181870, 5001818...",Doc1003512767/3,Doc1003512767/3/2,NotPrice,Yes,"[ipn, _________, _________, _________, _______..."
1685,Doc1003512767,3,3,"[Qty, , pc, 145.00, MOQ: 20 pcs, 20 pcs - 50 p...",Doc1003512767/3,Doc1003512767/3/3,NotPrice,Yes,"[qti, pc, ___.__, moq, :, __, pc, __, pc, -, _..."
1686,Doc1003512767,3,4,"[Price\n(US$/pc), 119.42, 114.42, 153.00, for ...",Doc1003512767/3,Doc1003512767/3/4,Price,Yes,"[price, US, $, /pc, ___.__, ___.__, ___.__, de..."
...,...,...,...,...,...,...,...,...,...
762084,Doc996552509,0,3,"[Qty, , pc, 145.00, MOQ: 20 pcs, 20 pcs - 50 p...",Doc996552509/0,Doc996552509/0/3,NotPrice,Yes,"[qti, pc, ___.__, moq, :, __, pc, __, pc, -, _..."
762085,Doc996552509,0,4,"[Price\n(USS/pc), 119.42, 114.42, 153.00, for ...",Doc996552509/0,Doc996552509/0/4,Price,Yes,"[price, uss/pc, ___.__, ___.__, ___.__, delive..."
762086,Doc996552509,0,5,"[Incoterms, , FOB Marubeni's\nwarehouse nearby...",Doc996552509/0,Doc996552509/0/5,NotPrice,Yes,"[incoterm, fob, marubeni', warehous, nearbi, s..."
762087,Doc996552509,0,6,"[Pad Shelf Life, , 9 months from\nmanufacturin...",Doc996552509/0,Doc996552509/0/6,NotPrice,Yes,"[pad, shelf, life, _, month, manufactur, date,..."


In [17]:
#df_t[(df_t['DocNum']=='Doc971286958') & (df_t['TableNum']=='1')]

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tr_final.loc[:, 'Col_Tokens'], df_tr_final.loc[:, 'Label'], test_size=0.3, random_state=1)

## Random Forest

In [19]:
def dummy_analyzer(x):
    return x

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
rf_pipeline = Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100))
])

In [20]:
rf_pipeline.fit(X_train, y_train)
rf_predictions= rf_pipeline.predict(X_test)

In [21]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_test, rf_predictions))
print(confusion_matrix(y_test, rf_predictions))

              precision    recall  f1-score   support

    NotPrice       0.99      0.99      0.99      3068
       Price       0.94      0.88      0.91       385

    accuracy                           0.98      3453
   macro avg       0.96      0.94      0.95      3453
weighted avg       0.98      0.98      0.98      3453

[[3045   23]
 [  45  340]]


## Extra Trees

In [22]:
from sklearn.ensemble import ExtraTreesClassifier
xtra_pipeline= Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=200))   
])

In [23]:
xtra_pipeline.fit(X_train, y_train)
xtra_predictions= xtra_pipeline.predict(X_test)

In [24]:
print(classification_report(y_test, xtra_predictions))
print(confusion_matrix(y_test, xtra_predictions))

              precision    recall  f1-score   support

    NotPrice       0.99      0.99      0.99      3068
       Price       0.93      0.90      0.92       385

    accuracy                           0.98      3453
   macro avg       0.96      0.94      0.95      3453
weighted avg       0.98      0.98      0.98      3453

[[3044   24]
 [  40  345]]


## SVC

In [25]:
from sklearn.svm import SVC
svc_pipeline= Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('BaggingClassifier', SVC(gamma='scale'))
])

In [26]:
svc_pipeline.fit(X_train, y_train)
svc_predictions= svc_pipeline.predict(X_test)

In [27]:
print(classification_report(y_test, svc_predictions))
print(confusion_matrix(y_test, svc_predictions))

              precision    recall  f1-score   support

    NotPrice       0.97      0.98      0.97      3068
       Price       0.80      0.78      0.79       385

    accuracy                           0.95      3453
   macro avg       0.89      0.88      0.88      3453
weighted avg       0.95      0.95      0.95      3453

[[2995   73]
 [  85  300]]


## Bagging Classifier

In [33]:
from sklearn.ensemble import BaggingClassifier
bag_pipeline= Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('BaggingClassifier', BaggingClassifier(n_estimators=100))
])

In [34]:
bag_pipeline.fit(X_train, y_train)
bag_predictions= svc_pipeline.predict(X_test)

In [35]:
print(classification_report(y_test, bag_predictions))
print(confusion_matrix(y_test, bag_predictions))

              precision    recall  f1-score   support

    NotPrice       0.95      0.99      0.97      3047
       Price       0.86      0.54      0.67       368

   micro avg       0.94      0.94      0.94      3415
   macro avg       0.90      0.77      0.82      3415
weighted avg       0.94      0.94      0.94      3415

[[3014   33]
 [ 168  200]]


## Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

lr_pipeline= Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('LogisticRegression', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000))
])

In [29]:
lr_pipeline.fit(X_train, y_train)
lr_predictions= lr_pipeline.predict(X_test)

In [30]:
print(classification_report(y_test, lr_predictions))
print(confusion_matrix(y_test, lr_predictions))

              precision    recall  f1-score   support

    NotPrice       0.96      0.97      0.96      3068
       Price       0.75      0.64      0.69       385

    accuracy                           0.94      3453
   macro avg       0.85      0.81      0.83      3453
weighted avg       0.93      0.94      0.93      3453

[[2985   83]
 [ 138  247]]


## Naive Bayes

In [39]:
from sklearn.naive_bayes import MultinomialNB
nb_pipeline= Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('MultinomialNB', MultinomialNB(alpha=1))
])

In [40]:
nb_pipeline.fit(X_train, y_train)
nb_predictions= nb_pipeline.predict(X_test)

In [41]:
print(classification_report(y_test, nb_predictions))
print(confusion_matrix(y_test, nb_predictions))

              precision    recall  f1-score   support

    NotPrice       0.99      0.86      0.92      3047
       Price       0.45      0.94      0.61       368

   micro avg       0.87      0.87      0.87      3415
   macro avg       0.72      0.90      0.76      3415
weighted avg       0.93      0.87      0.89      3415

[[2619  428]
 [  22  346]]


## ADA Boost

In [42]:
from sklearn.ensemble import AdaBoostClassifier
ada_pipeline= Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('AdaBoostClassifier', AdaBoostClassifier(n_estimators=100))
])

In [43]:
ada_pipeline.fit(X_train, y_train)
ada_predictions= ada_pipeline.predict(X_test)

In [44]:
print(classification_report(y_test, ada_predictions))
print(confusion_matrix(y_test, ada_predictions))

              precision    recall  f1-score   support

    NotPrice       0.97      0.98      0.97      3047
       Price       0.80      0.73      0.77       368

   micro avg       0.95      0.95      0.95      3415
   macro avg       0.89      0.86      0.87      3415
weighted avg       0.95      0.95      0.95      3415

[[2981   66]
 [  98  270]]


## Linear Support Vector Machine

In [45]:
from sklearn.svm import LinearSVC

In [46]:
lsvc_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('LinearSVC', LinearSVC(random_state=0, max_iter=10000))
])

In [47]:
lsvc_pipeline.fit(X_train, y_train)
lsvc_predictions= ada_pipeline.predict(X_test)



In [48]:
print(classification_report(y_test, lsvc_predictions))
print(confusion_matrix(y_test, lsvc_predictions))

              precision    recall  f1-score   support

    NotPrice       0.97      0.98      0.97      3047
       Price       0.80      0.73      0.77       368

   micro avg       0.95      0.95      0.95      3415
   macro avg       0.89      0.86      0.87      3415
weighted avg       0.95      0.95      0.95      3415

[[2981   66]
 [  98  270]]


## Light GBM

In [49]:
from xgboost import XGBClassifier

In [50]:
xgb_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('XGBClassifier', XGBClassifier())
    ])

In [51]:
xgb_pipeline.fit(X_train, y_train)
xgb_predictions= xgb_pipeline.predict(X_test)

In [53]:
print(classification_report(y_test, xgb_predictions))
print(confusion_matrix(y_test, xgb_predictions))

              precision    recall  f1-score   support

    NotPrice       0.97      0.98      0.98      3047
       Price       0.85      0.73      0.79       368

   micro avg       0.96      0.96      0.96      3415
   macro avg       0.91      0.86      0.88      3415
weighted avg       0.96      0.96      0.96      3415

[[3000   47]
 [  98  270]]


## Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
dt_pipeline= Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=0))
])

In [32]:
dt_pipeline.fit(X_train, y_train)
dt_predictions= dt_pipeline.predict(X_test)

In [33]:
print(classification_report(y_test, dt_predictions))
print(confusion_matrix(y_test, dt_predictions))

              precision    recall  f1-score   support

    NotPrice       0.98      0.99      0.98      3068
       Price       0.89      0.85      0.87       385

    accuracy                           0.97      3453
   macro avg       0.94      0.92      0.93      3453
weighted avg       0.97      0.97      0.97      3453

[[3029   39]
 [  58  327]]


In [34]:
df_t['Prediction'] = xtra_pipeline.predict(df_t['Col_Tokens'])
df_t['Probability'] = np.amax(xtra_pipeline.predict_proba(df_t['Col_Tokens']),1)

In [35]:
df_t[df_t['Label']=='Price']
#df_t[((df_t['Prediction']=='Price')&(df_t['Label']=='Other'))]

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable,Col_Tokens,Prediction,Probability
1686,Doc1003512767,3,4,"[Price\n(US$/pc), 119.42, 114.42, 153.00, for ...",Doc1003512767/3,Doc1003512767/3/4,Price,Yes,"[price, US, $, /pc, ___.__, ___.__, ___.__, de...",Price,0.74625
3423,Doc1005740068,3,2,"[Price, $31.00, $1.09, $1.07]",Doc1005740068/3,Doc1005740068/3/2,Price,Yes,"[price, $, __.__, $, _.__, $, _.__]",Price,1.00000
4929,Doc1008326585,46,2,"[Price/Unit, $ 4,289.25, $ 1,500.00, Intel Dir...",Doc1008326585/46,Doc1008326585/46/2,Price,Yes,"[price/unit, $, _, ,, ___.__, $, _, ,, ___.__,...",Price,1.00000
4934,Doc1008326585,47,2,"[Price/Unit, $ 6,943.05, Intel Direct Source, ...",Doc1008326585/47,Doc1008326585/47/2,Price,Yes,"[price/unit, $, _, ,, ___.__, intel, direct, s...",Price,1.00000
4942,Doc1008326585,48,3,"[IPN, 6,943.05, 6,536.00, 500241007, 500282670...",Doc1008326585/48,Doc1008326585/48/3,Price,Yes,"[ipn, _, ,, ___.__, _, ,, ___.__, _________, _...",Price,1.00000
...,...,...,...,...,...,...,...,...,...,...,...
749834,Doc975084460,2,16,"[PRICE\n$/UM\nChina to\nTMM, 10.20, 17.55, , 8...",Doc975084460/2,Doc975084460/2/16,Price,Yes,"[price, $, /um, china, tmm, __.__, __.__, __._...",Price,1.00000
750940,Doc977436468,6,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc977436468/6,Doc977436468/6/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]",Price,1.00000
751486,Doc979301475,6,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc979301475/6,Doc979301475/6/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]",Price,1.00000
752181,Doc981223975,7,7,"[Unit Price (US$), $ 2.250, $\n2.250]",Doc981223975/7,Doc981223975/7/7,Price,Yes,"[unit, price, US, $, $, _.___, $, _.___]",Price,1.00000


In [327]:
df_t.to_csv(r'C:\Users\jrstrong\Desktop\temp\PriceColumns.csv', index=False)

In [37]:
df_t[['DocumentId','TableNumber','ColumnNumber','Label','Prediction','Probability']].to_csv(r'C:\Users\jrstrong\Desktop\temp\PriceColumns_Short.csv', index=False)

In [31]:
# https://towardsdatascience.com/tpot-automated-machine-learning-in-python-4c063b3e5de9
# https://epistasislab.github.io/tpot/api/
import tpot

# scoring='f1_weighted'

In [None]:
def dummy_analyzer(x):
    return x

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
tpot_pipeline = Pipeline([
    ('bow', TfidfVectorizer(analyzer=dummy_analyzer)),
    ('TPOTClassifier', tpot.TPOTClassifier(config_dict='TPOT sparse',
                                           scoring='f1_weighted',
                                           periodic_checkpoint_folder=r'C:\Users\jrstrong\Desktop\temp\TPOTProgress.txt',
                                           n_jobs=-1,
                                           verbosity=2,
                                           generations=10, # 100 is default
                                           population_size=10 # 100 is default
                                          ))
])

tpot_pipeline.fit(X_train, y_train)
tpot_predictions= tpot_pipeline.predict(X_test)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_test, tpot_predictions))
print(confusion_matrix(y_test, tpot_predictions))



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=110.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.9689731449032767
Generation 2 - Current best internal CV score: 0.9689731449032767
Generation 3 - Current best internal CV score: 0.9689731449032767
Generation 4 - Current best internal CV score: 0.9710885255724057
Generation 5 - Current best internal CV score: 0.9710885255724057
