In [4]:
import pandas as pd
import pyodbc
import numpy as np

In [31]:
# Import data from SQL Server database
sql_conn = pyodbc.connect('DRIVER={SQL Server};\
                            SERVER=sql1944-fm1-in.amr.corp.intel.com,3181;\
                            DATABASE=GSMDW_TST;\
                            Trusted_Connection=yes;\
                            integrated security=true')

query= """select A.spn,A.line_net_prc_USD_amt,S.glbl_busns_org_prty_id as supl_id
from (
SELECT distinct [supl_id],[supl_mtrl_id] as spn,line_net_prc_USD_amt
FROM [gsmdw_tst].[JUSTIN].[CDM_OA]
UNION
SELECT distinct [curr_supl_id],[curr_supl_itm_nbr],[new_buy_prc_usd_amt]
FROM [gsmdw_tst].[JUSTIN].[CDM_WIINGS]
) A
left join [gsmdw_tst].[JUSTIN].[SuplHier] as S
ON (A.supl_id = S.busns_org_prty_id)
where A.supl_id is not null
and A.line_net_prc_USD_amt is not null
and A.spn is not null"""
df_System = pd.read_sql(query, sql_conn)
sql_conn.close()

In [32]:
# Import data from SQL Server database
sql_conn = pyodbc.connect('DRIVER={SQL Server};\
                            SERVER=azscia01.amr.corp.intel.com,1433;\
                            DATABASE=CDM;\
                            Trusted_Connection=yes;\
                            integrated security=true')

query= """SELECT C.[Supplier Id]
      ,T.[DocumentId]
      ,T.[TableNumber]
      ,T.[RowNumber]
      ,T.[ColumnNumber]
      ,T.[ColumnValue]
      ,T.[TableConfidence]
  FROM [CDM].[dbo].[tablesunpivoted] AS T
  JOIN [dbo].[draftpublishedcontracts] AS C
  ON C.[Document Id] = T.DocumentId
  WHERE C.[Doc Status] IN ('Published','Draft')"""
df_Contract = pd.read_sql(query, sql_conn)
sql_conn.close()

In [33]:
# Import data from SQL Server database
sql_conn = pyodbc.connect('DRIVER={SQL Server};\
                            SERVER=sql1944-fm1-in.amr.corp.intel.com,3181;\
                            DATABASE=GSMDW_TST;\
                            Trusted_Connection=yes;\
                            integrated security=true')

query= """select [busns_org_prty_id],[glbl_busns_org_prty_id]
from [gsmdw_tst].[JUSTIN].[SuplHier]"""
df_SuplHier = pd.read_sql(query, sql_conn)
sql_conn.close()

In [35]:
df_System['supl_id'] = df_System['supl_id'].astype(str)
df_SuplHier['busns_org_prty_id'] = df_SuplHier['busns_org_prty_id'].astype(str)

df_Contract = pd.merge(df_Contract,
                       df_SuplHier,
                       left_on=['Supplier Id'],
                       right_on=['busns_org_prty_id'],
                       how='inner')

In [36]:
df_System['supl_id'] = df_System['supl_id'].astype(str)
df_System['spn'] = df_System['spn'].str.strip()
df_Contract['busns_org_prty_id'] = df_Contract['busns_org_prty_id'].astype(str)
df_Contract['ColumnValue'] = df_Contract['ColumnValue'].str.strip()
df_Merge = pd.merge(df_System,
                    df_Contract,
                    left_on=['supl_id','spn'],
                    right_on=['busns_org_prty_id','ColumnValue'],
                    how='inner')

#df_Merge = pd.merge(df_System,
#                    df_Contract,
#                    left_on=['spn'],
#                    right_on=['ColumnValue'],
#                    how='inner')

In [11]:
df_ContractTable = pd.merge(df_Contract,
                            df_Merge,
                            left_on=['DocumentId','TableNumber','RowNumber'],
                            right_on=['DocumentId','TableNumber','RowNumber'],how='left')

In [12]:
def is_number(nbr):
    try:
        float(nbr)
        return True
    except ValueError:
        return False

def clean(unclean):
    unclean = unclean.replace('$', '')
    unclean = unclean.replace(',','')
    unclean = unclean.strip()
    if is_number(unclean):
        unclean = float(unclean)
    return unclean

In [13]:
df_ContractTable['Value_clean'] = df_ContractTable.ColumnValue_x.apply(lambda x : clean(x))

In [14]:
df_ContractTable['Label'] = np.where(df_ContractTable['line_net_prc_USD_amt'].astype(str) == df_ContractTable['Value_clean'].astype(str), 'Price', 'NotPrice')
df_training = df_ContractTable[['DocumentId','TableNumber','ColumnNumber_x','ColumnValue_x','Label']]
df_training.rename(columns={'ColumnNumber_x':'ColumnNumber','ColumnValue_x':'ColumnValue'}, inplace=True)
df_training

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,Label
0,Doc1000015244,0,0,BY:,NotPrice
1,Doc1000015244,0,1,BY:,NotPrice
2,Doc1000015244,0,0,NAME:,NotPrice
3,Doc1000015244,0,1,NAME:,NotPrice
4,Doc1000015244,0,0,TITLE:,NotPrice
5,Doc1000015244,0,1,TITLE:,NotPrice
6,Doc1000015244,1,0,Equipment\nModel,NotPrice
7,Doc1000015244,1,1,Material\nMaster\nNumber,NotPrice
8,Doc1000015244,1,2,Material\nMaster\nDescription,NotPrice
9,Doc1000015244,1,3,CEID,NotPrice


In [15]:
df_filtered = df_training[df_training['Label'] == 'Price']
df_grouped = df_filtered.groupby(['DocumentId','TableNumber','ColumnNumber'])['ColumnNumber'].count().reset_index(name="count")
df_grouped

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,count
0,Doc1200362136,31,0,1
1,Doc1200903348,30,0,1
2,Doc1201585322,28,0,1
3,Doc1603072761,0,2,6
4,Doc1603072761,1,3,2
5,Doc1603072761,3,2,3
6,Doc1603072761,4,2,7
7,Doc1627650987,0,0,1
8,Doc2044033097,2,7,1
9,Doc2044033097,2,8,24


In [16]:
df_grouped = df_grouped.loc[df_grouped.groupby(["DocumentId", "TableNumber"])["count"].idxmax()]
df_grouped['DocNumTableNum'] = df_grouped['DocumentId'].apply(str) + '/' + df_grouped['TableNumber'].apply(str)
df_grouped['DocNumTableNumColumnNumber'] = df_grouped['DocumentId'].apply(str) + '/' + df_grouped['TableNumber'].apply(str) + '/' + df_grouped['ColumnNumber'].apply(str)
df_grouped

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,count,DocNumTableNum,DocNumTableNumColumnNumber
0,Doc1200362136,31,0,1,Doc1200362136/31,Doc1200362136/31/0
1,Doc1200903348,30,0,1,Doc1200903348/30,Doc1200903348/30/0
2,Doc1201585322,28,0,1,Doc1201585322/28,Doc1201585322/28/0
3,Doc1603072761,0,2,6,Doc1603072761/0,Doc1603072761/0/2
4,Doc1603072761,1,3,2,Doc1603072761/1,Doc1603072761/1/3
5,Doc1603072761,3,2,3,Doc1603072761/3,Doc1603072761/3/2
6,Doc1603072761,4,2,7,Doc1603072761/4,Doc1603072761/4/2
7,Doc1627650987,0,0,1,Doc1627650987/0,Doc1627650987/0/0
9,Doc2044033097,2,8,24,Doc2044033097/2,Doc2044033097/2/8
10,Doc2090637515,10,2,1,Doc2090637515/10,Doc2090637515/10/2


In [17]:
df_t = df_training.groupby(['DocumentId','TableNumber','ColumnNumber']).ColumnValue.unique().reset_index()
df_t

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue
0,Doc1000015244,0,0,"[BY:, NAME:, TITLE:]"
1,Doc1000015244,0,1,"[BY:, NAME:, TITLE:]"
2,Doc1000015244,1,0,"[Equipment\nModel, Synergis MX, Notes]"
3,Doc1000015244,1,1,"[Material\nMaster\nNumber, TBD, - Ozone Genera..."
4,Doc1000015244,1,2,"[Material\nMaster\nDescription, TBD]"
5,Doc1000015244,1,3,"[CEID, ABV]"
6,Doc1000015244,1,4,"[P-\nSpec, 06P-\n1944]"
7,Doc1000015244,1,5,"[C-\nSPEC, 06C-\n1698]"
8,Doc1000015244,1,6,[Pricing\nStructureDis\ncount (capital\nspends...
9,Doc1000015244,1,7,"[Contract\nPrice\n(currency\nsymbol), $5,626,0..."


In [18]:
df_t['DocNumTableNum'] = df_t['DocumentId'] + '/' + df_t['TableNumber'].apply(str)
df_t['DocNumTableNumColumnNumber'] = df_t['DocumentId'] + '/' + df_t['TableNumber'].apply(str) + '/' + df_t['ColumnNumber'].apply(str)
df_t['Label'] = np.where(df_t['DocNumTableNumColumnNumber'].isin(df_grouped.DocNumTableNumColumnNumber.tolist()), 'Price', 'Other')
df_t['PriceInTable'] = np.where(df_t['DocNumTableNum'].isin(df_grouped.DocNumTableNum.tolist()), 'Yes','No')
df_t['Label'] = df_t.apply(lambda row: 'NotPrice' if (row['Label']=='Other')&(row['PriceInTable'] == 'Yes' ) else row['Label'],axis=1)
#df_t[df_t['Label']=='Other']
#df_t[(df_t['Label']=='Other')&(df_t['test']=='NotPrice')]
df_t

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable
0,Doc1000015244,0,0,"[BY:, NAME:, TITLE:]",Doc1000015244/0,Doc1000015244/0/0,Other,No
1,Doc1000015244,0,1,"[BY:, NAME:, TITLE:]",Doc1000015244/0,Doc1000015244/0/1,Other,No
2,Doc1000015244,1,0,"[Equipment\nModel, Synergis MX, Notes]",Doc1000015244/1,Doc1000015244/1/0,Other,No
3,Doc1000015244,1,1,"[Material\nMaster\nNumber, TBD, - Ozone Genera...",Doc1000015244/1,Doc1000015244/1/1,Other,No
4,Doc1000015244,1,2,"[Material\nMaster\nDescription, TBD]",Doc1000015244/1,Doc1000015244/1/2,Other,No
5,Doc1000015244,1,3,"[CEID, ABV]",Doc1000015244/1,Doc1000015244/1/3,Other,No
6,Doc1000015244,1,4,"[P-\nSpec, 06P-\n1944]",Doc1000015244/1,Doc1000015244/1/4,Other,No
7,Doc1000015244,1,5,"[C-\nSPEC, 06C-\n1698]",Doc1000015244/1,Doc1000015244/1/5,Other,No
8,Doc1000015244,1,6,[Pricing\nStructureDis\ncount (capital\nspends...,Doc1000015244/1,Doc1000015244/1/6,Other,No
9,Doc1000015244,1,7,"[Contract\nPrice\n(currency\nsymbol), $5,626,0...",Doc1000015244/1,Doc1000015244/1/7,Other,No


In [19]:
import re
import nltk
#nltk.download('stopwords')

def listToString(s):  
    str1 = " "  
    return str1.join(s)

def remove_numbers(wordtoken):
    return re.sub(r'\d',r'_',wordtoken)

def tokenize_string(strng):
    #if not strng:
    #    print('The text to be tokenized is a None type. Defaulting to blank string.')
    #    strng = ''
    return nltk.word_tokenize(strng)

punct= list('"\'()*`{|}~/')
def remove_punct(wordtokens):
    return [wordtoken for wordtoken in wordtokens if wordtoken not in punct]

df_t['Col_Tokens'] = df_t.ColumnValue.apply(listToString)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_numbers)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(tokenize_string)

In [20]:
stopwords= set(nltk.corpus.stopwords.words('english'))
def remove_stopwords(wordtokens):
    return [wordtoken for wordtoken in wordtokens if wordtoken not in stopwords]

# Porter Stem Tokens
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_tokens(wordtokens):
    return [ps.stem(word) for word in wordtokens]

df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_stopwords)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(remove_punct)
df_t['Col_Tokens'] = df_t.Col_Tokens.apply(stem_tokens)

In [21]:
df_tr = df_t[(df_t['Label']=='Price') | (df_t['Label']=='NotPrice')]
df_tr[(df_tr['Label']=='Price')]

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable,Col_Tokens
111461,Doc1200362136,31,0,"[https-browserify, iconv-lite, immediate, inde...",Doc1200362136/31,Doc1200362136/31/0,Price,Yes,"[https-browserifi, iconv-lit, immedi, indexof,..."
112048,Doc1200903348,30,0,"[http- parse r-js, https-browserify, iconv-lit...",Doc1200903348/30,Doc1200903348/30/0,Price,Yes,"[http-, pars, r-j, https-browserifi, iconv-lit..."
112638,Doc1201585322,28,0,"[http- parse r-js, https-browserify, iconv-lit...",Doc1201585322/28,Doc1201585322/28/0,Price,Yes,"[http-, pars, r-j, https-browserifi, iconv-lit..."
256462,Doc1603072761,0,2,"[, an* aio, ni'an, m>an, m*an, nimpn minx\nno ...",Doc1603072761/0,Doc1603072761/0/2,Price,Yes,"[an*, aio, ni'an, >, m*an, nimpn, minx, \unl, ..."
256470,Doc1603072761,1,3,"[, m*n Jim', I'D, nan, 20 tip - 'n*\n""7, 60 tj...",Doc1603072761/1,Doc1603072761/1/3,Price,Yes,"[m*n, jim, I, 'D, nan, __, tip, -, n*, '', _, ..."
256478,Doc1603072761,3,2,"[aionrnN, man, nan, mmpo niTHN\nno wo/, pina Q...",Doc1603072761/3,Doc1603072761/3/2,Price,Yes,"[aionrnn, man, nan, mmpo, nithn, wo/, pina, q'..."
256481,Doc1603072761,4,2,"[200 - p'uobg nmn\nneb, nmip -looono'b, D'pDpD...",Doc1603072761/4,Doc1603072761/4/2,Price,Yes,"[___, -, p'uobg, nmn, neb, nmip, -looono, b, d..."
259810,Doc1627650987,0,0,[nVan\nn wiAt-;\njvwrin\nDyw ■'\nnViso\n•' 's'...,Doc1627650987/0,Doc1627650987/0/0,Price,Yes,"[nvan, n, wiat-, ;, jvwrin, dyw, ■', nviso, •,..."
362924,Doc2044033097,2,8,"[New $/KG, $1,235, 11.39, 3,465.00, 1,255.50, ...",Doc2044033097/2,Doc2044033097/2/8,Price,Yes,"[new, $, /kg, $, _, ,, ___, __.__, _, ,, ___._..."
376564,Doc2090637515,10,2,"[2019-2020 Price, $ 1,596, $ 1,674, $ 1,500, $...",Doc2090637515/10,Doc2090637515/10/2,Price,Yes,"[____-____, price, $, _, ,, ___, $, _, ,, ___,..."


In [22]:
df_tr_final = df_t[df_t['DocNumTableNum'].isin(df_tr.DocNumTableNum.tolist())]
df_tr_final

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,DocNumTableNum,DocNumTableNumColumnNumber,Label,PriceInTable,Col_Tokens
111461,Doc1200362136,31,0,"[https-browserify, iconv-lite, immediate, inde...",Doc1200362136/31,Doc1200362136/31/0,Price,Yes,"[https-browserifi, iconv-lit, immedi, indexof,..."
111462,Doc1200362136,31,1,"[1.0.0, 0.4.19, 3.0.6, 0.0.1, 0.6.2, 7.0.1, 1....",Doc1200362136/31,Doc1200362136/31/1,NotPrice,Yes,"[_._._, _._.__, _._._, _._._, _._._, _._._, _...."
111463,Doc1200362136,31,2,"[MIT, (MIT OR Apache-2.0), (MIT OR GPL-3.0), (...",Doc1200362136/31,Doc1200362136/31/2,NotPrice,Yes,"[mit, mit, OR, apache-_._, mit, OR, gpl-_._, m..."
111464,Doc1200362136,31,3,[https://github.com/substack/https-browserify/...,Doc1200362136/31,Doc1200362136/31/3,NotPrice,Yes,"[http, :, //github.com/substack/https-browseri..."
112048,Doc1200903348,30,0,"[http- parse r-js, https-browserify, iconv-lit...",Doc1200903348/30,Doc1200903348/30/0,Price,Yes,"[http-, pars, r-j, https-browserifi, iconv-lit..."
112049,Doc1200903348,30,1,"[0.4.9, 1.0.0, 0.4.19, 3.0.6, 0.0.1, 0.6.2, 7....",Doc1200903348/30,Doc1200903348/30/1,NotPrice,Yes,"[_._._, _._._, _._.__, _._._, _._._, _._._, _...."
112050,Doc1200903348,30,2,"[MIT, (MIT OR Apache-2.0), (MIT OR GPL-3.0), (...",Doc1200903348/30,Doc1200903348/30/2,NotPrice,Yes,"[mit, mit, OR, apache-_._, mit, OR, gpl-_._, m..."
112051,Doc1200903348,30,3,[https://github.com/creationix/http-parser-is/...,Doc1200903348/30,Doc1200903348/30/3,NotPrice,Yes,"[http, :, //github.com/creationix/http-parser-..."
112638,Doc1201585322,28,0,"[http- parse r-js, https-browserify, iconv-lit...",Doc1201585322/28,Doc1201585322/28/0,Price,Yes,"[http-, pars, r-j, https-browserifi, iconv-lit..."
112639,Doc1201585322,28,1,"[0.4.9, 1.0.0, 0.4.19, 3.0.6, 0.0.1, 0.6.2, 7....",Doc1201585322/28,Doc1201585322/28/1,NotPrice,Yes,"[_._._, _._._, _._.__, _._._, _._._, _._._, _...."


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tr_final.loc[:, 'Col_Tokens'], df_tr_final.loc[:, 'Label'], test_size=0.3, random_state=1)

In [24]:
def dummy_analyzer(x):
    return x

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
rf_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100))
])

rf_pipeline.fit(X_train, y_train)
rf_predictions= rf_pipeline.predict(X_test)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_test, rf_predictions))
print(confusion_matrix(y_test, rf_predictions))

              precision    recall  f1-score   support

    NotPrice       0.96      1.00      0.98        22
       Price       1.00      0.50      0.67         2

   micro avg       0.96      0.96      0.96        24
   macro avg       0.98      0.75      0.82        24
weighted avg       0.96      0.96      0.95        24

[[22  0]
 [ 1  1]]


In [101]:
from sklearn.ensemble import ExtraTreesClassifier
xtra_pipeline= Pipeline([
    ('bow', CountVectorizer(analyzer=dummy_analyzer)),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=200))   
])

xtra_pipeline.fit(X_train, y_train)
xtra_predictions= xtra_pipeline.predict(X_test)

print(classification_report(y_test, xtra_predictions))
print(confusion_matrix(y_test, xtra_predictions))

              precision    recall  f1-score   support

    NotPrice       0.96      1.00      0.98        65
       Price       1.00      0.73      0.84        11

   micro avg       0.96      0.96      0.96        76
   macro avg       0.98      0.86      0.91        76
weighted avg       0.96      0.96      0.96        76

[[65  0]
 [ 3  8]]


In [90]:
df_training[df_training['Label']=='Price']

Unnamed: 0,DocumentId,TableNumber,ColumnNumber,ColumnValue,Label
1593976,Doc1200362136,31,0,,Price
1606437,Doc1200903348,30,0,,Price
1612577,Doc1201585322,28,0,,Price
3204847,Doc1603072761,0,2,,Price
3204875,Doc1603072761,0,2,,Price
3204882,Doc1603072761,0,2,,Price
3204889,Doc1603072761,0,2,,Price
3204896,Doc1603072761,0,2,,Price
3204903,Doc1603072761,0,2,,Price
3204937,Doc1603072761,1,3,,Price


In [11]:
df_Merge.to_csv(r'C:\Users\jrstrong\Desktop\temp\df_Merge.csv')

In [5]:
def preprocess(x):   
    df_ContractTable = pd.merge(df_Contract,
                            x,
                            left_on=['DocumentId','TableNumber','RowNumber'],
                            right_on=['DocumentId','TableNumber','RowNumber'],how='left')
    
    df_ContractTable.to_csv(r'C:\Users\jrstrong\Desktop\temp\final.csv', mode='a', header=False, index=False)

reader = pd.read_csv(r'C:\Users\jrstrong\Desktop\temp\df_Merge.csv', chunksize=1000)

for r in reader:
    preprocess(r) 

KeyboardInterrupt: 

In [37]:
df_Merge

Unnamed: 0,spn,line_net_prc_USD_amt,supl_id,Supplier Id,DocumentId,TableNumber,RowNumber,ColumnNumber,ColumnValue,TableConfidence,busns_org_prty_id,glbl_busns_org_prty_id
0,781743-01,8003.00,1000006922.0,1000006922.0,Doc2094002797,1,9,1,781743-01,99,1000006922.0,1.000007e+09
1,781743-01,8003.00,1000006922.0,1000006922.0,Doc2094002799,1,9,1,781743-01,99,1000006922.0,1.000007e+09
2,781743-01,8003.00,1000006922.0,1000006922.0,Doc2094002802,1,9,1,781743-01,99,1000006922.0,1.000007e+09
3,781743-01,8003.00,1000006922.0,1000006922.0,Doc2096128745,1,9,1,781743-01,99,1000006922.0,1.000007e+09
4,781743-01,8599.00,1000006922.0,1000006922.0,Doc2094002797,1,9,1,781743-01,99,1000006922.0,1.000007e+09
5,781743-01,8599.00,1000006922.0,1000006922.0,Doc2094002799,1,9,1,781743-01,99,1000006922.0,1.000007e+09
6,781743-01,8599.00,1000006922.0,1000006922.0,Doc2094002802,1,9,1,781743-01,99,1000006922.0,1.000007e+09
7,781743-01,8599.00,1000006922.0,1000006922.0,Doc2096128745,1,9,1,781743-01,99,1000006922.0,1.000007e+09
8,781743-01,8464.92,1000006922.0,1000006922.0,Doc2094002797,1,9,1,781743-01,99,1000006922.0,1.000007e+09
9,781743-01,8464.92,1000006922.0,1000006922.0,Doc2094002799,1,9,1,781743-01,99,1000006922.0,1.000007e+09
