In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.10.2-py3-none-any.whl (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.5/47.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20221105 (from pdfplumber)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.20.0-py3-none-manylinux_2_17_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20221105 pdfplumber-0.10.2 pypdfium2-4.20.0


In [None]:
import pdfplumber

In [None]:
import os

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Concatenate, Flatten
from sklearn.model_selection import train_test_split

In [None]:
# Google Drive folder holding the invoice PDFs used as training input.
pdf_folder = '/content/drive/MyDrive/invoices'

In [None]:
def clean_text(text):
  """Turn every line break into a single space and trim the ends.

  Args:
    text: the string to normalise.

  Returns:
    The text on one line, with leading/trailing whitespace removed.
  """
  single_line = " ".join(text.split("\n"))
  return single_line.strip()

In [None]:
def extract_text(pdf_path):
  """Extract the text of an invoice PDF up to (and including) its "terms" page.

  Pages are read in order. The first page whose text contains the word
  "terms" (case-insensitive) is still included, but nothing after it is read.

  Args:
    pdf_path: path of the PDF file to open with pdfplumber.

  Returns:
    The collected page texts as one cleaned string (line breaks replaced by
    spaces, surrounding whitespace stripped -- see clean_text).
  """
  page_texts = []
  with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
      # Guard against a page with no extractable text: some pdfplumber
      # versions are reported to return None here instead of "".
      page_text = page.extract_text() or ""
      page_texts.append(page_text)
      if "terms" in page_text.lower():
        break
  # Join pages with a line break so the last word of one page does not fuse
  # with the first word of the next; clean_text turns the break into a space.
  # The cleaned text is returned (previously it was computed and discarded).
  return clean_text("\n".join(page_texts))

In [None]:
text_extracted = []
# os.listdir() returns entries in arbitrary order. The extracted texts are later
# paired by position with the entries of structured_text, so iterate in sorted
# order to make that pairing reproducible between runs.
# NOTE(review): confirm the sorted file order matches the order of structured_text.
for filename in sorted(os.listdir(pdf_folder)):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)
        text_ex = extract_text(pdf_path)
        text_extracted.append(text_ex)

In [None]:
# Display the extracted invoice texts (one string per PDF) for a visual check.
text_extracted

['TAX INVOICE\nOriginal for Recipient\nSAP India Private Limited\n50, Mohebewala Industrial Area Saharanpur\nDEHRADUN 248002\nUttarakhand INDIA\nSAVIC Technologies Private Limited\nCIN : U72200KA1996PTC020063\n803, B-Wing, 8th Floor, Great Eastern Summit,\nPAN: AACCS7483E\nPlot No. 66, Sector-15, C.B.D.Belapur\nGSTIN : 05AACCS7483E1ZB\nNAVI MUMBAI 400614\nMaharashtra INDIA\nPlace of Supply: Maharashtra\nSAVIC Technologies Private Limited\n803, B-Wing, 8th Floor, Great Eastern Summit,\nNavi Mumbai400614\nMaharashtra India\nIRN No: 5a509f5fb0714793da6ec6f4518b9d692bcbc17b8da2943a9480458247d1e3e7\nInvoice No. 6071278484 of 31.12.2022 5,619,047.62 INR\nCustomer GSTIN: 27AAWCS9531R1Z1\nCustomer PAN:\nReverse Charge: Yes / No\nPayment Mode: Bank\nContract Number: 6265416\nOrder No.: 1000155387\nCMS Case ID 3062264234\nLicensee : 996343, A.V.THOMAS & CO. LTD., , Cochin, India\nIn accordance with the above order, we invoice you for SAP Standard software\nItem Description Qty Amount\n__________

In [None]:
# Target strings for training, one per invoice. Each follows the layout
# "Invoice :<number> Invoice Date:<date> Bill-To: <recipient/address> Total:<amount> ".
# They are tokenized into `y` and paired by position with the texts in
# `text_extracted`, so the order here must match the extraction order.
# NOTE(review): presumably written by hand from the PDFs -- confirm the ordering.
structured_text = [
    "Invoice :C240397173 Invoice Date:15.08.2023 Bill-To: SAVIC TECHNOLOGIES PRIVATE LIMITED PLOT NO 66, OFFICE NO B-803-806, GREAT EASTERN SUMMIT-B, SECTOR-15, CBD BELAPUR, NAVI MUMBAI MUMBAI-400614 Maharashtra Total:1,070.64 ",
    "Invoice :FY2324EI15880 Invoice Date:31/07/2023 Bill-To: M/S Savic Technologies Private Limited Attn: venu r B-803, The great eastern summit Sec-15 Navi Mumbai, Maharashtra, 400614 India Total:40,750.12 ",
    "Invoice :6071278484 Invoice Date:31.12.2022 Bill-To: SAVIC Technologies Private Limited 803, B-Wing, 8th Floor, Great Eastern Summit, Plot No. 66, Sector-15, C.B.D.Belapur NAVI MUMBAI 400614 Maharashtra INDIA Total:5,619,047.62 ",
    "Invoice :WS2310 Invoice Date:10.07.2023 Bill-To: SAVIC Technologies Pvt Ltd Address: 707, B-wing, 7th floor, Great Eastern  Summit, Plot No. 66, Sector 15, CBD Belapur, Navi  Mumbai – 400614  PAN: AAWCS9531R  GST No: 27AAWCS9531R1Z1 Total:1,88,800.00 ",
]

In [None]:
# Inputs and targets are paired by position, so their counts must agree.
# Checking here gives a clear message instead of a shape error inside fit().
assert len(text_extracted) == len(structured_text), (
    f"{len(text_extracted)} extracted invoices but "
    f"{len(structured_text)} target strings"
)

# One vocabulary shared by the raw invoice text and the structured targets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_extracted + structured_text)

# Convert both sides to sequences of integer token ids.
text_sequences = tokenizer.texts_to_sequences(text_extracted)
structured_sequences = tokenizer.texts_to_sequences(structured_text)

# Pad each side to the length of its own longest sequence.
X = pad_sequences(text_sequences)
y = pad_sequences(structured_sequences)

In [None]:
# Input: one padded sequence of token ids per invoice.
input_layer = Input(shape=(X.shape[1],))
# Vocabulary size is len(word_index) + 1 because index 0 is reserved for padding.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_layer)
# Bidirectional LSTM keeps the output of every time step, which is then flattened.
bi_lstm_output = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
flattened_bi_lstm_output = Flatten()(bi_lstm_output)
# The output width has to equal the padded target length, so take it from `y`
# rather than hard-coding it; the model then keeps working when the targets change.
output_layer = Dense(y.shape[1], activation='softmax')(flattened_bi_lstm_output)
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
# NOTE(review): `y` holds padded token ids rather than one-hot vectors --
# confirm this loss is the intended one for those targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Keep the returned History object so the per-epoch loss can be inspected
# afterwards (history.history) instead of only printing the object's repr.
history = model.fit(X, y, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78b46f634eb0>

In [None]:
# Path of a single invoice PDF stored outside the training folder (`pdf_folder`).
# NOTE(review): presumably the held-out sample to run the trained model on.
pdf_path_test = "/content/drive/MyDrive/invoice test/Sample Invoice (5).pdf"

In [None]:
# Extract the text of the test invoice. This must read `pdf_path_test`:
# `pdf_path` is only the leftover loop variable from the training-data cell
# and points at whichever training PDF happened to be processed last.
new_text = []
text_ex = extract_text(pdf_path_test)
new_text.append(text_ex)

In [None]:
# Encode the new invoice text with the tokenizer fitted on the training data.
new_text_sequences = tokenizer.texts_to_sequences(new_text)
# Pad/truncate to the same sequence length the model's input layer was built with.
new_X = pad_sequences(new_text_sequences, maxlen=X.shape[1])
# One row of model outputs per input document.
predicted_structured_sequences = model.predict(new_X)



In [None]:
# Decode every predicted row back to text through the tokenizer's index -> word map.
# NOTE(review): the model ends in a softmax layer, so these rows are floats rather
# than integer token ids -- confirm the decoding produces what is expected.
predicted_structured_data = list(
    map(
        lambda seq: tokenizer.sequences_to_texts([seq])[0],
        predicted_structured_sequences,
    )
)

In [None]:
# Display the decoded prediction(s) for the new invoice.
predicted_structured_data

["TAX INVOICE\nORIGINAL FOR RECEIPENT\nESDS Software Solution Ltd.\nAddress: Plot No. B- 24 & 25, NICE Industrial Area, Satpur MIDC,\nNashik 422 007.\nState : Maharashtra, State Code : 27\nTax Invoice No: FY2324EI15880\nReference No: PO No. 3000045 Dt.29/10/2021\nTax Invoice Date: 31/07/2023\nDue Date: 05/08/2023\nInvoice Status: UNPAID\nProforma Invoice No: E123093\nUNPAID\nInvoiced To: Pay To:\nM/S Savic Technologies Private Limited Bank Name : Axis Bank Ltd.\nAttn: venu r A/C Name : ESDS Software Solution Ltd.\nB-803, The great eastern summit Bank A/C Number : 918030109668405\nSec-15 IFS Code : UTIB0001636\nNavi Mumbai, Maharashtra, 400614 (cid:13)(cid:10)SWIFT Code : AXISINBBA07\nIndia (cid:13)(cid:10)Bank Address : 214, 215 City Mall, 2nd Floor, Plot No 1,\n(cid:13)(cid:10)Ganesh Khind (University) Road, Pune 411007\nPAN : AABCE4981A GSTIN : 27AABCE4981A1ZV\nPAN : AAWCS9531R GSTIN : 27AAWCS9531R1Z1\nDescription Amount\nManaged Hosting Services : (06/06/2023 - 05/07/2023)(cid:13) R