In [1]:
import pandas as pd
import numpy as np
import os
import re
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

import docx

In [2]:
#SETUP
ALL_FILES = ["./data_sample/" + f for f in os.listdir("./data_sample/")]
CHUNK_SIZES = [1000] #[50, 100, 200, 500, 750, 1000]
CHUNK_OVERLAPS = [0]
DOCUMENT_PART_LEN_LIMIT = 500
FILE_READING = 'Text' #Text, Chapter, Paragraph

In [24]:
#READ FULL TEXT
def _clean_text(text):
    return re.sub(r'\n+', '\n', text).strip()

def process_paragraph(paragraph):
    return _clean_text(paragraph.text)

def get_full_text_from_paragraphs(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(process_paragraph(para))
    return '\n'.join(fullText)

#IN PROGRESS - the different schemas of the tables to be handled
def process_table_row_as_paragraphs_with_headers(table):
    df = [['' for i in range(len(table.columns))] for j in range(len(table.rows))]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            if cell.text:
                df[i][j] = _clean_text(cell.text)
    col_index=0
    t = pd.DataFrame(df[(col_index+1):], columns = df[col_index]).replace('', np.nan).dropna(how='all').replace(np.nan, '').reset_index(drop=True)
    x = ["; ".join([": ".join([name,desc]) for name, desc in row.items()]) for row in t.to_dict('records')]
    return x

def process_table_as_text(table):
    text = ""
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            if cell.text:
                text = text + "; " + _clean_text(cell.text)
    return text

def process_single_file_raw_text(filename):
    doc = docx.Document(filename)
    short_filename = filename.split("/")[-1].replace(".docx","")
    full_document = []
    for elem in doc.iter_inner_content():
        if isinstance(elem, docx.table.Table):
            full_document.append(process_table_as_text(elem))
        elif isinstance(elem, docx.text.paragraph.Paragraph):
            full_document.append(process_paragraph(elem))
    return (short_filename, "N/A", " ".join(full_document))

def process_single_file_chapters(filename):
    doc = docx.Document(filename)
    short_filename = filename.split("/")[-1].replace(".docx","")
    full_document = []
    current_chapter = []
    current_chapter_name = ""
    for elem in doc.iter_inner_content():
        if isinstance(elem, docx.table.Table):
            current_chapter.append(process_table_as_text(elem))
        elif isinstance(elem, docx.text.paragraph.Paragraph):
            paragraph_text = process_paragraph(elem)
            if len(paragraph_text) == 0:
                continue
            if elem.style.name.startswith("Heading"):
                if len(current_chapter) > 0:
                    full_document.append((short_filename, current_chapter_name, " ".join(current_chapter)))
                current_chapter = [paragraph_text]
                current_chapter_name = paragraph_text
            else:
                current_chapter.append(paragraph_text)
    return full_document

def process_single_file_paragraphs(filename):
    doc = docx.Document(filename)
    short_filename = filename.split("/")[-1].replace(".docx","")
    full_document = []
    current_chapter_name = "Metadata and table of contents"
    for elem in doc.iter_inner_content():
        if isinstance(elem, docx.table.Table):
            full_document.append((short_filename, current_chapter_name, process_table_as_text(elem)))
        elif isinstance(elem, docx.text.paragraph.Paragraph):
            paragraph_text = process_paragraph(elem)
            if len(paragraph_text) == 0:
                continue
            if elem.style.name.startswith("Heading"):
                if len(paragraph_text) > 0:
                    full_document.append((short_filename, current_chapter_name, paragraph_text))
                current_chapter_name = paragraph_text
            else:
                full_document.append((short_filename, current_chapter_name, paragraph_text))
    return full_document

In [4]:
def read_all_files():
    documents = []
    if FILE_READING == 'Text':
        read_func = process_single_file_raw_text
    elif FILE_READING == "Chapter":
        read_func = process_single_file_chapters
    elif FILE_READING == 'Paragraph':
        read_func = process_single_file_paragraphs
    else:
        print("ERROR: Wrong preprocessing method")
        return None

    for file in ALL_FILES:
        doc_text = read_func(file)
        if len(doc_text) == 0:
            continue
        documents.extend(doc_text)
    return documents

In [5]:
def chunk_documents(docs, chunk_size=100, chunk_overlap=0, limit=0):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap  = chunk_overlap,
        length_function = len,
        add_start_index = True,
    )
    docs = [d for d in docs if len(d[2])>=limit]
    texts = text_splitter.create_documents(texts = [c[2] for c in docs], metadatas = [{'filename': _filename, 'chapter':_chapter} for _filename, _chapter, _ in docs])
    return texts

In [6]:
def create_single_index(texts, name):
    embeddings = SentenceTransformerEmbeddings(model_name="bert-large-nli-mean-tokens")
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(f"./vectorstore_test/{name}")
    db.serialize_to_bytes()
    return db

In [7]:
def create_indexes():
    documents = read_all_files()
    for chunk_size in CHUNK_SIZES:
        for chunk_overlap in CHUNK_OVERLAPS:
            if chunk_overlap >= chunk_size:
                continue
            texts = chunk_documents(documents, chunk_size, chunk_overlap, DOCUMENT_PART_LEN_LIMIT)
            db = create_single_index(texts, f"{FILE_READING}_limit{DOCUMENT_PART_LEN_LIMIT}_chunk{chunk_size}_overlap{chunk_overlap}")

In [8]:
create_indexes()

  from .autonotebook import tqdm as notebook_tqdm


IndexError: list index out of range

In [10]:
documents = read_all_files()

In [11]:
texts = chunk_documents(documents, 1000, 0, DOCUMENT_PART_LEN_LIMIT)

In [None]:
def chunk_documents(docs, chunk_size=100, chunk_overlap=0, limit=0):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap  = chunk_overlap,
        length_function = len,
        add_start_index = True,
    )
    docs = [d for d in docs if len(d[2])>=limit]
    texts = text_splitter.create_documents(texts = [c[2] for c in docs], metadatas = [{'filename': _filename, 'chapter':_chapter} for _filename, _chapter, _ in docs])
    return texts

In [12]:
texts

[]

In [15]:
docs = [d for d in documents if len(d[2])>=500]

In [22]:
FILE_READING = 'Paragraph' #Paragraph

In [25]:
read_all_files()

TypeError: sequence item 0: expected str instance, int found

In [19]:
documents

['SD.00014',
 'N/A',
 '             ; HARMONIZED DOCUMENT          ; Change level; Date; Description of change; -; 02-OCT-2015; Initial release.; A; 28-JUN-2016; Typos corrected form original release.  Added note to section 10.2 regarding EMEA reference to 80026.  Added Section 13, clearance requirements for Solid Axle suspensions.  Co-author changed to Alessandro Messina.; B; 14-MAR-2017; See Change Log table; C; 16-JUN-2017; Harmonization changes – revised and re-released         SUSPENSION DESIGN CLEARANCE GUIDELINES   This document is the FCA harmonized version of SD-11877.  The clearance values listed in this document are derived from vehicle testing and/or historical data and should be used as a general guideline for initial packaging of suspension components.  Clearance conditions that are less than target should be evaluated on a vehicle-by-vehicle basis taking into account statistical build variation, load based deflection capabilities, and weather the position of the suspensi