### Word document processing 

In [9]:
from docx import Document as DocxDocument
from docx2txt import process as docx2txt_process
import os

from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader


In [None]:
# Method 1: load the Word file with Docx2txtLoader (LangChain Community) and
# preview the first returned document.
print("Using Docx2txtLoader from LangChain Community")

try:
    docx_loader = Docx2txtLoader("data/word_files/SumanthAnem-WellsFargo.docx")
    docx_documents = docx_loader.load()
    print(f"Loaded {len(docx_documents)} documents using Docx2txtLoader")

    # Preview only: a 500-character excerpt of the text, then the metadata dict.
    first_docx_document = docx_documents[0]
    print(first_docx_document.page_content[:500])
    print(first_docx_document.metadata)

    # Reached only when every step above succeeded.
    print("Document loaded successfully with Docx2txtLoader")
except Exception as load_error:
    # Keep the notebook running, but surface the reason for the failure.
    print(f"Error loading documents with Docx2txtLoader: {load_error}")

Using DocxLoader from LangChain Community
Loaded 1 documents using Docx2txtLoader
Sumanth Anem



anem.sumanth@gmail.com

+91 88016 25499

   linkedin.com/in/anemsumanth





Summary



Over 2.10 years of experience in systems administration, ITIL process, CI/CD implementation, AWS cloud, software development, Application Production Support, Monitoring and maintaining large-scale services with various underlying dependencies



Technical skills:



	Cloud computing: AWS

	Application containers: Docker

	Version control system: GIT

	Continuous Integration / Continuous Delive
{'source': 'data/word_files/SumanthAnem-WellsFargo.docx'}
Document loaded successfully with Docx2txtLoader


In [17]:
from docx import Document as DocxDocument
from docx2txt import process as docx2txt_process
import os

from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

try:
    # Method 2: Using UnstructuredWordDocumentLoader from LangChain Community
    print("Using UnstructuredWordDocumentLoader from LangChain Community")

    unstructured_loader = UnstructuredWordDocumentLoader("data/word_files/SumanthAnem-WellsFargo.docx")
    unstructured_documents = unstructured_loader.load()
    print(f"Loaded {len(unstructured_documents)} documents using UnstructuredWordDocumentLoader")
    print(unstructured_documents[0].page_content[:500])  # Print first 500 characters of the first document
    print(unstructured_documents[0].metadata)  # Print metadata of the first document
except Exception as e:
    print(f"Error loading documents with UnstructuredWordDocumentLoader: {e}")
else:
    # Report success only when nothing in the try block raised; previously this
    # message was printed unconditionally, even right after the error message.
    print("Document loaded successfully with UnstructuredWordDocumentLoader")



Using UnstructuredWordDocumentLoader from LangChain Community
Loaded 1 documents using UnstructuredWordDocumentLoader
Sumanth Anem

anem.sumanth@gmail.com +91 88016 25499

   linkedin.com/in/anemsumanth

Summary

Over 2.10 years of experience in systems administration, ITIL process, CI/CD implementation, AWS cloud, software development, Application Production Support, Monitoring and maintaining large-scale services with various underlying dependencies

Technical skills:

Cloud computing: AWS

Application containers: Docker

Version control system: GIT

Continuous Integration / Continuous Delivery processes and 
{'source': 'data/word_files/SumanthAnem-WellsFargo.docx'}
Document loaded successfully with UnstructuredWordDocumentLoader


In [19]:


# Method 3: Using UnstructuredWordDocumentLoader with mode="elements"
# In this mode the loader returns many documents (one per detected element)
# whose metadata carries a 'category' entry, which is inspected below.
print("Using UnstructuredWordDocumentLoader with mode='elements'")
try:
    unstructured_elements_loader = UnstructuredWordDocumentLoader("data/word_files/SumanthAnem-WellsFargo.docx", mode="elements")
    unstructured_elements_documents = unstructured_elements_loader.load()
    print(f"Loaded {len(unstructured_elements_documents)} documents using UnstructuredWordDocumentLoader with mode='elements'")
    print(unstructured_elements_documents[0].page_content[:500])  # Print first 500 characters of the first document
    print(unstructured_elements_documents[0].metadata)  # Print metadata of the first document

    # Show the type and content of the first three elements (1-based numbering).
    for element_number, doc in enumerate(unstructured_elements_documents[:3], start=1):
        print(f"Element {element_number}:")
        print(f"Element type: {doc.metadata.get('category','unknown')}")
        print(f"Content: {doc.page_content[:500]}")
except Exception as e:
    print(f"Error loading documents with UnstructuredWordDocumentLoader with mode='elements': {e}")
else:
    # Report success only when nothing in the try block raised; previously this
    # message was printed unconditionally, even right after the error message.
    print("Document loaded successfully with UnstructuredWordDocumentLoader with mode='elements'")


Using UnstructuredWordDocumentLoader with mode='elements'
Loaded 59 documents using UnstructuredWordDocumentLoader with mode='elements'
Sumanth Anem
{'source': 'data/word_files/SumanthAnem-WellsFargo.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'SumanthAnem-WellsFargo.docx', 'last_modified': '2020-02-12T00:14:18', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '7b7546070b4bfa18356be009d3d439e0'}
Element 1:
Element type: Title
Content: Sumanth Anem
Element 2:
Element type: Table
Content: anem.sumanth@gmail.com +91 88016 25499
Element 3:
Element type: Title
Content:    linkedin.com/in/anemsumanth
Document loaded successfully with UnstructuredWordDocumentLoader with mode='elements'


In [18]:

# Method 4: Using UnstructuredWordDocumentLoader with mode="elements" and additional parameters
print("Using UnstructuredWordDocumentLoader with mode='elements' and additional parameters")
try:
    # NOTE(review): the saved output of this cell matches the plain
    # mode="elements" run (59 documents, same first-element metadata), so
    # `additional_kwargs` appears to have no effect here. Confirm which keyword
    # arguments the loader actually supports before relying on these options.
    unstructured_elements_loader = UnstructuredWordDocumentLoader(
        "data/word_files/SumanthAnem-WellsFargo.docx",
        mode="elements",
        additional_kwargs={"include_metadata": True, "include_text": True}
    )
    unstructured_elements_documents = unstructured_elements_loader.load()
    print(f"Loaded {len(unstructured_elements_documents)} documents using UnstructuredWordDocumentLoader with mode='elements' and additional parameters")
    print(unstructured_elements_documents[0].page_content[:500])  # Print first 500 characters of the first document
    print(unstructured_elements_documents[0].metadata)  # Print metadata of the first document
except Exception as e:
    print(f"Error loading documents with UnstructuredWordDocumentLoader with mode='elements' and additional parameters: {e}")
else:
    # Report success only when nothing in the try block raised; previously this
    # message was printed unconditionally, even right after the error message.
    print("Document loaded successfully with UnstructuredWordDocumentLoader with mode='elements' and additional parameters")




Using UnstructuredWordDocumentLoader with mode='elements' and additional parameters
Loaded 59 documents using UnstructuredWordDocumentLoader with mode='elements' and additional parameters
Sumanth Anem
{'source': 'data/word_files/SumanthAnem-WellsFargo.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'SumanthAnem-WellsFargo.docx', 'last_modified': '2020-02-12T00:14:18', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '7b7546070b4bfa18356be009d3d439e0'}
Document loaded successfully with UnstructuredWordDocumentLoader with mode='elements' and additional parameters
