<a href="https://colab.research.google.com/github/adikul25/Notebooks/blob/main/LangChain_PPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Processing with LangChain — Extracting Data from PowerPoint Presentations**



In [None]:
pip install unstructured langchain python-pptx -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m811.8/811.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m35.1 

# Importing the required libraries

In [None]:
from langchain_community.document_loaders import UnstructuredPowerPointLoader
import re

  

---


  We create an instance of the UnstructuredPowerPointLoader and load data from a PowerPoint presentation using the load() method.

---



In [None]:
# Load the pitch deck; no `mode` argument is passed here, so the loader's default is used.
# NOTE(review): the path lives under /content/drive (Google Drive). No mount step is
# visible in this notebook, so Drive presumably has to be mounted beforehand — confirm.
loader = UnstructuredPowerPointLoader("/content/drive/MyDrive/Data/airbnb-Pitch-Deck.pptx")

data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Display the documents returned by `loader.load()`.
data

 ---

 A class that encapsulates the functionality for extracting and categorizing information from slides. It uses the UnstructuredPowerPointLoader from the LangChain library to operate on PowerPoint files.


---





In [None]:
class PPTExtraction:
    """Extract formatted slide text and categorized links from a PowerPoint file.

    The presentation is loaded once, in ``__init__``, through
    ``UnstructuredPowerPointLoader(file_path, mode="elements")``. The loaded
    documents are stored in ``self.data``; the methods below read each
    document's ``page_content`` and the ``"category"`` / ``"page_number"``
    entries of its ``metadata`` dict.
    """

    # Category name -> substrings searched for in a link's host name.
    # A link whose host matches none of them is filed under 'miscellaneous'.
    _LINK_CATEGORIES = {
        'social_media': ['instagram', 'linkedin'],
        'google': ['docs.google.com', 'drive.google.com'],
        'onedrive': ['1drv.ms'],
    }

    # Captures only the host of a URL: skips an optional "userinfo@" prefix and
    # stops at the first port, path, query or fragment delimiter, so text in
    # those parts can never influence the category.
    _HOST_PATTERN = re.compile(r'://(?:[^/?#@]*@)?([^/?#:]+)')

    def __init__(self, file_path):
        """Load ``file_path`` in element mode and keep the result in ``self.data``."""
        self.file_path = file_path
        self.loader = UnstructuredPowerPointLoader(self.file_path, mode="elements")
        self.data = self.loader.load()

    def extract(self):
        """Return the loaded elements as a single formatted string.

        Elements are labelled by their ``category`` metadata:

        * ``Title``: ``"Title: ..."`` for the first title met on slide 1,
          ``"Outline: ..."`` for every other title.
        * ``NarrativeText`` / ``ListItem``: ``"Content: ..."``.
        * ``PageBreak``: an empty entry (extra blank lines in the output).
        * anything else, including elements without a category: skipped.

        The first emitted element of each slide is prefixed with
        ``"Slide <page_number>:"``, whatever its category, so slides that
        begin with body text are still labelled. Entries are joined with
        blank lines.

        Returns:
            str: the formatted text; ``""`` when nothing was loaded.
        """
        slides = []
        current_slide_number = None  # slide whose "Slide N:" header was already emitted
        titled_slide_number = None   # slide on which a Title element was already seen

        for document in self.data:
            # .get(): elements lacking a category are ignored instead of raising KeyError.
            category = document.metadata.get("category")

            if category == "PageBreak":
                slides.append("")
                current_slide_number = None
                titled_slide_number = None
                continue

            page_number = document.metadata.get("page_number")

            if category == "Title":
                first_title_on_slide = page_number != titled_slide_number
                label = "Title" if first_title_on_slide and page_number == 1 else "Outline"
                titled_slide_number = page_number
            elif category in ("NarrativeText", "ListItem"):
                label = "Content"
            else:
                continue

            entry = f"{label}: {document.page_content}"
            # Emit the header on the first kept element of a slide; an unknown
            # page number never produces a "Slide None:" header.
            if page_number is not None and page_number != current_slide_number:
                entry = f"Slide {page_number}:\n\n{entry}"
                current_slide_number = page_number
            slides.append(entry)

        return "\n\n".join(slides)

    def extract_and_categorize_links(self):
        """Find the http(s) links in the extracted text and group them by host.

        Links are every ``https?://`` run of non-whitespace characters in the
        output of ``extract()``. Each link is assigned to the first category
        in ``_LINK_CATEGORIES`` one of whose substrings occurs in the link's
        lower-cased host name; links with no match, or with no recognisable
        host, go to ``'miscellaneous'``.

        Returns:
            dict[str, list[str]]: category name -> links in order of
            appearance. Categories without links are absent.
        """
        extracted_text = self.extract()
        links = re.findall(r'https?://\S+', extracted_text, re.IGNORECASE)

        categorized_links = {}
        for link in links:
            host_match = self._HOST_PATTERN.search(link)
            host = host_match.group(1).lower() if host_match else ''
            category = next(
                (
                    name
                    for name, markers in self._LINK_CATEGORIES.items()
                    if any(marker.lower() in host for marker in markers)
                ),
                'miscellaneous',
            )
            categorized_links.setdefault(category, []).append(link)

        return categorized_links




---


Example Usage

---



In [None]:
# Build the extractor for the pitch deck, then pull out the formatted slide
# text and the links grouped by category.
extract = PPTExtraction('/content/drive/MyDrive/Data/airbnb-Pitch-Deck.pptx')
text1 = extract.extract()
links = extract.extract_and_categorize_links()

In [None]:
# Print the formatted slide text so its newlines render as line breaks.
print(text1)

Slide 1:

Title: AirBed&Breakfast

Outline: Book Rooms With Locals, Rather than Hotels.



Slide 2:

Outline: Problem

Outline: No Easy Way Exists 

Outline: Price

Outline: Hotels

Content: To book a room with a local  or become a host.

Content: Is an important concern for customers  booking travel online.

Content: Leave you disconnected from the city  and its culture.

Outline: presentationstemplate.com



Slide 3:

Outline: Save Money

Outline: Solution

Content: When traveling

Content: A web platform   where users can rent out their  space to host travelers to:

Outline: Make  Money

Outline: When Hosting

Outline: Share  Culture

Outline: Local Connection To The City

Outline: presentationstemplate.com



Slide 4:

Outline: Solution

Content: A web platform   where users can rent out their  space to host travelers to:

Outline: Share  Culture

Outline: Make  Money

Outline: Save Money

Outline: Local Connection To The City

Content: When traveling

Outline: When Hosting

Outlin

In [None]:
def fetch_links(text):
    """Return every http(s) URL-like substring of ``text``, in order of appearance.

    Matching is case-insensitive. After the scheme, a match continues over
    ASCII letters, digits, ``%XX`` escapes, ``!``, ``*`` and any character in
    the range ``$`` to ``_`` (the ``$-_`` inside the character class is a
    range). It stops at the first character outside that set, such as
    whitespace, ``#`` or ``~``.

    Args:
        text: the string to scan.

    Returns:
        list[str]: the matched substrings; empty when none are found.
    """
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        re.IGNORECASE,
    )
    return url_regex.findall(text)

In [None]:
# List the links found in the extracted slide text.
fetch_links(text1)

['https://docs.google.com/forms/d/1-Z5g0HUr9aJM0mXrvIycZ8xCn1B1_k80r2YxN1KW0Do/viewanalytics',
 'https://cullanjasper.wixsite.com/my-site',
 'https://www.linkedin.com/in/happy-hands-a97386274/',
 'https://instagram.com/happy_hands8383?igshid=ZDdkNTZiNTM=',
 'https://docs.google.com/spreadsheets/d/1AjayPpWRI5E2ylnBo9T9Aw85nt42ijKC-CP-mPJJw4s/edit?usp=sharing',
 'https://1drv.ms/x/s!Any9qpg6BSL-gSOeDsXbS2mgADlu']

In [None]:
def extract_and_categorize_links(text):
    """Find the http(s) links in ``text`` and group them by host.

    Links are every ``https?://`` run of non-whitespace characters. Each link
    is assigned to the first category below one of whose substrings occurs in
    the link's lower-cased host name; links with no match, or with no
    recognisable host, go to ``'miscellaneous'``.

    Only the host is inspected: an optional ``userinfo@`` prefix is skipped
    and the host ends at the first port, path, query or fragment delimiter.
    A link such as ``https://example.com?ref=linkedin`` is therefore not
    treated as social media.

    Args:
        text: the string to scan.

    Returns:
        dict[str, list[str]]: category name -> links in order of appearance.
        Categories without links are absent.
    """
    links = re.findall(r'https?://\S+', text, re.IGNORECASE)

    # Category name -> substrings searched for in the host name.
    categories = {
        'social_media': ['instagram', 'linkedin'],
        'google': ['docs.google.com', 'drive.google.com'],
        'onedrive': ['1drv.ms'],
    }
    categorized_links = {}

    for link in links:
        host_match = re.search(r'://(?:[^/?#@]*@)?([^/?#:]+)', link)
        host = host_match.group(1).lower() if host_match else ''
        category = next(
            (
                name
                for name, markers in categories.items()
                if any(marker.lower() in host for marker in markers)
            ),
            'miscellaneous',
        )
        categorized_links.setdefault(category, []).append(link)

    return categorized_links

# Group the links found in the extracted slide text by category.
extract_and_categorize_links(text1)

{'google': ['https://docs.google.com/forms/d/1-Z5g0HUr9aJM0mXrvIycZ8xCn1B1_k80r2YxN1KW0Do/viewanalytics',
  'https://docs.google.com/spreadsheets/d/1AjayPpWRI5E2ylnBo9T9Aw85nt42ijKC-CP-mPJJw4s/edit?usp=sharing'],
 'miscellaneous': ['https://cullanjasper.wixsite.com/my-site'],
 'social_media': ['https://www.linkedin.com/in/happy-hands-a97386274/',
  'https://instagram.com/happy_hands8383?igshid=ZDdkNTZiNTM='],
 'onedrive': ['https://1drv.ms/x/s!Any9qpg6BSL-gSOeDsXbS2mgADlu']}