In [1]:
import bs4
import langchain_community

In [5]:
from langchain_community.document_loaders import TextLoader as CommunityTextLoader

In [10]:
import PyPDF2

In [26]:
import requests
import re

In [8]:


# load Meeting trasncript.txt into `data` using a langchain_community loader if available,
# otherwise fall back to LangChain's TextLoader or a manual Document wrapper.

try:
    # try community loader
    loader = CommunityTextLoader("Meeting transcript.txt")
    data = loader.load()
except Exception:
    try:
        # fall back to built-in LangChain loader
        loader = TextLoader("Meeting transcript.txt")
        data = loader.load()
    except Exception:
        # final fallback: read file manually and wrap as a LangChain Document
        with open("Meeting transcript.txt", "r", encoding="utf-8") as f:
            content = f.read()
        data = [Document(page_content=content, metadata={"source": "Meeting transcript.txt"})]

# quick sanity check
print(f"Loaded {len(data)} document(s).")
print(data[0].page_content[:600])

Loaded 1 document(s).
mode now. No, it's just a PowerPoint slides.
0:08
Not yet. I think there's a separate screen. So if you try to share again,
0:12
Eric, there will be a separate screen. That's more
0:15
That's what I feared. Yeah. Okay. Um so I will just keep
0:23
sharing the the present not not the presenter presenter view but just the
0:28
regular one. Um so okay um I put together this to kind of go over um the
0:35
larger plan uh when I talked with uh JP about the best project and I think
0:40
we had all talked about that um we had settled on the
0:46
um SDSQC AI for the project to work with your
0:52
team o


In [23]:
# Load datap.pdf into `data_pdf`.
pdf_path = "datap2.pdf"
data_pdf = None

# Try community loader candidates (if present in langchain_community.document_loaders)
for candidate in ("PyPDFLoader", "PDFMinerLoader", "UnstructuredPDFLoader", "PDFLoader"):
    loader_cls = getattr(langchain_community.document_loaders, candidate, None)
    if loader_cls:
        try:
            data_pdf = loader_cls(pdf_path).load()
            break
        except Exception:
            data_pdf = None

# Fallback to LangChain's loaders
if not data_pdf:
    try:
        data_pdf = PyPDFLoader(pdf_path).load()
    except Exception:
        try:
            data_pdf = UnstructuredPDFLoader(pdf_path).load()
        except Exception:
            # Final fallback: extract text with PyPDF2
            try:
                with open(pdf_path, "rb") as f:
                    reader = PyPDF2.PdfReader(f)
                    texts = [p.extract_text() or "" for p in reader.pages]
                content = "\n".join(texts)
                data_pdf = [Document(page_content=content, metadata={"source": pdf_path})]
            except Exception as e:
                raise RuntimeError(f"Could not load PDF {pdf_path}: {e}")

print(f"Loaded {len(data_pdf)} document(s) from {pdf_path}.")
print(data_pdf[0].page_content[:1000])

Loaded 2 document(s) from datap2.pdf.
UNITED STATES DISTRICT COURT 
SOUTHERN DISTRICT OF FLORIDA 
MIAMI DIVISION 
CAS
E NO.: 23-cv-22148-GAYLES/LOUIS 
ROSH CHODESH II LIMITED 
PARTNERSHIP, et al., 
P
laintiffs, 
v.
 
JA
N S. WIMPFHEIMER, et al., 
D
efendants.   
_____________________________/            
O
RDER 
T
HIS CAUSE comes  before the Court on the Motion to Dismiss the Second Amended 
Complaint filed by Defendants Madison Gold LLC, Jan S. Wimpfheimer, and Schwell 
Wimpfheimer & Associates, LLP, [ECF No. 72] (“Madison Gold’s Motion to Dismiss”) and the 
Motion to Dismiss the Second Amended Complaint filed by Defendants East Hudson Capital, LLC 
and White Road Capital LLC [ECF No. 71] (“East Hudson’s Motion to Dismiss”). This case was 
referred to Magistrate Judge Lauren F. Louis  for a ruling on all pretrial non- dispositive matters 
and a report and recommendation on all dispositive matters, pursuant to 28 U.S.C. § 636(b)(1)(B). 
[ECF No. 48]. On February 4, 2025, she  issued he

In [None]:
print(f"Loaded {len(data_pdf)} document(s) from {pdf_path}.")
print(data_pdf[1].page_content)

Loaded 23 document(s) from datap.pdf.
Case 2:24-cv-08076-CAS-SSC     Document 28     Filed 02/24/25     Page 2 of 23   Page ID
#:791


In [25]:
print(data_pdf[0].page_content)

UNITED STATES DISTRICT COURT 
SOUTHERN DISTRICT OF FLORIDA 
MIAMI DIVISION 
CAS
E NO.: 23-cv-22148-GAYLES/LOUIS 
ROSH CHODESH II LIMITED 
PARTNERSHIP, et al., 
P
laintiffs, 
v.
 
JA
N S. WIMPFHEIMER, et al., 
D
efendants.   
_____________________________/            
O
RDER 
T
HIS CAUSE comes  before the Court on the Motion to Dismiss the Second Amended 
Complaint filed by Defendants Madison Gold LLC, Jan S. Wimpfheimer, and Schwell 
Wimpfheimer & Associates, LLP, [ECF No. 72] (“Madison Gold’s Motion to Dismiss”) and the 
Motion to Dismiss the Second Amended Complaint filed by Defendants East Hudson Capital, LLC 
and White Road Capital LLC [ECF No. 71] (“East Hudson’s Motion to Dismiss”). This case was 
referred to Magistrate Judge Lauren F. Louis  for a ruling on all pretrial non- dispositive matters 
and a report and recommendation on all dispositive matters, pursuant to 28 U.S.C. § 636(b)(1)(B). 
[ECF No. 48]. On February 4, 2025, she  issued her report recommending that  Madison Go

In [34]:
from langchain_community.document_loaders import WebBaseLoader as LCWebBaseLoader


# Load https://ecourts.gov.in/ecourts_home/ into `data_web`, preferring community loaders, then langchain loaders,
# then falling back to requests + BeautifulSoup if necessary.

url = "https://arxiv.org/abs/1706.03762"
data_web = None

# Try community loader candidates (if present in langchain_community.document_loaders)
for candidate in ("PlaywrightURLLoader", "SeleniumURLLoader", "WebBaseLoader", "UnstructuredURLLoader", "BeautifulSoupURLLoader"):
    loader_cls = getattr(langchain_community.document_loaders, candidate, None)
    if loader_cls:
        try:
            # some loaders accept a single URL (string) or a list; try both forms gracefully
            try:
                data_web = loader_cls([url]).load()
            except Exception:
                data_web = loader_cls(url).load()
            break
        except Exception:
            data_web = None

# Fallback to LangChain's URL loader(s)
if not data_web:
    try:
        # Try built-in LangChain WebBaseLoader if available
        data_web = LCWebBaseLoader(url).load()
    except Exception:
        data_web = None

# Final fallback: requests + BeautifulSoup
if not data_web:

    resp = requests.get(url, timeout=20)
    resp.raise_for_status()

    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    for s in soup(["script", "style", "noscript", "iframe", "header", "footer", "nav"]):
        s.decompose()

    text = soup.get_text(separator="\n")
    # collapse multiple blank lines and trim
    text = re.sub(r"\n\s*\n+", "\n\n", text).strip()
    data_web = [Document(page_content=text, metadata={"source": url})]

print(f"Loaded {len(data_web)} document(s) from {url}")
print(data_web[0].page_content[:1000])

Loaded 1 document(s) from https://arxiv.org/abs/1706.03762


 [1706.03762] Attention Is All You Need


































  











Happy Open Access Week from arXiv!
YOU make open access possible! Tell us why you support #openaccess and give to arXiv this week to help keep science open for all.


Donate!





Skip to main content






We gratefully acknowledge support from the Simons Foundation, member institutions, and all contributors.
Donate





 > cs > arXiv:1706.03762
  







Help | Advanced Search




All fields
Title
Author
Abstract
Comments
Journal reference
ACM classification
MSC classification
Report number
arXiv identifier
DOI
ORCID
arXiv author ID
Help pages
Full text




Search















open search






GO



open navigation menu


quick links

Login
Help Pages
About












Computer Science > Computation and Language


arXiv:1706.03762 (cs)
    




  [Submitted on 12 Jun 2017 (v1), last revised 2 Aug 2023 (this version, v7)]
Title:Attenti

In [37]:
# Extract only the abstract section from whatever was loaded into `data_web`.
# This will work whether `data_web[0].page_content` is raw HTML or plain text.
if not data_web:
    raise RuntimeError("data_web is empty; nothing to extract from.")

content = data_web[0].page_content or ""

# Try HTML parsing first (blockquotes with class `abstract mathjax` on arXiv)
abstract_text = ""
if "<" in content and ">" in content:
    soup = bs4.BeautifulSoup(content, "html.parser")
    block = soup.find("blockquote", class_="abstract mathjax") or soup.find(class_="abstract mathjax")
    if block:
        abstract_text = block.get_text(separator="\n").strip()
        abstract_text = re.sub(r"^\s*Abstract[:\s]*", "", abstract_text, flags=re.I)

# Fallback to regex on plain text output
if not abstract_text:
    # Try to capture content after 'Abstract' heading up to a blank line or end of document
    m = re.search(r"(?is)abstract[:\s]*\n?(.*?)(?:\n\s*\n|\Z)", content)
    if m:
        abstract_text = m.group(1).strip()
    else:
        # Last fallback: attempt to find a short paragraph that looks like an abstract
        # (first paragraph after the title/metadata)
        paragraphs = [p.strip() for p in re.split(r"\n{2,}", content) if p.strip()]
        if paragraphs:
            abstract_text = paragraphs[0]

# Normalize whitespace
abstract_text = re.sub(r"\s+", " ", abstract_text).strip()


print("Extracted abstract text (first 500 chars):")
print(data_web[0].page_content[:500])

Extracted abstract text (first 500 chars):


 [1706.03762] Attention Is All You Need


































  











Happy Open Access Week from arXiv!
YOU make open access possible! Tell us why you support #openaccess and give to arXiv this week to help keep science open for all.


Donate!





Skip to main content






We gratefully acknowledge support from the Simons Foundation, member institutions, and all contributors.
Donate





 > cs > arXiv:1706.03762
  







Help | Advanced Search




All fields
Title
Author



In [33]:
print(f"Loaded {len(data_web)} document(s) from {url}")
print(data_web[0].page_content[:1000])

Loaded 1 document(s) from https://medium.com/@joerosborne/intro-to-web-scraping-build-your-first-scraper-in-5-minutes-1c36b5c4b110
Just a moment...Enable JavaScript and cookies to continue



In [38]:
# Load Wikipedia page for company "Infosys" into `data_wiki`, preferring community loaders,
# then LCWebBaseLoader, and finally falling back to requests + BeautifulSoup.

wiki_url = "https://en.wikipedia.org/wiki/Infosys"
data_wiki = None

# Try community loader candidates (if present in langchain_community.document_loaders)
for candidate in ("PlaywrightURLLoader", "SeleniumURLLoader", "WebBaseLoader", "UnstructuredURLLoader", "BeautifulSoupURLLoader"):
    loader_cls_candidate = getattr(langchain_community.document_loaders, candidate, None)
    if loader_cls_candidate:
        try:
            try:
                data_wiki = loader_cls_candidate([wiki_url]).load()
            except Exception:
                data_wiki = loader_cls_candidate(wiki_url).load()
            break
        except Exception:
            data_wiki = None

# Fallback to LCWebBaseLoader if community loaders weren't successful
if not data_wiki:
    try:
        data_wiki = LCWebBaseLoader(wiki_url).load()
    except Exception:
        data_wiki = None

# Final fallback: requests + BeautifulSoup
if not data_wiki:
    resp = requests.get(wiki_url, timeout=20)
    resp.raise_for_status()

    soup_local = bs4.BeautifulSoup(resp.text, "html.parser")
    for s in soup_local(["script", "style", "noscript", "iframe", "header", "footer", "nav"]):
        s.decompose()

    text = soup_local.get_text(separator="\n")
    text = re.sub(r"\n\s*\n+", "\n\n", text).strip()
    data_wiki = [Document(page_content=text, metadata={"source": wiki_url})]

# Sanity checks / output
print(f"Loaded {len(data_wiki)} document(s) from {wiki_url}")
print(data_wiki[0].page_content[:1000])

Loaded 1 document(s) from https://en.wikipedia.org/wiki/Infosys




Infosys - Wikipedia






























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in




























Contents
move to sidebar
hide




(Top)





1
History








2
Services and products








3
Acquisitions








4
Listing and shareholding pattern








5
Operations




Toggle Operations subsection





5.1
Geographical presence








5.2
Training centre in Mysore








5.3
Employees








5.4
CEOs










6
Controversies




Toggle Controversies subsection





6.1
Settlement of visa and tax fraud 

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Load Wikipedia page for "Infosys" using WikipediaLoader
wiki_loader = WikipediaLoader(query="Infosys")
data_wiki_lc = wiki_loader.load()

print(f"Loaded {len(data_wiki_lc)} document(s) using WikipediaLoader.")
print(data_wiki_lc[0].page_content[:4000])

Loaded 25 document(s) using WikipediaLoader.
Infosys Limited is an Indian multinational technology company that offers information technology, business consulting, and outsourcing services. Founded in 1981 by seven engineers, the company is headquartered in Bengaluru and considered one of the Big Six Indian IT companies.
Infosys has also attracted controversies due to allegations of visa and tax fraud in the United States and for creating malfunctioning government websites.


== History ==
Infosys was founded by N. R. Narayana Murthy, Nandan Nilekani, Kris Gopalakrishnan, S. D. Shibulal, K. Dinesh, N. S. Raghavan, and Ashok Arora, with an initial capital of $250. It was incorporated as Infosys Consultants Private Limited in Pune on 2 July 1981, before relocating to Bangalore in 1983. Arora left the company in 1989 and sold his shares to the other co-founders.
In the 1980s, Infosys briefly made hardware products like electronic telex machines and keyboard concentrators. Its core busines

In [43]:
print(data_wiki_lc[20].page_content[:4000])

The south Indian city of Chennai is fast emerging as a destination for information technology outsourcing and has seen a growing number of IT parks being built here. Most of the upcoming complexes are being built along the IT Corridor and the southern suburb.


== List ==


== See also ==
List of tech parks in Kolkata
Software industry in Chennai


== References ==
