<a href="https://colab.research.google.com/github/barbaroja2000/llm/blob/main/Langchain_%26_OpenAi_AWS_Summit_2023_London_Sponser_Categorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LangChain & OpenAI - AWS Summit 2023 London Sponsor Categorization

This notebook categorizes all the companies sponsoring the 2023 Summit into the following categories:

* Security
* Managed Service Providers (MSPs)
* Landing Zone/Infrastructure Providers
* Training Partners
* Consulting Partners/Systems Integrators
* Software/Application Providers
* Data Management Providers
* Observability
* AI/ML

Process:

1. Parse the sponsor page, pulling all non-AWS links into a list
2. Crawl these URIs, pulling out each site's title and description
3. Feed these into a GPT model and categorize
4. Display results in a Pandas table

Requires an OpenAI API key:

```Python
OPENAI_API_KEY="abc"
```

In [None]:
#@title Load Keys
#@markdown Utility to load keys from fs, replace with environ vars if not using

# Mounts Google Drive and loads the variables defined in a dotenv file stored
# there (the notebook expects OPENAI_API_KEY to be among them).

import os

# Alternative when not using Drive: read the keys straight from the environment.
#os.environ.get("OPENAI_API_KEY")
#os.environ.get("HUGGINGFACE_API_KEY")

!python -m pip install python-dotenv
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
import dotenv
# Last expression of the cell, so the value returned by load_dotenv is displayed.
dotenv.load_dotenv('/content/drive/MyDrive/keys/keys.env')

In [None]:
# Sponsor listing page of the AWS Summit London site; starting point for link extraction.
sponser_page = "https://aws.amazon.com/events/summits/london/sponsors/"

In [None]:
#@title Parse hrefs
#@markdown Exclude all local hrefs and anything AWS related

from bs4 import BeautifulSoup
import requests

from urllib.parse import urlparse

# Required as many pages will return 403 Forbidden without a User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
parser = 'html.parser'

# Fetch the sponsor page exactly once, sending the User-Agent header and a
# timeout, and fail loudly on an HTTP error instead of silently parsing an
# error page into an empty candidate list.
page = requests.get(sponser_page, headers=headers, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.text, parser)

# Keep only absolute https links whose URL does not mention "aws", reduce each
# to its host name, and de-duplicate (first-seen order preserved) so a sponsor
# linked several times is crawled and classified only once.
candidates = []
seen_hosts = set()
for link in soup.find_all('a', href=True):
  href = link['href']
  if not href.startswith("https://") or "aws" in href:
    continue
  host = urlparse(href).netloc
  if host and host not in seen_hosts:
    seen_hosts.add(host)
    candidates.append(host)

In [None]:
# Sanity check: show the first ten sponsor domains extracted above.
print(candidates[:10])

In [None]:
#@title Crawl websites
#@markdown Pull title and description back from html

def _meta_content(doc, *attr_filters):
  """Return the `content` of the first <meta> tag matching any of the filters.

  Args:
    doc: Parsed BeautifulSoup document.
    *attr_filters: Attribute dicts tried in order, e.g. {"property": "og:title"}.

  Returns:
    The stripped content string, or None when no matching tag carries
    non-empty content.
  """
  for attrs in attr_filters:
    tag = doc.find("meta", attrs=attrs)
    # .get avoids a KeyError on <meta> tags that lack a content attribute.
    content = tag.get("content") if tag else None
    if content and content.strip():
      return content.strip()
  return None


candidates_for_categorization = []

for domain in candidates:
  uri = f"https://{domain}"
  try:
    resp = requests.get(uri, timeout=10, headers=headers)
    # Skip 4xx/5xx responses so error pages ("403 Forbidden", ...) are never
    # mistaken for a company title/description.
    resp.raise_for_status()
    site_soup = BeautifulSoup(resp.text, parser)

    # Prefer Open Graph tags, then the standard name= meta tags; the
    # property= variants without the og: prefix are kept for sites that use them.
    description = _meta_content(
        site_soup,
        {"property": "og:description"},
        {"name": "description"},
        {"property": "description"},
    )
    title = _meta_content(
        site_soup,
        {"property": "og:title"},
        {"name": "title"},
        {"property": "title"},
    )
    # Last resort: the document's <title> element.
    if title is None and site_soup.title and site_soup.title.string:
      title = site_soup.title.string.strip() or None

    candidates_for_categorization.append(
        {"title": title, "description": description, "url": uri}
    )

  except Exception as e:
    # Best-effort crawl: report which site failed and carry on with the rest.
    print(f"Skipping {uri}: {e}")

In [None]:
# Number of sites that were fetched and parsed without raising an exception.
print(len(candidates_for_categorization))

In [None]:
# Install LangChain and the OpenAI client; pip's stdout is discarded.
# NOTE(review): versions are unpinned — confirm the import paths below still
# exist in whatever release gets installed, or pin known-good versions.
!pip install langchain openai > /dev/null
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI

# Model used for classification; the name is also embedded in the output CSV
# file name when the results are saved.
# NOTE(review): confirm the selected model is still served before running.
model_name= 'text-davinci-003' #@param ["text-davinci-003", "gpt-4"]
# temperature=0 — presumably chosen to keep classifications as repeatable as possible.
llm = OpenAI(model_name=model_name, temperature=0)

In [None]:
# Classification prompt: lists the allowed categories, asks for a
# comma-separated answer when several apply, and lets the model propose a new
# category when none fits. `{blurb}` is the only placeholder.
template = """Categories listed here are types of cloud companies.

Security
Managed Service Providers (MSPs)
Landing Zone/Infrastructure Providers
Training Partners
Consulting Partners/Systems Integrators
Software/Application Providers
Data Management Providers
Observability
AI/ML

Using the description of companies below classify each into the preceeding categories. 
If the company matches multiple categories, return all matching in a comma seperated list
If your confidence is poor for the given classifications, propose a new classification

Blurb: {blurb}
==============================================================
Classification: """

# Wrap the template in a single-variable prompt, then bind that prompt to the
# configured LLM so each company blurb can be classified with one call.
prompt = PromptTemplate(template=template, input_variables=["blurb"])
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
def _parse_categories(raw):
  """Split the model's comma-separated answer into a clean list of names.

  Surrounding whitespace (including the leading newline/space completion
  models often emit) is stripped and empty fragments are discarded.
  """
  return [name.strip() for name in raw.split(",") if name.strip()]


final = []
for candidate in candidates_for_categorization:
  # Check the title *before* calling the model: untitled sites are excluded
  # from the results anyway, so classifying them only wastes API calls.
  if not candidate.get("title"):
    continue
  # Build the blurb from the parts that exist, so a missing description does
  # not inject the literal text "None" into the prompt.
  blurb = " ".join(
      part for part in (candidate["title"], candidate.get("description")) if part
  )
  category = llm_chain.run(blurb)
  # Append a copy rather than mutating the crawl results, so this cell can be
  # re-run without side effects on its input.
  final.append({**candidate, "category": _parse_categories(category)})

In [None]:
import pandas as pd

# Build the results table from scratch: one row per categorized sponsor.
# (No need to empty a previous `df` first — rebinding the name replaces it,
# and referencing `df` before it exists fails on a fresh kernel.)
df = pd.DataFrame(final)

In [None]:
from google.colab import data_table
# Turn on Colab's data-table formatter for DataFrames displayed from here on;
# the "Filter by Category" step below relies on its `filter` control.
data_table.enable_dataframe_formatter()

In [None]:
#@title One hot encode the category column
# Each row's category list is joined with "|" and expanded into one 0/1
# indicator column per distinct category.
category_dummies = df["category"].str.join("|").str.get_dummies(sep="|")

# Drop the columns not wanted in the filterable view. `columns=` replaces the
# positional axis argument (rejected by pandas 2.x), and errors="ignore"
# tolerates "company_category", which this notebook never creates.
df_ohe = (
    df.drop(columns=["category", "description", "company_category"], errors="ignore")
      .join(category_dummies)
)

In [None]:
#@title Filter by Category
#@markdown Click on `filter` and enter 1 in the from field against the category
# Bare last expression so the notebook renders the one-hot table.
df_ohe

In [None]:
# Keep only the 0/1 category indicator columns so they can be totalled.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
df_sum = df_ohe.drop(columns=['title', 'url'])

In [None]:
#@title Sum Companies by Categorization
# Column-wise sum of the indicator columns, i.e. the number of companies per category.
df_sum.sum()

In [None]:
#@title Save out
# Write the categorized table to Google Drive as CSV (no index column); the
# model name is part of the file name so runs with different models don't
# overwrite each other.
file_name = f"aws-summit-sponsers-{model_name}.csv"
destination = f"/content/drive/MyDrive/{file_name}"
df.to_csv(destination, index=False, encoding='utf-8')