Test `Google Search API`

In [None]:
from langchain.tools import Tool
from langchain_community.utilities import GoogleSearchAPIWrapper

# Instantiate the Google Search wrapper with its default configuration.
search = GoogleSearchAPIWrapper()

# Expose the wrapper's `run` method as a LangChain tool; the same `tool`
# object is handed to the agents created in later cells.
tool = Tool(name="Google Search", description="Search Google for recent results.", func=search.run)

# Smoke test: run a single query (the notebook shows the returned value).
tool.run("888 US Services Inc.")



Use `selenium`

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.webdriver import WebDriver 
from selenium.common.exceptions import ElementClickInterceptedException 

import time
import re
from bs4 import BeautifulSoup


def getFirstLinkFor( driver: WebDriver, company: str ):
    """Search Google for ``company`` and try to open the first result link.

    The function loads the Google page with the query in the URL, submits the
    search box, waits for the element with id ``search`` and clicks the first
    ``<a>`` found inside it. When the click is intercepted by another element,
    the class name of that element is parsed out of the exception message and
    the function waits for it to become clickable.

    Args:
        driver: Selenium Chrome driver used to perform the search.
        company: Query text; it is interpolated verbatim into the URL, so
            callers are expected to URL-encode it (e.g. ``urllib.parse.quote``).

    Returns:
        A ``(href, clickable)`` tuple. ``href`` is the ``href`` attribute of the
        first link in the results. ``clickable`` is the intercepting element
        once it is clickable, or ``None`` when the direct click succeeded or
        the wait for the intercepting element failed.

    Raises:
        IndexError: no link was found inside the results container.
        Exception: the click was intercepted but the intercepting element
            could not be identified from the exception message.
        Any exception raised by the driver or by ``WebDriverWait`` while
        loading the page or waiting for the results is propagated unchanged.
    """
    # Perform a Google search
    driver.get( f"https://www.google.com?hl=en&q={company}" )

    search_box = driver.find_element( By.NAME, "q" )
    search_box.submit()

    results = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'search')))

    links = results.find_elements(By.CSS_SELECTOR, "a")
    if not links:
        raise IndexError(f'no result link found for "{company}"')

    first_link = links[0]
    # Read the href BEFORE clicking: a successful click may navigate away and
    # leave the element stale, making a later attribute lookup fail.
    href = first_link.get_attribute("href")

    # Stays None when the direct click works, so the return below never
    # references an unassigned name.
    clickable = None

    try:
        first_link.click()
    except ElementClickInterceptedException as e:
        pattern = r'Other element would receive the click: <(?:div|li|ul) class="(\w+)">'
        match = re.search(pattern, e.msg or "")
        if match is None:
            raise Exception('it is not possible identify clickable element!') from e
        try:
            clickable = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, match.group(1))))
        except Exception:
            # Best effort: the caller is told there is nothing to click.
            clickable = None

    return ( href, clickable )


Test using `Selenium` to extract the first link from the search results.

In [None]:
from urllib.parse import quote
# Set up the Chrome options for headless mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("accept-language=en")
# Initialize the Chrome driver
driver = webdriver.Chrome(options=chrome_options)

# The try/finally guarantees the headless browser is shut down even when the
# search or the page load fails.
try:
    # Alternative inputs kept for manual testing:
    # link = getFirstLinkFor( driver, company="ACHMEA INTERNE DIENSTEN N.V." )
    # link,clickable = getFirstLinkFor( driver, company=quote("AT&T Global Network Services (UK) B.V. - UK Branch") )
    link, clickable = getFirstLinkFor( driver, company=quote("Al Nafitha International Information Technology Company") )

    print(link)

    # getFirstLinkFor may return None instead of an element; only click and
    # dump the page when there is something to click.
    if clickable is not None:
        clickable.click()

        WebDriverWait(driver, 15).until(EC.url_changes(link))

        soup = BeautifulSoup(driver.page_source, "html.parser")
        print( soup.text )
    else:
        print("no clickable element found; page text not extracted")
finally:
    driver.quit()

Read / Write CSV using **pandas**

In [None]:
import pandas as pd
# Load the customer list into a DataFrame, decoding the file as latin-1.
df = pd.read_csv(
    '../docs/customers-list-update.csv',
    encoding='latin-1',
)

# print( df[1:] )

In [None]:
import numpy as np
from urllib.parse import quote
from bs4 import BeautifulSoup

import os, time
from langchain.agents import initialize_agent
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import LLMChain

# Make sure the output columns exist before processing.
if 'url' not in df.columns:
    # Add 'url' column
    df['url'] = None
if 'industry' not in df.columns:
    # Add 'industry' column; "unknown" marks rows that still need processing
    df['industry'] = "unknown"


# Configure OpenAI API
llm = AzureChatOpenAI(deployment_name="document-parser", 
                    openai_api_key= os.getenv("AZURE_OPENAI_API_KEY"),
                    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"))

# The page text of the company web site is injected as {company}.
prompt_template = PromptTemplate(
    input_variables=["company"],
    template=""" 
Form the company presentation text attached below i want that you analyse it and return the industries which it belong. 
To do this you must follow the intruction below:

- the answer must contain only the industries definition as ole line separated by comma. Not include company name or other descriptions.

COMPANY PRESENTATION:
{company}                                               
                                               
""")

chain = LLMChain( llm=llm, prompt=prompt_template,verbose=False)

# Set up the Chrome options for headless mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("accept-language=en")
# Initialize the Chrome driver
driver = webdriver.Chrome(options=chrome_options)

# try/finally: always shut the browser down, whatever happens in the loop.
try:
    # Only rows whose industry has not been resolved yet are processed.
    df_filtered = df[df['industry'] == "unknown"]
    for index, row in df_filtered.iterrows():

        # A failure on one customer is reported and must not stop the batch.
        try:

            link,clickable = getFirstLinkFor( driver, company=quote(row['Customer']) )
            print( index, row['Customer'], link )

            # Store the link only when the row has no url yet. pd.isna covers
            # both NaN (value read from the CSV) and None (freshly added column).
            if pd.isna(row['url']):
                df.at[ index, 'url' ] = link

            if clickable is not None:
                clickable.click()
                WebDriverWait(driver, 10).until(EC.url_changes(link))
                time.sleep(1)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                industry = chain.run(  soup.text )

                df.at[ index, 'industry' ] = industry
                print( index, industry )

        except Exception as e:
            print( row['Customer'], "error", str(e) )
finally:
    driver.quit()

# Write back with the same encoding used to read the file, so non-ASCII names
# survive the next read; characters latin-1 cannot represent are replaced
# instead of aborting the save.
df.to_csv('../docs/customers-list-update.csv', index=False, encoding='latin-1', errors='replace')


In [None]:
import os, time
from langchain.agents import initialize_agent
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI

# Configure OpenAI API
llm = AzureChatOpenAI(deployment_name="document-parser", 
                    openai_api_key= os.getenv("AZURE_OPENAI_API_KEY"),
                    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"))


# Zero-shot ReAct agent whose only tool is the Google Search tool built in the
# first cell.
agent = initialize_agent([ tool ], llm, agent="zero-shot-react-description", verbose=False, handle_parsing_errors=True)

# Step 3 asks for "UNKNOWN" when NO industry is found; the previous wording
# ("if any industries is found") asked for the opposite.
prompt_template = PromptTemplate.from_template(""" 
For the company with name "{company}" i want that you analyse the company names and return the industries which it belong. 
To do this you must follow the intruction below:

1. you must first try to infer industry names from the company name
2. if you cannot infer the industry names from the previous step then search on google for the company name and extract the industrie names from the web page
3. if no industry is found then you must return as answer only "UNKNOWN"

the answer must contain only the industry names and not the company name.

""")

# Try the agent on a small slice of rows (positions 11..19).
for index, row in df.iloc[11:20].iterrows():

    # The company name is the first column; .iloc makes the positional access
    # explicit instead of relying on the deprecated integer fallback of row[0].
    company_name = row.iloc[0]

    try:
        industry = agent.run( prompt_template.format(company=company_name) )

        print( industry )

        # Short pause between requests.
        time.sleep(1)
    except Exception as e:
        print( company_name, "error", str(e) )



In [None]:
import os, time
from langchain.agents import initialize_agent
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI

# NOTE(review): this cell appears to repeat the previous agent cell; consider
# keeping only one of them.

# Configure OpenAI API
llm = AzureChatOpenAI(deployment_name="document-parser", 
                    openai_api_key= os.getenv("AZURE_OPENAI_API_KEY"),
                    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"))


# Zero-shot ReAct agent whose only tool is the Google Search tool built in the
# first cell.
agent = initialize_agent([ tool ], llm, agent="zero-shot-react-description", verbose=False, handle_parsing_errors=True)

# Step 3 asks for "UNKNOWN" when NO industry is found; the previous wording
# ("if any industries is found") asked for the opposite.
prompt_template = PromptTemplate.from_template(""" 
For the company with name "{company}" i want that you analyse the company names and return the industries which it belong. 
To do this you must follow the intruction below:

1. you must first try to infer industry names from the company name
2. if you cannot infer the industry names from the previous step then search on google for the company name and extract the industrie names from the web page
3. if no industry is found then you must return as answer only "UNKNOWN"

the answer must contain only the industry names and not the company name.

""")

# Try the agent on a small slice of rows (positions 11..19).
for index, row in df.iloc[11:20].iterrows():

    # The company name is the first column; .iloc makes the positional access
    # explicit instead of relying on the deprecated integer fallback of row[0].
    company_name = row.iloc[0]

    try:
        industry = agent.run( prompt_template.format(company=company_name) )

        print( industry )

        # Short pause between requests.
        time.sleep(1)
    except Exception as e:
        print( company_name, "error", str(e) )

