In [86]:
"""
Strategy to Develop Web Scraping Bot:
1. Make a google search with company name
2. Filter the google searches using a vocabulary list of popular social media providers
3. If found, open the link and then extract the respective information (for now just handles)
4. If none found in the first 2 pages (limited to 2 pages to avoid lag) then move to the next
5. Collate recent 3 posts from these websites and collate them into a seperate column
6. Scrape latest activity from the social media websites.
"""

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import os
import time
from getpass import getpass
import requests
from pprint import pprint
import os

start_time = time.time()
API_KEY = "AIzaSyCrWm8f4RNqbLtlK4bMTfyVVAG-g5KqEE0"


social_media_sites = [
    "https://www.facebook.com/",
    "https://twitter.com/",
    "https://www.instagram.com/",
    "https://www.linkedin.com/",
    "https://www.pinterest.com/",
    "https://www.snapchat.com/",
    "https://www.tiktok.com/",
    "https://www.reddit.com/",
    "https://www.youtube.com/",
    "https://www.whatsapp.com/",
    "https://www.tumblr.com/",
    "https://www.flickr.com/",
    "https://www.quora.com/",
    "https://medium.com/",
    "https://discord.com/",
    "https://telegram.org/",
    "https://www.viber.com/",
    "https://www.wechat.com/",
    "https://line.me/",
    "https://vk.com/",
    'https://sg.linkedin.com/company/'
]



# MAKING A GOOGLE QUERY AND EXTRACTING HEADINGS
def google_search(query):
    url = f"https://www.google.com/search?q={query}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return None
    
def extract_headings_and_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    results = []
    for heading in headings:
        heading_text = heading.text.strip()
        link = heading.find_parent('a')
        if link:
            url = link.get('href')
            results.append({'heading': heading_text, 'url': url})

    return results

################################################################################################################

def filter_social_media(searched_data):
    filtered_sites = []
    for entry in searched_data:
        if (not entry['url']):
            continue
        

        for social_handles in social_media_sites:
            if social_handles in entry['url']:
                #successfully found a site O(n^2)
                filtered_sites.append(entry)
    
    return filtered_sites


def get_social_media_urls(organization_name):
    # This function should perform the necessary steps to get social media URLs for a given organization name
    # You can use your existing functions like google_search and extract_headings_and_links here
    # Make sure to return the list of social media URLs
    html_content = google_search(organization_name)
    if html_content:
        extracted_data = extract_headings_and_links(html_content)
        social_handles = filter_social_media(extracted_data)
        return [entry['url'] for entry in social_handles]
    

    return []

def get_top_links(organisation_name):
    html_content = google_search(organisation_name)
    link_limit = 3
    if html_content:
        extracted_data = extract_headings_and_links(html_content)
        urls = []

        isSocialLink = False
        for entry in extracted_data:
            if (not entry['url']):
                continue

            if (len(urls) < link_limit):
                for social_link in social_media_sites:
                    if (social_link in entry['url']):
                        isSocialLink = True
            
                if (not isSocialLink):
                    urls.append(entry['url'])
                isSocialLink = False
            else:
                break
                
        return urls
    return []

def extract_textual_content_from_links(list_links):
    # Send an HTTP request to the URL
    textual_extraction = ""
    total_char_limit = 2400
    total_space_avail = total_char_limit
    textual_data = []
    available_chars = []
    
    for link in list_links:
        try:
            response = requests.get(link)

            # Check if the request was successful (status code 200)
            
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract all text from the page
                text = soup.get_text()
                text = soup.get_text().replace('\n', ' ')

                #make a percentage based split
                textual_data.append(text)

        
            else:
                # Print an error message if the request was not successful
                print(f"Error: Unable to fetch the content from {link}. Status code: {response.status_code}")
        except:
            # Handle the MissingSchema exception by printing an error message
            print(f"Error: Skipping link: {link}")

    total_len = 0
    for text in textual_data:
        total_len += len(text)
    
    for index in range(len(textual_data)):
        available_chars.append(round((len(textual_data[index])/total_len) * total_char_limit))
    
    for index in range(len(textual_data)):
        

        text = textual_data[index][:available_chars[index]]
        textual_extraction += text
            

    return textual_extraction


#Go to home page link, extract all textual content, feed it into a LLM, let the LLM extract the mission and description of what the company does.
#Next step after the LLM generation is to make RAG system and feedback function to improve retreival quality
#testing right now with just the SG data


In [53]:
dataset = pd.read_excel("./EnvNP_SG.xlsx")
dataset['Top Google Links'] = dataset['Name of organisation'].apply(get_top_links)
print(dataset)

                                 Name of organisation  \
0                      Nature Society Singapore (NSS)   
1                                       WWF Singapore   
2                                       Zero Waste SG   
3                                             PM.Haze   
4                     Centre for a Responsible Future   
5                          Ground-Up Initiative (GUI)   
6                Conservation International Singapore   
7                  Singapore Youth for Climate Action   
8                             Waterways Watch Society   
9                       Singapore Environment Council   
10                                        PlasticLite   
11                                    Repair Kopitiam   
12                               Foodscape Collective   
13                                          LepakInSg   
14                              Cicada Tree Eco-Place   
15                                      Earth Society   
16                           Gr

In [39]:
dataset.head()

Unnamed: 0,Name of organisation,Description of organisation,Mission/ Objectives/ Purpose,Programmes/ projects,Funding sources,Collaboration with government / businesses,Choice of Climate action,No. of employees,Geographical focus,Nationality,Social Medias,Top Google Links
0,Nature Society Singapore (NSS),The Nature Society (Singapore) or NSS is a non...,- Organise nature appreciation activities like...,"- guided nature walks, bird and butterfly watc...","Run by volunteers, the Society depends financi...",Yes - businesses,Advocacy/ Mitigation,43,"Singapore, Singapore",,['https://www.facebook.com/naturesocietysingap...,"[https://www.nss.org.sg/, https://en.wikipedia..."
1,WWF Singapore,WWF-Singapore was founded in March 2006 to eng...,SUSTAIN THE NATURAL WORLD FOR THE BENEFIT OF P...,Climate: Net-zero carbon & Sustainable finance...,- Donations from individuals\n- Major donors \...,Yes - businesses,Advocacy/ Mitigation,39+,"Singapore, Singapore",,"['https://www.facebook.com/wwfsg/', 'https://w...","[https://www.wwf.sg/, https://wwf.panda.org/ww..."
2,Zero Waste SG,Zero Waste SG is a charity and non-governmenta...,Leading the drive towards zero waste in Singap...,1. BYO Singapore\n2. Zero Waste School\n3. Let...,1. Donations\n2. Coporate funding\n3. In-kind ...,Yes - businesses and government agencies,Advocacy/ Mitigation,9,"Singapore, Singapore",Singaporean,"['https://www.facebook.com/zerowastesg/', 'htt...","[http://www.zerowastesg.com/, https://www.towa..."
3,PM.Haze,"People’s Movement to Stop Haze, known as PM Ha...",Vision: We envision a world where everyone fee...,1. Haze-Free Foodstand campaign\n2. Instagram ...,PM Haze is financially supported by the Singap...,Yes - businesses and schools,Advocacy/ Mitigation,9,"Singapore, Indonesia, Malaysia",Singaporean,"['https://sg.linkedin.com/company/pm-haze', 'h...","[https://www.pmhaze.org/, https://en.wikipedia..."
4,Centre for a Responsible Future,The Centre for a Responsible Future (CRF) is a...,We inspire and support people and organisation...,1. EarthFest\n2. Veganuary\n3. Community Partn...,- grants \n- business membership\n- individual...,Yes - businesses,Advocacy/ Mitigation,5,Singapore,Singaporean,"['https://www.facebook.com/crforgsg/', 'https:...","[https://www.crf.org.sg/, https://www.giving.s..."


In [61]:
dataset['New_Description'] = dataset['Top Google Links'].apply(extract_textual_content_from_links)
dataset.head()

Error: Unable to fetch the content from https://patron.groundupinitiative.org/. Status code: 403
Error: Unable to fetch the content from https://syca.sg/. Status code: 406
Error: Unable to fetch the content from https://syca.sg/about-2/. Status code: 406
Error: Skipping link: /search?q=Waterways+Watch+Society&sca_esv=be1f4daf33da1a0f&ei=6IS6ZdjzFpSu4-EPnaC14AY&start=10&sa=N
Error: Unable to fetch the content from https://cityofgood.sg/articles/repair-kopitiam/. Status code: 403
Error: Unable to fetch the content from https://cityofgood.sg/articles/foodscape-collective/. Status code: 403
Error: Unable to fetch the content from https://cicadatree.org.sg/. Status code: 403
Error: Unable to fetch the content from https://www.greendrinks.org/Singapore/clist. Status code: 406
Error: Unable to fetch the content from https://www.hemispheresfund.org/. Status code: 406
Error: Unable to fetch the content from https://cityhallsingapore.com/hemispheres-foundation/. Status code: 406
Error: Unable to

Unnamed: 0,Name of organisation,Description of organisation,Mission/ Objectives/ Purpose,Programmes/ projects,Funding sources,Collaboration with government / businesses,Choice of Climate action,No. of employees,Geographical focus,Nationality,Social Medias,Top Google Links,New_Description
0,Nature Society Singapore (NSS),The Nature Society (Singapore) or NSS is a non...,- Organise nature appreciation activities like...,"- guided nature walks, bird and butterfly watc...","Run by volunteers, the Society depends financi...",Yes - businesses,Advocacy/ Mitigation,43,"Singapore, Singapore",,['https://www.facebook.com/naturesocietysingap...,"[https://www.nss.org.sg/, https://en.wikipedia...",\r \tNature Society (Singapore)\r ...
1,WWF Singapore,WWF-Singapore was founded in March 2006 to eng...,SUSTAIN THE NATURAL WORLD FOR THE BENEFIT OF P...,Climate: Net-zero carbon & Sustainable finance...,- Donations from individuals\n- Major donors \...,Yes - businesses,Advocacy/ Mitigation,39+,"Singapore, Singapore",,"['https://www.facebook.com/wwfsg/', 'https://w...","[https://www.wwf.sg/, https://wwf.panda.org/ww...",WWF-Singapore | Home ...
2,Zero Waste SG,Zero Waste SG is a charity and non-governmenta...,Leading the drive towards zero waste in Singap...,1. BYO Singapore\n2. Zero Waste School\n3. Let...,1. Donations\n2. Coporate funding\n3. In-kind ...,Yes - businesses and government agencies,Advocacy/ Mitigation,9,"Singapore, Singapore",Singaporean,"['https://www.facebook.com/zerowastesg/', 'htt...","[http://www.zerowastesg.com/, https://www.towa...",Zero Waste SG – Leading the drive towa...
3,PM.Haze,"People’s Movement to Stop Haze, known as PM Ha...",Vision: We envision a world where everyone fee...,1. Haze-Free Foodstand campaign\n2. Instagram ...,PM Haze is financially supported by the Singap...,Yes - businesses and schools,Advocacy/ Mitigation,9,"Singapore, Indonesia, Malaysia",Singaporean,"['https://sg.linkedin.com/company/pm-haze', 'h...","[https://www.pmhaze.org/, https://en.wikipedia...",...
4,Centre for a Responsible Future,The Centre for a Responsible Future (CRF) is a...,We inspire and support people and organisation...,1. EarthFest\n2. Veganuary\n3. Community Partn...,- grants \n- business membership\n- individual...,Yes - businesses,Advocacy/ Mitigation,5,Singapore,Singaporean,"['https://www.facebook.com/crforgsg/', 'https:...","[https://www.crf.org.sg/, https://www.giving.s...",Centre for a Responsible Future ...


In [65]:
dataset['New_Description'][0]

"   \r \tNature Society (Singapore)\r                       \r     \xa0\r       Home Facebook RSS  Register Contact Us          News  News Press Release Nature News Events and Calendar Event Payment   Resources  Nature Watch Magazine Publications and Reports Forum Gallery Species Chec    Nature Society (Singapore) - Wikipedia                           Jump to content        Main menu      Main menu move to sidebar hide    \t\tNavigation \t   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate      \t\tContribute \t   HelpLearn to editCommunity portalRecent changesUpload file      Languages  Language links are at the top of the page.                    Search            Search                              Create account  Log in         Personal tools       Create account Log in      \t\tPages for logged out editors learn more    ContributionsTalk                             Contents move to sidebar hide     (Top)      1History        2Major conservation issues  

In [63]:
pprint(dataset['Description of organisation'][0])

('The Nature Society (Singapore) or NSS is a non-government, non-profit '
 'organisation dedicated to the appreciation, conservation, study and '
 'enjoyment of the natural heritage in Singapore, Malaysia and the surrounding '
 'region. It was formerly known as the Singapore branch of the Malayan Nature '
 'Society. The branch was formed in 1954 and became Nature Society (Singapore) '
 'in 1991.')


In [66]:
!pip3 install llama-index 'google-generativeai>=0.3.0' matplotlib qdrant_client cohere protobuf~=4.21


Collecting llama-index
  Downloading llama_index-0.9.40-py3-none-any.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting google-generativeai>=0.3.0
  Downloading google_generativeai-0.3.2-py3-none-any.whl (146 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.9/146.9 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting matplotlib
  Downloading matplotlib-3.8.2-cp39-cp39-macosx_10_12_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting qdrant_client
  Downloading qdrant_client-1.7.2-py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.2/206.2 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cohere
  Downloading cohere-4.45-py3-none-any.whl (52 kB)
[2K     [9

In [67]:
!pip3 install trulens_eval==0.19.1 chromadb==0.4.18 openai==1.3.7 llama_index cohere kaleido python-multipart google-cloud-aiplatform==1.36.3 litellm==1.11.1 langchain==0.0.347 streamlit_javascript

Collecting trulens_eval==0.19.1
  Downloading trulens_eval-0.19.1-py3-none-any.whl (630 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m630.5/630.5 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting chromadb==0.4.18
  Downloading chromadb-0.4.18-py3-none-any.whl (502 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.4/502.4 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting openai==1.3.7
  Downloading openai-1.3.7-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-macosx_10_11_x86_64.whl (85.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━

In [84]:
pip install google-generativeai


Collecting google-generativeai
  Using cached google_generativeai-0.3.2-py3-none-any.whl (146 kB)
Collecting protobuf
  Using cached protobuf-4.25.2-cp37-abi3-macosx_10_9_universal2.whl (394 kB)
Collecting google-api-core
  Using cached google_api_core-2.16.1-py3-none-any.whl (135 kB)
Collecting google-ai-generativelanguage==0.4.0
  Using cached google_ai_generativelanguage-0.4.0-py3-none-any.whl (598 kB)
Collecting google-auth
  Using cached google_auth-2.27.0-py2.py3-none-any.whl (186 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3
  Using cached proto_plus-1.23.0-py3-none-any.whl (48 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2
  Using cached googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)
Collecting pyasn1-modules>=0.2.1
  Using cached pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)
Collecting cachetools<6.0,>=2.0.0
  Using cached cachetools-5.3.2-py3-none-any.whl (9.3 kB)
Collecting rsa<5,>=3.1.4
  Using cached rsa-4.9-py3-none-any.whl (34 kB)
Collecting 

In [89]:
import google.generativeai as genai

def gemini_response(scraped_info):
    try:
        prompt = f"Can you give an elaborate one paragraph description about the company from this scraped info {scraped_info}?"
        genai.configure(api_key=API_KEY)
        model = genai.GenerativeModel("gemini-pro")
        response = model.generate_content(prompt)
        return response.text
    except:
        return "Failed to fetch a response"


In [90]:
dataset['LLM_Extracted_Text'] = dataset['New_Description'].apply(gemini_response)

In [97]:
pprint(dataset['LLM_Extracted_Text'][0])
pprint("")
pprint(dataset['Description of organisation'][0])

('The Nature Society (Singapore) is a non-profit organization dedicated to '
 'promoting the conservation of the natural environment and biodiversity in '
 'Singapore and the region. Founded in 1954, the society conducts various '
 'activities and programs to achieve its mission, including organizing nature '
 'walks, talks, and workshops; publishing nature-related books, magazines, and '
 'reports; conducting research on local flora and fauna; and advocating for '
 'the protection of natural habitats. The society also works closely with '
 'government agencies, educational institutions, and other organizations to '
 'raise awareness about environmental issues and promote sustainable '
 'practices. Additionally, the society offers resources and information on '
 'local wildlife, conservation initiatives, and environmental education '
 'through its website, publications, and social media platforms.')
''
('The Nature Society (Singapore) or NSS is a non-government, non-profit '
 'organisa

In [99]:
dataset.to_csv('./EnvNP_SG.csv')