In [34]:
"""
Strategy to Develop Web Scraping Bot:
1. Make a google search with company name
2. Filter the google searches using a vocabulary list of popular social media providers
3. If found, open the link and then extract the respective information (for now just handles)
4. If none found in the first 2 pages (limited to 2 pages to avoid lag) then move to the next
5. Collate recent 3 posts from these websites and collate them into a seperate column
6. Scrape latest activity from the social media websites.
"""

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import os
import time
from getpass import getpass
import requests
from pprint import pprint
import os

start_time = time.time()
api_key = "AIzaSyCrWm8f4RNqbLtlK4bMTfyVVAG-g5KqEE0"


social_media_sites = [
    "https://www.facebook.com/",
    "https://twitter.com/",
    "https://www.instagram.com/",
    "https://www.linkedin.com/",
    "https://www.pinterest.com/",
    "https://www.snapchat.com/",
    "https://www.tiktok.com/",
    "https://www.reddit.com/",
    "https://www.youtube.com/",
    "https://www.whatsapp.com/",
    "https://www.tumblr.com/",
    "https://www.flickr.com/",
    "https://www.quora.com/",
    "https://medium.com/",
    "https://discord.com/",
    "https://telegram.org/",
    "https://www.viber.com/",
    "https://www.wechat.com/",
    "https://line.me/",
    "https://vk.com/",
    'https://sg.linkedin.com/company/'
]



# MAKING A GOOGLE QUERY AND EXTRACTING HEADINGS
def google_search(query):
    url = f"https://www.google.com/search?q={query}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return None
    
def extract_headings_and_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    results = []
    for heading in headings:
        heading_text = heading.text.strip()
        link = heading.find_parent('a')
        if link:
            url = link.get('href')
            results.append({'heading': heading_text, 'url': url})

    return results

################################################################################################################

def filter_social_media(searched_data):
    filtered_sites = []
    for entry in searched_data:
        if (not entry['url']):
            continue
        

        for social_handles in social_media_sites:
            if social_handles in entry['url']:
                #successfully found a site O(n^2)
                filtered_sites.append(entry)
    
    return filtered_sites


def get_social_media_urls(organization_name):
    # This function should perform the necessary steps to get social media URLs for a given organization name
    # You can use your existing functions like google_search and extract_headings_and_links here
    # Make sure to return the list of social media URLs
    html_content = google_search(organization_name)
    if html_content:
        extracted_data = extract_headings_and_links(html_content)
        social_handles = filter_social_media(extracted_data)
        return [entry['url'] for entry in social_handles]
    

    return []

def get_top_links(organisation_name):
    html_content = google_search(organisation_name)
    link_limit = 3
    if html_content:
        extracted_data = extract_headings_and_links(html_content)
        urls = []

        isSocialLink = False
        for entry in extracted_data:
            if (not entry['url']):
                continue

            if (len(urls) < link_limit):
                for social_link in social_media_sites:
                    if (social_link in entry['url']):
                        isSocialLink = True
            
                if (not isSocialLink):
                    urls.append(entry['url'])
                isSocialLink = False
            else:
                break
                
        return urls
    return []

def extract_textual_content_from_links(list_links):
    # Send an HTTP request to the URL
    textual_extraction = ""
    total_char_limit = 2000

    for link in list_links:
        if (len(textual_extraction) < total_char_limit):
            response = requests.get(link)

            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract all text from the page
                text = soup.get_text()
                text = soup.get_text().replace('\n', ' ')

                textual_extraction += text

                
            else:
                # Print an error message if the request was not successful
                print(f"Error: Unable to fetch the content from {link}. Status code: {response.status_code}")

    return textual_extraction


#Go to home page link, extract all textual content, feed it into a LLM, let the LLM extract the mission and description of what the company does.
#Next step after the LLM generation is to make RAG system and feedback function to improve retreival quality
#testing right now with just the SG data


In [35]:
dataset = pd.read_excel("./EnvNP_SG.xlsx")
dataset['Top Google Links'] = dataset['Name of organisation'].apply(get_top_links)
print(dataset)

                                 Name of organisation  \
0                      Nature Society Singapore (NSS)   
1                                       WWF Singapore   
2                                       Zero Waste SG   
3                                             PM.Haze   
4                     Centre for a Responsible Future   
5                          Ground-Up Initiative (GUI)   
6                Conservation International Singapore   
7                  Singapore Youth for Climate Action   
8                             Waterways Watch Society   
9                       Singapore Environment Council   
10                                        PlasticLite   
11                                    Repair Kopitiam   
12                               Foodscape Collective   
13                                          LepakInSg   
14                              Cicada Tree Eco-Place   
15                                      Earth Society   
16                           Gr

In [36]:
dataset.head()

Unnamed: 0,Name of organisation,Description of organisation,Mission/ Objectives/ Purpose,Programmes/ projects,Funding sources,Collaboration with government / businesses,Choice of Climate action,No. of employees,Geographical focus,Nationality,Social Medias,Top Google Links
0,Nature Society Singapore (NSS),The Nature Society (Singapore) or NSS is a non...,- Organise nature appreciation activities like...,"- guided nature walks, bird and butterfly watc...","Run by volunteers, the Society depends financi...",Yes - businesses,Advocacy/ Mitigation,43,"Singapore, Singapore",,['https://www.facebook.com/naturesocietysingap...,"[https://www.nss.org.sg/, https://en.wikipedia..."
1,WWF Singapore,WWF-Singapore was founded in March 2006 to eng...,SUSTAIN THE NATURAL WORLD FOR THE BENEFIT OF P...,Climate: Net-zero carbon & Sustainable finance...,- Donations from individuals\n- Major donors \...,Yes - businesses,Advocacy/ Mitigation,39+,"Singapore, Singapore",,"['https://www.facebook.com/wwfsg/', 'https://w...","[https://www.wwf.sg/, https://wwf.panda.org/ww..."
2,Zero Waste SG,Zero Waste SG is a charity and non-governmenta...,Leading the drive towards zero waste in Singap...,1. BYO Singapore\n2. Zero Waste School\n3. Let...,1. Donations\n2. Coporate funding\n3. In-kind ...,Yes - businesses and government agencies,Advocacy/ Mitigation,9,"Singapore, Singapore",Singaporean,"['https://www.facebook.com/zerowastesg/', 'htt...","[http://www.zerowastesg.com/, https://www.towa..."
3,PM.Haze,"People’s Movement to Stop Haze, known as PM Ha...",Vision: We envision a world where everyone fee...,1. Haze-Free Foodstand campaign\n2. Instagram ...,PM Haze is financially supported by the Singap...,Yes - businesses and schools,Advocacy/ Mitigation,9,"Singapore, Indonesia, Malaysia",Singaporean,"['https://sg.linkedin.com/company/pm-haze', 'h...","[https://www.pmhaze.org/, https://en.wikipedia..."
4,Centre for a Responsible Future,The Centre for a Responsible Future (CRF) is a...,We inspire and support people and organisation...,1. EarthFest\n2. Veganuary\n3. Community Partn...,- grants \n- business membership\n- individual...,Yes - businesses,Advocacy/ Mitigation,5,Singapore,Singaporean,"['https://www.facebook.com/crforgsg/', 'https:...","[https://www.crf.org.sg/, https://www.giving.s..."


In [18]:
dataset['New_Description'] = dataset['Organisation_Link'].apply(extract_textual_content_from_link)
dataset.head()

KeyError: 'Organisation_Link'

In [11]:
pprint(dataset['Description of organisation'][0])

('The Nature Society (Singapore) or NSS is a non-government, non-profit '
 'organisation dedicated to the appreciation, conservation, study and '
 'enjoyment of the natural heritage in Singapore, Malaysia and the surrounding '
 'region. It was formerly known as the Singapore branch of the Malayan Nature '
 'Society. The branch was formed in 1954 and became Nature Society (Singapore) '
 'in 1991.')
