# Requirements:

requests
beautifulsoup4
json (standard library)
re (standard library)
openpyxl
pandas
urllib (standard library)
os (standard library)
selenium
python-dotenv
google-generativeai


# Meetup:

In [1]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_meetup_events(keyword, location=None, daterange=None, eventType=None):
    """
    Search Meetup and return the links found in the results container as JSON.

    Parameters:
    - keyword: str, search phrase (required)
    - location: str, optional value for the 'location' query parameter (e.g. "in--Pune")
    - daterange: str, optional value for the 'daterange' query parameter
    - eventType: str, optional value for the 'eventType' query parameter

    Returns:
    - str: JSON array of {"text": ..., "href": ...} objects, one per <a> tag
      inside the element with class "max-w-narrow"
    - None: when the keyword is missing, the response status is not 200, or
      the results container is not present in the page
    """
    # Local import: keeps the function independent of imports made in other cells
    from urllib.parse import quote, urlencode

    if not keyword:
        print("Keyword is required")
        return None

    base_url = 'https://www.meetup.com/find/?'

    # Optional filters are only sent when they carry a value
    query = {'keywords': keyword}
    if location:
        query['location'] = location
    if daterange:
        query['daterange'] = daterange
    if eventType:
        query['eventType'] = eventType

    # quote_via=quote percent-encodes every reserved character (a space becomes
    # %20), so a keyword containing '&', '#' or '+' cannot corrupt the query
    url = base_url + urlencode(query, quote_via=quote)
    print(f"Scraping URL: {url}")

    # A timeout stops the call from hanging forever on an unresponsive server
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # The search results live inside the element carrying this class
    results_container = soup.find(class_="max-w-narrow")
    if results_container is None:
        print("No results found for the specified class.")
        return None

    # find_all is recursive, so a single call visits every link exactly once;
    # looping over every descendant first would repeat each link once per
    # ancestor and skip links that are direct children of the container
    results = [
        {
            'text': link.get_text(strip=True),
            'href': link.get('href'),
        }
        for link in results_container.find_all('a')
    ]

    meetup_json_output = json.dumps(results, indent=4)
    print(meetup_json_output)
    return meetup_json_output

# Example usage: search Meetup around Pune; the empty daterange means no date filter
search_options = {
    "keyword": "generative ai in customer services",
    "location": "in--Pune",
    "daterange": "",
    "eventType": "EVENTS",
}
meetup_json_output = scrape_meetup_events(**search_options)


Scraping URL: https://www.meetup.com/find/?keywords=generative%20ai%20in%20customer%20services&location=in--Pune&eventType=EVENTS
[
    {
        "text": "Online Event",
        "href": "https://www.meetup.com/ai-for-fun-profit/events/305555224/?recId=25943e8a-a13b-47da-a4e3-05763835cf00&recSource=keyword_search&searchId=e9295eb0-a31a-4b33-9a9c-a52ef9f16dcb&eventOrigin=find_page$all"
    },
    {
        "text": "Tue, Jan 14 \u00b7 1:00 PM UTCUnlocking AI: Exploring Foundations, Creative Tools, Business & Career GrowthGroup name:AI For Fun and ProfitNew GroupGroup name:AI For Fun and ProfitNew Group",
        "href": "https://www.meetup.com/ai-for-fun-profit/events/305555224/?recId=25943e8a-a13b-47da-a4e3-05763835cf00&recSource=keyword_search&searchId=e9295eb0-a31a-4b33-9a9c-a52ef9f16dcb&eventOrigin=find_page$all"
    },
    {
        "text": "Online Event",
        "href": "https://www.meetup.com/bangalore-computer-vision-meetup-group/events/304308867/?recId=25943e8a-a13b-47da-a4e3-05

In [2]:
import json
import re
import openpyxl
from urllib.parse import urlparse
import pandas as pd


# Timestamp that prefixes a Meetup event card, e.g. "Tue, Jan 14 · 1:00 PM UTC".
# Groups: (1) day abbreviation, (2) month and day, (3) time including "UTC".
_EVENT_DATETIME_RE = re.compile(
    r'(\w{3}), (\w{3} \d{1,2}) \u00b7 (\d{1,2}:\d{2} (?:AM|PM) UTC)'
)


def parse_event_text(event_text):
    """
    Parse the text of an event to extract the day, date, time and keywords.

    Parameters:
    - event_text: str, text of an event link; None or "" is treated as no match

    Returns:
    - dict with the keys 'day', 'date', 'time', 'group_name' and 'keywords'.
      Every value is None when the text holds no timestamp. 'group_name' is
      never filled in by this function.
    """
    event_data = {
        "day": None,
        "date": None,
        "time": None,
        "group_name": None,
        "keywords": None
    }

    match = _EVENT_DATETIME_RE.search(event_text or "")
    if match is None:
        return event_data

    event_data["day"] = match.group(1)   # e.g., "Fri"
    event_data["date"] = match.group(2)  # e.g., "Oct 18"
    event_data["time"] = match.group(3)  # e.g., "7:00 AM UTC"

    # The keywords are whatever follows the matched timestamp. Slicing at
    # match.end() (instead of the first "UTC" anywhere in the text) keeps the
    # result correct when "UTC" also occurs before the timestamp.
    event_data["keywords"] = event_text[match.end():].strip()

    return event_data

def extract_group_name_from_url(url):
    """
    Extracts the group name from the meetup URL.

    The group name is the first non-empty path segment, with dashes replaced
    by spaces and each word capitalized
    ("/ai-for-fun-profit/events/1/" -> "Ai For Fun Profit").

    Parameters:
    - url: str, event URL; None or "" is accepted

    Returns:
    - str: the formatted group name, or "" when the URL has no path segment
    """
    if not url:
        return ""

    path = urlparse(url).path

    # Dropping empty segments copes with the leading slash and with URLs that
    # have no path at all, where indexing split('/')[1] raised IndexError
    segments = [segment for segment in path.split('/') if segment]
    if not segments:
        return ""

    return segments[0].replace('-', ' ').title()

def append_events_to_excel(json_data, excel_file):
    """
    Append event data from a JSON object to an Excel file, parsing the text field.

    Only entries whose text contains a timestamp recognised by
    parse_event_text are written; all other links are ignored.

    Parameters:
    - json_data: list of dicts with the keys 'text' and 'href'
    - excel_file: str, path of the workbook; it is created when missing
    """
    # Load the existing Excel file or create a new one
    try:
        workbook = openpyxl.load_workbook(excel_file)
    except FileNotFoundError:
        workbook = openpyxl.Workbook()

    # Select the first sheet or create it
    if 'Sheet1' in workbook.sheetnames:
        sheet = workbook['Sheet1']
    else:
        sheet = workbook.active
        sheet.title = 'Sheet1'

    # max_row is 1 both for an empty sheet and for a sheet that holds only the
    # header row, so max_row alone would add a second header whenever an
    # earlier run stored no events. The sheet counts as empty only when its
    # single cell A1 has no value.
    sheet_is_empty = (
        sheet.max_row == 1
        and sheet.max_column == 1
        and sheet.cell(row=1, column=1).value is None
    )
    if sheet_is_empty:
        headers = ['Day', 'Date', 'Time', 'Keywords', 'Group Name', 'Event URL']
        # Written into row 1 explicitly (not via append) so the header always
        # lands on the first row, even though A1 was just probed above
        for column, header in enumerate(headers, start=1):
            sheet.cell(row=1, column=column, value=header)

    for event in json_data:
        # parse_event_text leaves 'day' as None when the text has no timestamp,
        # which is how links that are not event cards are filtered out
        parsed_event = parse_event_text(event['text'])
        if parsed_event['day'] is None:
            continue

        group_name = extract_group_name_from_url(event['href'])

        # Column order matches the header row
        sheet.append([
            parsed_event['day'],
            parsed_event['date'],
            parsed_event['time'],
            parsed_event['keywords'],
            group_name,
            event['href']
        ])

    # Save the Excel file
    workbook.save(excel_file)
    print(f"Events successfully appended to {excel_file}")


def remove_duplicate_events(excel_file, output_file=None):
    """
    Drop rows that repeat an 'Event URL' value and write the result to Excel.

    Parameters:
    - excel_file: str, path to the input Excel file
    - output_file: str, path to the output Excel file (if None, overwrites the input file)

    Nothing is written when the input file is missing or has no 'Event URL'
    column; a message is printed instead.
    """
    url_column = 'Event URL'

    try:
        events = pd.read_excel(excel_file)
    except FileNotFoundError:
        print("The specified file does not exist.")
        return

    if url_column not in events.columns:
        print("The 'Event URL' column does not exist in the Excel file.")
        return

    # Without an explicit destination the input workbook is overwritten
    destination = excel_file if output_file is None else output_file

    deduplicated = events.drop_duplicates(subset=[url_column])
    deduplicated.to_excel(destination, index=False)
    print(f"Duplicate entries have been removed. Cleaned data saved to {destination}.")

# Example usage





# scrape_meetup_events returns None when the search produced nothing, and
# json.loads(None) raises a TypeError, so only continue when there is JSON
if meetup_json_output:
    # Convert the string to a Python list of dictionaries
    event_list = json.loads(meetup_json_output)

    # Call the function to append the events to Excel
    append_events_to_excel(event_list, 'Meetup_events_1.xlsx')

    remove_duplicate_events('Meetup_events_1.xlsx')
else:
    print("No Meetup results to save.")



Events successfully appended to Meetup_events_1.xlsx
Duplicate entries have been removed. Cleaned data saved to Meetup_events_1.xlsx.


# EventBrite:

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Function to construct the URL dynamically based on parameters
def construct_url(event_type=None, location='india--pune', keyword=None):
    """
    Build an Eventbrite search URL of the form
    https://www.eventbrite.com/d/[online/]<location>/<keyword>/.

    Parameters:
    - event_type: str, 'online' (any letter case) adds the 'online/' segment;
      any other value adds nothing
    - location: str, location segment of the path
    - keyword: str, search keyword (required)

    Returns:
    - str: the URL, with location and keyword percent-encoded

    Raises:
    - ValueError: when no keyword is given
    """
    # Local import: keeps the function independent of imports made in other cells
    from urllib.parse import quote

    if not keyword:
        raise ValueError("Keyword is required to construct the URL.")

    base_url = "https://www.eventbrite.com/d/"

    segments = []
    if event_type and event_type.lower() == 'online':
        segments.append('online')

    segments.append(quote(str(location)))

    # safe='' also encodes '/', so a keyword can never add a path segment;
    # spaces become %20 instead of being left raw in the URL
    segments.append(quote(keyword, safe=''))

    return base_url + '/'.join(segments) + '/'

# Function to fetch and scrape event details from a webpage
def scrape_event_data(url):
    """
    Fetch an Eventbrite listing page and extract one record per event card.

    Parameters:
    - url: str, address of the listing page

    Returns:
    - list of dicts with the keys 'Event URL' and 'Event Name', plus
      'Date and Time' and 'Event Location' when the card has matching <p>
      tags. Cards whose name was already seen and containers without an
      event link are skipped. An empty list is returned when the response
      status signals an error.
    """
    # A timeout stops the call from hanging forever on an unresponsive server
    response = requests.get(url, timeout=30)
    if not response.ok:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    event_data = []
    seen_names = set()  # Event names already collected from this page

    for container in soup.find_all('div', class_='Stack_root__1ksk7'):
        event_link = container.find('a', class_='event-card-link')

        # A container without a link has no name or URL to record. Keeping it
        # would add an empty row, and the location fallback below would fail
        # on event_link being None.
        if event_link is None:
            continue

        event_name = event_link.get('aria-label')
        if event_name in seen_names:
            continue
        seen_names.add(event_name)

        event_info = {
            'Event URL': event_link.get('href'),
            'Event Name': event_name,
        }

        # Candidate detail lines, minus the urgency badge
        p_tags = [
            p for p in container.find_all('p', class_='Typography_root__487rx')
            if 'EventCardUrgencySignal__label' not in p.get('class', [])
        ]

        if p_tags:
            # The first remaining <p> holds the date and time
            event_info['Date and Time'] = p_tags[0].text

            # The second <p>, when present, holds the location; otherwise fall
            # back to the data-event-location attribute of the link
            if len(p_tags) > 1:
                event_info['Event Location'] = p_tags[1].text
            else:
                event_info['Event Location'] = event_link.get('data-event-location', None)

        event_data.append(event_info)

    return event_data

# Function to append data to an existing Excel file, remove duplicates, and save
def append_to_excel(event_data, file_name):
    """
    Merge scraped events into an Excel workbook and write it back.

    When the workbook already exists its rows come first, the new rows are
    added after them, and rows repeating an 'Event Name' are dropped (the
    first occurrence is kept). Otherwise the workbook is created from the
    new rows alone.
    """
    column_order = ['Event URL', 'Event Name', 'Event Location', 'Date and Time']
    scraped_df = pd.DataFrame(event_data, columns=column_order)

    if not os.path.exists(file_name):
        # Nothing stored yet: the scraped rows become the whole workbook
        combined_df = scraped_df
    else:
        stored_df = pd.read_excel(file_name)
        combined_df = pd.concat([stored_df, scraped_df], ignore_index=True)
        combined_df.drop_duplicates(subset='Event Name', keep='first', inplace=True)

    combined_df.to_excel(file_name, index=False)

# Parameters provided by user
event_type = 'online'  # or None for in-person events
location = 'india--mumbai'  # or use the default 'india--pune'
keyword = 'customer service'  # This is mandatory
output_file = 'Eventbrite_events_1.xlsx'  # Workbook the events are merged into

# Construct the URL dynamically
url = construct_url(event_type=event_type, location=location, keyword=keyword)

# Scrape event data
event_data = scrape_event_data(url)

# Append to the Excel file and remove duplicates
append_to_excel(event_data, output_file)

# Report the file that was actually written (the message used to name a
# different workbook than the one passed to append_to_excel)
print(f"Scraping completed for URL: {url} and data saved/updated to '{output_file}'")


# Twitter:

In [None]:
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

# Search terms to run on Twitter: one free-text query followed by hashtags
hashtags = [
    'upcoming genai summits',
    '#GenerativeAI',
    '#CustomerServiceSummit',
    '#AIConferences',
    '#ServiceSummit',
]

# Function to extract mentions, hashtags, and links from tweet text
def extract_details(text):
    """
    Return a (mentions, hashtags, links) tuple of lists found in the tweet text.

    Mentions and hashtags are returned without their leading '@' / '#';
    links are every run of non-space characters starting with http:// or
    https://.
    """
    patterns = (
        r'@(\w+)',          # Twitter mentions
        r'#(\w+)',          # hashtags
        r'http[s]?://\S+',  # links (http or https)
    )
    mentions, tags, links = (re.findall(pattern, text) for pattern in patterns)
    return mentions, tags, links

# Function to scrape tweets for a given hashtag in a specific section (either "Top" or "Latest")
def scrape_tweets(driver, hashtag, section):
    """
    Search Twitter for one term in an open browser session and collect the
    tweets shown in the requested results tab.

    Parameters:
    - driver: Selenium WebDriver whose current page contains the
      "Search query" input
    - hashtag: str, text typed into the search box
    - section: str, label of the results tab to open (called with "Top" and
      "Latest" in this file)

    Returns:
    - list of dicts with the keys 'section', 'hashtag', 'username', 'date',
      'text', 'mentions', 'hashtags' and 'links'; an empty list when the
      search box, the section tab, or the first tweet does not appear within
      20 seconds. A tweet whose fields cannot be read is reported and skipped.
    """
    # Locate and clear the search box
    try:
        search_box = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//input[@aria-label="Search query"]'))
        )
        time.sleep(5)  # Wait after finding the search box
        search_box.click()  # Click to focus
        time.sleep(5)  # Wait after clicking the search box
        search_box.clear()  # Clear any existing text
        # Second way of emptying the box, on top of clear() above
        search_box.send_keys(Keys.CONTROL + "a")  # Select all text (CMD + "a" on Mac)
        search_box.send_keys(Keys.BACKSPACE)  # Delete selected text

        # Enter the hashtag and submit
        search_box.send_keys(hashtag)
        time.sleep(5)  # Wait after entering the hashtag
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)  # Wait after pressing return
    except TimeoutException:
        print("Search box not found.")
        return []

    # Wait for the search results to load and navigate to the specified section
    try:
        # The tab is found by the exact text of a <span>, so `section` must
        # match the label shown on the page
        section_tab = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, f'//span[text()="{section}"]'))
        )
        time.sleep(5)  # Wait after locating the section tab
        section_tab.click()
        time.sleep(5)  # Wait after clicking the section tab
    except TimeoutException:
        print(f"{section} tab not found.")
        return []

    # Wait for tweets to load
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//article[@role="article"]'))
        )
        time.sleep(10)  # Wait after tweets have loaded
    except TimeoutException:
        print("Tweets did not load in time.")
        return []

    # Scroll down to load more tweets
    for _ in range(3):  # Adjust the range to scroll more or less
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait after each scroll to allow tweets to load

    # Extract tweet elements
    # Only the <article> elements present after the last scroll are read
    tweets_data = []
    tweet_elements = driver.find_elements(By.XPATH, '//article[@role="article"]')

    for tweet in tweet_elements:
        # Extract the username, date, and text content
        try:
            # First <span> inside the tweet whose text contains "@"
            username = tweet.find_element(By.XPATH, './/span[contains(text(), "@")]').text
            date = tweet.find_element(By.XPATH, './/time').get_attribute("datetime")
            text = tweet.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text

            # Extract mentions, hashtags, and links from the text
            mentions, hashtags_in_text, links = extract_details(text)

            # Store the collected information
            tweets_data.append({
                'section': section,
                'hashtag': hashtag,
                'username': username,
                'date': date,
                'text': text,
                'mentions': mentions,
                'hashtags': hashtags_in_text,
                'links': links
            })

        except Exception as e:
            # NOTE(review): any failure while reading one tweet is printed and
            # that tweet is skipped; the remaining tweets are still processed
            print(f"Error extracting tweet data: {e}")
            continue

    return tweets_data

# Main script
def main():
    """
    Open Chrome, wait for a manual Twitter login, scrape the "Top" and
    "Latest" results for every entry in `hashtags`, and save all collected
    tweets to an Excel workbook.

    The browser is intentionally left open when the function returns so the
    user can log out and close it by hand.
    """
    output_file = 'twitter_events_1.xlsx'

    options = Options()
    options.headless = False  # Set to True if you don't want to see the browser
    service = Service('c:\\Program Files\\chromedriver.exe')  # Update this to the correct path
    driver = webdriver.Chrome(service=service, options=options)

    try:
        print("Please log in to Twitter manually.")
        driver.get("https://twitter.com/login")
        time.sleep(5)  # Wait after loading the login page

        # The search box only exists once the user is logged in; allow up to
        # 10 minutes for the manual login
        WebDriverWait(driver, 600).until(
            EC.presence_of_element_located((By.XPATH, '//input[@aria-label="Search query"]'))
        )

        print("Login successful. Now scraping tweets...")

        # Collect tweets data
        all_tweets_data = []

        for hashtag in hashtags:
            for section in ("Top", "Latest"):
                print(f"Searching and scraping tweets for {hashtag} in {section} section...")
                all_tweets_data.extend(scrape_tweets(driver, hashtag, section))

        # Create a DataFrame for better organization
        tweets_df = pd.DataFrame(all_tweets_data)

        # Save the results and report the file that was actually written
        tweets_df.to_excel(output_file, index=False)
        print(f"Data extraction complete! Check '{output_file}' for results.")

        # Print message to manually log out
        print("Data scraping complete. Please log out of Twitter manually.")
    except Exception as exc:
        # Report the cause instead of hiding it. Catching Exception (not a
        # bare except) lets Ctrl+C / KeyboardInterrupt stop the run.
        print(f"Some error occurred: {exc}")

    finally:
        # Do not close the browser; wait for manual close
        print("You can close the browser window manually when done.")


if __name__ == "__main__":
    main()


In [None]:
import requests
import pandas as pd
import re  # Import regular expressions
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load the dotenv configuration before the API keys are read from the environment
load_dotenv()

# Three API keys taken from the environment; a missing variable yields None
api_key1, api_key2, api_key3 = (
    os.getenv(name) for name in ('API_KEY1', 'API_KEY2', 'API_KEY3')
)

# Keys are handed out in turn; api_key_index marks the next one to use
API_Vault = [api_key1, api_key2, api_key3]
api_key_index = 0

def analyze_review(review):
    """
    Send one tweet text to the Gemini model and return its raw text answer.

    The prompt asks for the event name, topic, date, time and location as a
    single JSON object. Each call configures the client with the next key
    from API_Vault (round-robin through the global api_key_index).

    Parameters:
    - review: tweet text; None and NaN are treated as an empty text and any
      other non-string value is converted with str()

    Returns:
    - str: the text of the model response
    """
    def limit_text_by_word_count(text, max_words):
        # Keep at most max_words whitespace-separated words
        words = text.split()
        if len(words) > max_words:
            return ' '.join(words[:max_words])
        return text

    # Braces of the example object are doubled because the template goes
    # through str.format(). The example is wrapped in braces because
    # extract_json() only picks up content enclosed in {...}.
    template = """You are expert in parsing the text. 
    Understand the given text and extract the following from the provided text:
    1) Is this Tweet related to any seminar/conference/event/summit/discussion.
    2) Topic of seminar/discussion/meeting/conference/summits/etc.
    3) Date and time of event if not mentioned mention "Not mentioned".
    4) Location of event if online mention online.

    Give the answer in highly structured format as a single JSON object like: {{"Event_name":[],"Topic_name":[],"Date_of_event":[],"Time_of_event":[],"Location_of_event":[]}} do not include additional explanation only provide answer in structure.
    ### tweet text to perform analysis on:
     {text}

    ---"""

    def get_next_api_key():
        # Round-robin over the configured keys
        global api_key_index
        api_key = API_Vault[api_key_index]
        api_key_index = (api_key_index + 1) % len(API_Vault)
        return api_key

    # Empty spreadsheet cells arrive as NaN (a float) and would break .split()
    if review is None or (isinstance(review, float) and review != review):
        review = ""
    elif not isinstance(review, str):
        review = str(review)

    max_input_words = 50000
    limited_text = limit_text_by_word_count(review, max_input_words)

    genai.configure(api_key=get_next_api_key())

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 64,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=generation_config,
    )

    # A fresh chat per tweet, so no history is shared between analyses
    chat_session = model.start_chat(history=[])

    prompt_text = template.format(text=limited_text)

    response = chat_session.send_message(prompt_text)

    return response.text


def extract_json(response_text):
    """
    Return the text between the first '{' and its matching '}'.

    The outer braces themselves are not included. Nested braces and line
    breaks inside the object are kept, so pretty-printed or nested answers
    are returned whole.

    Parameters:
    - response_text: str, raw model answer; None or "" is accepted

    Returns:
    - str: the content between the braces, or None when there is no '{' or
      the first '{' is never closed
    """
    if not response_text:
        return None

    start = response_text.find('{')
    if start == -1:
        return None

    # Track nesting depth so the scan stops at the brace that closes the
    # first one, not at the first '}' that happens to follow it
    depth = 0
    for position in range(start, len(response_text)):
        character = response_text[position]
        if character == '{':
            depth += 1
        elif character == '}':
            depth -= 1
            if depth == 0:
                return response_text[start + 1:position]

    return None


def append_reviews_to_excel(all_reviews, file_name):
    """
    Add the analysis results as extra columns next to the data already
    stored in the workbook and write everything back to the same file.

    Parameters:
    - all_reviews: list of dicts, one per analysed row
    - file_name: str, path of the workbook; a missing file is treated as
      having no stored data
    """
    try:
        stored_df = pd.read_excel(file_name)
    except FileNotFoundError:
        stored_df = pd.DataFrame()  # Nothing stored yet

    analysis_df = pd.DataFrame(all_reviews)

    # axis=1 places the analysis columns beside the stored columns, so the
    # previous data is kept
    pd.concat([stored_df, analysis_df], axis=1).to_excel(file_name, index=False)
    print(f"Appended reviews to {file_name}")


def llm_analysis(path):
    """
    Analyse the 'text' column of a workbook with the LLM and append the
    extracted event fields to the same workbook.

    Each row yields one dict. When the answer holds no braces or cannot be
    parsed as a key/value mapping, a row of empty strings is used instead,
    so the results stay aligned with the input rows.

    Parameters:
    - path: str, path of the Excel workbook to read and update
    """
    # Local imports: keep the function independent of imports made in other cells
    import ast
    import json

    # Same field names as the structure requested in the analyze_review
    # prompt, so fallback rows land in the same columns as parsed rows
    empty_result = {
        "Event_name": "",
        "Topic_name": "",
        "Date_of_event": "",
        "Time_of_event": "",
        "Location_of_event": "",
    }

    df = pd.read_excel(path)
    all_review_analysis = []
    for index, row in df.iterrows():
        cell_content = row['text']  # Adjust based on the actual column name
        review_analysis = analyze_review(cell_content)
        print(index, "@@@@", review_analysis)

        # Extract the content between the braces of the response
        json_content = extract_json(review_analysis)

        if not json_content:
            print(f"No valid JSON content found for index {index}.")
            all_review_analysis.append(dict(empty_result))
            continue

        candidate = f"{{{json_content}}}"
        try:
            # SECURITY: this text comes from the model and must never be
            # executed, so eval() is replaced by parsers that only build
            # data: strict JSON first, then a Python literal for answers
            # that use single quotes.
            try:
                parsed = json.loads(candidate)
            except ValueError:
                parsed = ast.literal_eval(candidate)

            # A set or other literal cannot be turned into spreadsheet columns
            if not isinstance(parsed, dict):
                raise ValueError("model output is not a key/value mapping")

            all_review_analysis.append(parsed)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            print(f"Error parsing JSON content for index {index}: {e}")
            all_review_analysis.append(dict(empty_result))

    append_reviews_to_excel(all_review_analysis, file_name=path)


# Run the analysis on the existing file
llm_analysis("twitter_events_1.xlsx")
