In [1]:
from datetime import datetime

#from langchain_core.pydantic_v1 import BaseModel, Field, validator
from pydantic import BaseModel, Field, field_validator
from typing import Union
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv
import os

load_dotenv()


import dateparser
from dateutil import parser


  from .autonotebook import tqdm as notebook_tqdm


## Experimenting with Different Date Parsers
We chose `dateparser` over `dateutil.parser` because it is more flexible with date formats; the cells below compare the two.

In [12]:
# dateutil cannot parse the "@" separator. The failure is the point of this
# demo, so catch it and show the message instead of leaving a cell that aborts
# "Restart Kernel -> Run All".
try:
    print(parser.parse("Dec 4, 2024 @ 7:45am"))
except ValueError as exc:  # dateutil's ParserError is a ValueError subclass
    print(f"{type(exc).__name__}: {exc}")

ParserError: Unknown string format: Dec 4, 2024 @ 7:45am

In [22]:
# dateparser handles the ISO-style timestamp with a "GMT+08:00" suffix.
iso_with_gmt_offset = "2024-12-04T07:45:00GMT+08:00"
print(dateparser.parse(iso_with_gmt_offset))

2024-12-04 07:45:00+08:00


In [5]:
# True when dateparser can interpret the human-readable form of the date.
dateparser.parse("Dec 4, 2024 7:45am GMT+8") is not None

True

## Building Pydantic Model for Data Parsing and Validation

In [11]:
# Gemini chat model used by the date-extraction chain in this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_retries=2,
    timeout=None,
    max_tokens=None,
)

In [23]:


# Prompt that asks the model to reduce messy scraped text to a bare timestamp.
dateparse_prompt = PromptTemplate.from_template(
    "Extract date and time from the date_text, including timezone if available. "
    "Only include the date and time and nothing else, "
    "remove spaces and line breaks where possible.\n"
    " date_text: {date_text}"
)

# Pipe the rendered prompt straight into the chat model.
dateparse_chain = dateparse_prompt | llm

# Sample of raw scraped text: the date is followed by markdown-link residue.
dateparse_chain.invoke(
    {
        "date_text": (
            "Dec 4, 2024 @ 7:45am [GMT +8\n"
            "__](https://devpost.com/settings/preferences_and_eligibility#eligibility-\n"
            "section)"
        )
    }
).content.strip()

'2024-12-04T07:45:00GMT+08:00'

In [6]:
class Event(BaseModel):
    """Structured record describing one scraped event.

    The help text of every field is passed as ``description=``.  ``Field``'s
    first positional argument is the *default value*, so passing the text
    positionally would make a missing field silently take the help text as its
    value and would leave the JSON schema without descriptions.

    ``name``, ``link``, ``location`` and ``description`` are required; the three
    date fields default to ``None`` when absent.
    """

    name: str = Field(description="Name of event")
    link: str
    start_date: Union[datetime, None] = Field(
        default=None,
        description="Start date of event, put 'null' if not found",
    )
    end_date: Union[datetime, None] = Field(
        default=None,
        description="End date of event, put 'null' if not found",
    )
    location: str = Field(
        description="Location where event is held, put 'Online' if event is held virtually"
    )
    registration_deadline: Union[datetime, None] = Field(
        default=None,
        description="Registration deadline of event, put 'null' if not found",
    )
    description: str = Field(
        description="Brief 100 word description of the event, highlight the key themes and skills required"
    )

    @field_validator("start_date", "end_date", "registration_deadline", mode="before")
    @classmethod
    def check_date(cls, v: object) -> Union[datetime, None]:
        """Turn a raw scraped date value into a ``datetime`` (or ``None``).

        Runs before pydantic's own parsing (``mode="before"``).

        Args:
            v: Raw value for the field: free-form text, an existing
                ``datetime``, or an empty / 'null' placeholder.

        Returns:
            ``v`` unchanged when it is already a ``datetime``; ``None`` for
            falsy values and for the placeholder strings 'null' / 'none';
            otherwise the ``datetime`` that ``dateparser`` produces from the
            LLM-cleaned text.

        Raises:
            ValueError: if ``dateparser`` cannot parse the LLM's answer.
        """
        # Already parsed (e.g. re-validating existing data): no LLM round trip.
        if isinstance(v, datetime):
            return v

        # The field descriptions tell the scraper to put 'null' when a date is
        # missing, so accept the literal string as well as real empties.
        if not v or str(v).strip().lower() in {"null", "none"}:
            return None

        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-pro",
            temperature=0,
            max_tokens=None,
            timeout=200,
            max_retries=10,
        )
        dateparse_prompt = PromptTemplate.from_template("Extract date and time from the date_text, including timezone if available. Only include the date and time and nothing else.\n date_text: {date_text}")

        dateparse_chain = dateparse_prompt | llm

        # Ask the model to strip everything except the timestamp, then hand the
        # cleaned text to dateparser for the actual conversion.
        date_str = dateparse_chain.invoke({"date_text": v}).content.strip()

        date = dateparser.parse(date_str)

        if date:
            return date
        raise ValueError(f"Error parsing date string: {date_str}")

    # OUTDATED (Pydantic v1 API, kept for reference only, not active):
    # @validator("registration_deadline")
    # def registration_period_valid(cls, field):
    #     if field < datetime.now():
    #         raise ValueError("Registration period over")
    #     return field


## Scraping using ScrapeGraphAI

In [7]:
import nest_asyncio

# Patch asyncio before the scraping library is imported and run.
nest_asyncio.apply()

import json
from scrapegraphai.graphs import SmartScraperGraph


# Settings for the scraping pipeline: which LLM to use and how to run it.
graph_config = dict(
    llm=dict(
        api_key=os.getenv("GOOGLE_API_KEY"),
        model="google_genai/gemini-pro",
    ),
    verbose=True,
    headless=True,
)

# Build the scraper for a single event page, constrained to the Event schema.
smart_scraper_graph = SmartScraperGraph(
    prompt="Obtain information about the event.",
    source="https://www.eventbrite.sg/e/for-seniors-preserve-your-treasured-memories-with-memorylane-tickets-1009108208817",
    config=graph_config,
    schema=Event,
)

# Execute the pipeline and pretty-print what came back.
result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/for-seniors-preserve-your-treasured-memories-with-memorylane-tickets-1009108208817) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---


{
    "name": "For Seniors Preserve Your Treasured Memories with MemoryLane",
    "link": "https://www.eventbrite.sg/e/for-seniors-preserve-your-treasured-memories-with-memorylane-tickets-1009108208817",
    "start_date": null,
    "end_date": null,
    "location": "Online",
    "registration_deadline": null,
    "description": "NA"
}


In [25]:
# Validate the raw scrape result against Event, then export it with pydantic's
# own serializer rather than relying on dict() iterating over the model.
event_dict = Event.model_validate(result).model_dump()

In [26]:
# Display the validated event as a plain dict.
# NOTE(review): the saved output below shows a different event than the scrape
# result printed earlier in this notebook, so it appears to come from an
# earlier run — re-run the cells in order to refresh it.
event_dict

{'name': 'Google Chrome Built-in AI Challenge',
 'link': 'https://googlechromeai.devpost.com/',
 'start_date': None,
 'end_date': datetime.datetime(2024, 12, 4, 7, 45, tzinfo=<StaticTzInfo 'UTC'>),
 'location': 'Online',
 'registration_deadline': None,
 'description': "Dive into the world of cutting-edge AI with the Google Chrome Built-in AI\nChallenge! This hackathon invites developers to explore new ground by creating\nweb applications or Chrome Extensions that leverage Chrome’s built-in AI APIs\nand models, including Gemini Nano.\n\nThe APIs can give you access to:\n\n create dynamic user prompts (Prompt API, Prompt API in Chrome Extensions)\n\n distill complex information into clear insights (Summarization API)\n\nenable multilingual translation capabilities (Translation API)\n\ngenerate original, engaging text (Write API)\n\n improve your content with alternative options (Rewrite API)\n\n_Please note: all of these tasks take place in the browser, using the models\ndownloaded to yo

## Scrape Using Links

In [2]:
from scraper import scrape

In [4]:
# Read the list of event URLs, one per line.
with open("data/links.txt", "r", encoding="utf-8") as f:
    data = f.read()

# splitlines() copes with both \n and \r\n endings; blank lines and stray
# whitespace are dropped so that no empty string is ever handed to scrape().
links = [line.strip() for line in data.splitlines() if line.strip()]

In [3]:
# Load variables from the .env file into the process environment.
# NOTE(review): this repeats the call made in the imports cell — presumably so
# this section can be run on its own; confirm and drop one of the two.
load_dotenv()

True

In [5]:
# Scrape every non-empty link, in file order.
events_data = [scrape(url) for url in filter(None, links)]

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/for-seniors-preserve-your-treasured-memories-with-memorylane-tickets-1009108208817) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/thinker-registration-1008151567477) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---


KeyboardInterrupt: 

In [None]:
import pandas as pd

# Convert each scraped event with dict(), the same way the CSV-writing cell
# does, so pandas receives field-name -> value mappings and produces one named
# column per field whether scrape() returns dicts or model instances.
df = pd.DataFrame([dict(event) for event in events_data])

df.to_csv("data/events_data.csv")



In [32]:
import csv

# Stream scraped events to CSV one row at a time so partial progress survives
# a failure part-way through the link list.
# - `with` closes the file even when scrape() raises.
# - newline='' is what the csv module requires to avoid blank rows on Windows.
with open('data/events.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, event_dict.keys())
    # Write the column names first so the file is self-describing.
    w.writeheader()
    f.flush()
    for link in links:
        # Skip blank entries (e.g. a trailing newline in links.txt).
        if not link:
            continue
        w.writerow(dict(scrape(link)))
        # Flush after every row so completed rows are on disk immediately.
        f.flush()

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/for-seniors-preserve-your-treasured-memories-with-memorylane-tickets-1009108208817) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/thinker-registration-1008151567477) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/storytime-for-4-6-years-old-ang-mo-kio-public-library-early-read-tickets-1011037770187) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite.sg/e/for-youthadults-preserve-your-treasured-memories-with-memorylane-tickets-1009128940827) ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.eventbrite

OutputParserException: Invalid json output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE