In [129]:
import os
import pickle
import faiss
import numpy as np
import re
from datetime import datetime
import requests
import json
import spacy

from zoneinfo import ZoneInfo

# Get system's local timezone
import tzlocal
local_tz = tzlocal.get_localzone()

# Make datetime.now() timezone-aware using local timezone
local_now = datetime.now(ZoneInfo(str(local_tz)))

In [2]:
query = "Aditya with pratik in Goa or Nashik"

In [3]:
embed_dir = 'embed_store'

# loading up the indices from embed_dir
img_path_index = pickle.load(open(os.path.join(embed_dir, 'img_path_index.pkl'), 'rb'))
face_data = pickle.load(open(os.path.join(embed_dir, 'face_data.pkl'), 'rb'))
flatten_img_face_index = pickle.load(open(os.path.join(embed_dir, 'img_path_index_for_face.pkl'), 'rb'))
clip_embed = faiss.read_index(os.path.join(embed_dir, 'img_embeddings.bin'))
geo_data = np.load(os.path.join(embed_dir, 'geo_metadata.npy'), allow_pickle=True)

In [4]:
faces = set()
for face in face_data:
    if len(face['name'])>0:
        faces.add(face['name'].upper())
list(faces)

['YASH',
 'SHUBHAM',
 'ADITYA',
 'PRATIK',
 'AISHWARYA',
 'JINAY',
 'SHRUTI',
 'SAYALI',
 'NAMITA',
 'DHRUV']

In [5]:
for word in query.split(' '):
    if word.upper() in faces:
        print(word)


Aditya
pratik


In [6]:
def tokenize(query):
    return [tokens.strip().lower() for tokens in re.split(r",|:|'|\.|-|\s", query.split(sep="Location:")[-1]) if (tokens and len(tokens)>1)]

In [7]:
str(geo_data[0])


'Chungthang, Mangan, Sikkim, 737120, India'

In [8]:
geo_locs = set()
for address in geo_data:
    for word in tokenize(address):
        geo_locs.add(word.upper())
list(geo_locs)
print(len(geo_locs))

182


In [9]:
for word in query.split(' '):
    if word.upper() in faces:
        print(f'face: {word}')
    if word.upper() in geo_locs:
        print(f'location: {word}')


face: Aditya
face: pratik
location: Goa
location: Nashik


In [108]:
import spacy

nlp = spacy.load("en_core_web_lg")

def nlp_with_datetime_ner(query):
    doc = nlp(query)
    datetime_entities = [ent.text for ent in doc.ents if ent.label_ == "DATE" or ent.label_ == "TIME"]
    output_string = ", ".join(datetime_entities)
    return output_string


In [None]:


url = "http://0.0.0.0:8010/parse"

def duck_datetime(query):
    payload = {
        "locale": "en_US",
        "text": query,
        "dims": ["time"]
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    response = requests.post(url, data=payload, headers=headers)
    return response.json()
    # print(json.dumps(response.json(), indent=2))

def replacer(main_text, text, sub):
    return re.sub(re.escape(text), sub, main_text, flags=re.IGNORECASE)

def replace_methods(query1):
    rep_list = [
        ["monsoon", "June to September"],
        ["independence day", "15th August"],
        ["republic day", "26th January"]
    ]
    for rep in rep_list:
        query1 = replacer(query1, rep[0], rep[1])
    query1 = query1.replace(",","")
    query1 = query1.replace("from ","")
    return query1

def detect_grains(text):
    grains = set()
    if re.search(r'\b\d{1,2}\b', text):  # numbers like 27
        grains.add('day')
    if re.search(r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|march|april|june|july|august|september|october|november|december)\b', text, re.IGNORECASE):
        grains.add('month')
    if re.search(r'\b\d{4}\b', text):  # year like 2023
        grains.add('year')
    if re.search(r'\b\d{1,2}:\d{2}\b', text):  # time
        grains.add('minute')
        grains.add('hour')
    elif re.search(r'\b\d{1,2}\s*(am|pm)\b', text, re.IGNORECASE):
        grains.add('hour')
    # Add day and month grain if holidaybeta in duckling is found (add year on top if found using this function)
    return grains





In [132]:
# "from this morning" doesn't work because it only give 'from' and not 'to', replace all "from" before parsing?
# "around sunset" doesn't work
# keep a sunset-sunrise etc paired with after, before, during, around etc for them and manually program 

queries = [
    "Images taken during Diwali 2023",
    "Photos from Monsoon 2024",
    "Pictures captured on Republic day",
    "Show me photos from Holi 2022",
    "Find images from September 2023",
    "Photos taken between October 15, 2024, and November 15, 2024",
    "Images from last year's Durga Puja",
    "Images of garba during navratri",
    "Pictures from this morning in Bengaluru",
    "Show me photos from December 2023 in Mumbai",
    "Images taken on a Sunday morning",
    "Photos captured on Independence Day 2021",
    "Pictures from the summer of 2020",
    "Show me images taken around sunset last Tuesday",
    "Photos from Ganesh Chaturthi 2025",
    "Images from March to May 2023 ",
    "Pictures taken during lunchtime on April 10, 2025",
    "Show me photos from last Christmas",
    "Find images from the New Year's Eve party 2023",
    "Photos from the past week in Delhi",
    "Images captured after 7 PM in Chennai",
    "Pictures taken on Ugadi 2024",
    "Show me photos from the winter of 2022",
    "Images from Vishu 2025",
    "Photos from around 6 AM on a Monday",
    "Pictures from the Maharashtra election day 2024"
]

count = 0
for query in queries:
    print("-"*50)
    print(f"ORIG QUERY: {query}")
    query2 = replace_methods(query)
    print(f"NEW QUERY: {query2}")
    responses = duck_datetime(query2)
    for response in responses:
        if response['dim']=="time":
            # ==============================
            # HOLIDAYS
            # ==============================
            if "holidayBeta" in response['value']:
                print("HOLIDAY CASE")
                print(f"Date-Time QUERY: {response['body']}")
                # print(f"Spacy's response: {nlp_with_datetime_ner(query)}")
                print(f"isLatent: {response['latent']}")
                print(f"Contents: {response['value']}")

                # CASE 1 : single value specific year
                if response['value']['type']=='value' and len(response['value']['values'])==1 and response['value']['values'][0]['grain']=='day':
                    print(f"match the .date() of {datetime.fromisoformat(response['value']['value'])}")
                
                # CASE 2 : single value, non-specified year (repeats every year)
                # doesn't work on indian festivals given the date varies every year for many 
                if response['value']['type']=='value' and len(response['value']['values'])>1:
                    print(f"Date value: {datetime.fromisoformat(response['value']['value'])}")
                    print(f"Grain level: {response['value']['values'][0]['grain']}")
                    # Calculate delta between two dates
                    date1 = datetime.fromisoformat(response['value']['values'][0]['value'])
                    date2 = datetime.fromisoformat(response['value']['values'][1]['value'])
                    delta = date2 - date1
                    for grain, num in {'week':7, 'month':30, 'year':365}.items():
                        if delta.days//num==1:
                            print(f"Repeat frequency: {grain}")
                    print("Need to check grain from query if it is repetitive or specific day (filter for specific if date value above is in future?)")
                
                # CASE 3 : interval values, specific year
                if response['value']['type']=='interval' and len(response['value']['values'])==1:
                    print(f"Filter date at grain {response['value']['values'][0]['from']['grain']} from {datetime.fromisoformat(response['value']['values'][0]['from']['value'])} to {datetime.fromisoformat(response['value']['values'][0]['to']['value'])}")
                
                # CASE 4 : interval values, non-specified year
                # doesn't work on indian festivals given the date varies every year for many 
                if response['value']['type']=='interval' and len(response['value']['values'])>1:
                    print(f"Filter date at grain {response['value']['values'][0]['from']['grain']} from {datetime.fromisoformat(response['value']['values'][0]['from']['value'])} to {datetime.fromisoformat(response['value']['values'][0]['to']['value'])}")
                    print(f"Filter only day and month if year grain (direct or indirect) is missing in original query")
            
            # ==============================
            # OTHER DAYS
            # ==============================
            else:
                print(f"Date-Time QUERY: {response['body']}")
                # print(f"Spacy's response: {nlp_with_datetime_ner(query)}")
                print(f"isLatent: {response['latent']}")
                print(f"Contents: {response['value']}")

                # CASE 1 : single value specific year
                if response['value']['type']=='value' and len(response['value']['values'])==1 and response['value']['values'][0]['grain']=='day':
                    print(f"match the .date() of {datetime.fromisoformat(response['value']['value'])}")
                elif response['value']['type']=='value' and len(response['value']['values'])==1 and response['value']['values'][0]['grain']=='week':
                    print(f"match the week starting with .date() of {datetime.fromisoformat(response['value']['value'])}")
                elif response['value']['type']=='value' and len(response['value']['values'])==1 and response['value']['values'][0]['grain']=='month':
                    print(f"match the month of .date() of {datetime.fromisoformat(response['value']['value'])}")
                
                # CASE 2 : single value, non-specified year (repeats every year)
                # doesn't work on indian festivals given the date varies every year for many 
                elif response['value']['type']=='value' and len(response['value']['values'])>1:
                    print(f"Date value: {datetime.fromisoformat(response['value']['value'])}")
                    print(f"Grain level: {response['value']['values'][0]['grain']}")
                    # Calculate delta between two dates
                    date1 = datetime.fromisoformat(response['value']['values'][0]['value'])
                    date2 = datetime.fromisoformat(response['value']['values'][1]['value'])
                    delta = date2 - date1
                    for grain, num in {'week':7, 'month':30, 'year':365}.items():
                        if delta.days//num==1:
                            print(f"Repeat frequency: {grain}")
                    print("Need to check grain from query if it is repetitive or specific day (filter for specific if date value above is in future?)")
                
                # CASE 3 : interval values, specific year
                elif response['value']['type']=='interval' and len(response['value']['values'])==1:
                    print(f"Filter date at grain {response['value']['values'][0]['from']['grain']} from {datetime.fromisoformat(response['value']['values'][0]['from']['value'])} to {datetime.fromisoformat(response['value']['values'][0]['to']['value'])}")
                
                # CASE 4 : interval values, non-specified year
                # doesn't work on indian festivals given the date varies every year for many 
                elif response['value']['type']=='interval' and len(response['value']['values'])>1:
                    if datetime.fromisoformat(response['value']['values'][0]['from']['value']) <= local_now:
                        print(f"Filter date at grain {response['value']['values'][0]['from']['grain']} from {datetime.fromisoformat(response['value']['values'][0]['from']['value'])} to {datetime.fromisoformat(response['value']['values'][0]['to']['value'])}")
                #     print(f"Filter only day and month if year grain (direct or indirect) is missing in original query")
    print("-"*50)
    # count+=1
    # if count>2: break


--------------------------------------------------
ORIG QUERY: Images taken during Diwali 2023
NEW QUERY: Images taken during Diwali 2023
HOLIDAY CASE
Date-Time QUERY: Diwali 2023
isLatent: False
Contents: {'grain': 'day', 'holidayBeta': 'Diwali', 'type': 'value', 'value': '2023-11-12T00:00:00.000-08:00', 'values': [{'grain': 'day', 'type': 'value', 'value': '2023-11-12T00:00:00.000-08:00'}]}
match the .date() of 2023-11-12 00:00:00-08:00
--------------------------------------------------
--------------------------------------------------
ORIG QUERY: Photos from Monsoon 2024
NEW QUERY: Photos June to September 2024
Date-Time QUERY: June to September 2024
isLatent: False
Contents: {'from': {'grain': 'month', 'value': '2024-06-01T00:00:00.000-07:00'}, 'to': {'grain': 'month', 'value': '2024-10-01T00:00:00.000-07:00'}, 'type': 'interval', 'values': [{'from': {'grain': 'month', 'value': '2024-06-01T00:00:00.000-07:00'}, 'to': {'grain': 'month', 'value': '2024-10-01T00:00:00.000-07:00'}, 't

In [None]:
import json

def extract_grains_and_filter_type(duckling_output):
    """
    Extracts grains and determines if the query is an interval or grain-based filter from Duckling output.
    
    Args:
        duckling_output (list): Parsed Duckling output as a list of dictionaries.
    
    Returns:
        tuple: (set of grains, str indicating 'interval' or 'value')
    """
    grains = set()
    filter_type = None
    
    # Filter for time dimension entries
    time_entries = [entry for entry in duckling_output if entry.get('dim') == 'time']
    
    if not time_entries:
        return grains, None  # No time-related entries found
    
    # Process each time entry
    for entry in time_entries:
        value = entry.get('value', {})
        entry_type = value.get('type')
        
        # Determine filter type
        if entry_type == 'interval':
            filter_type = 'interval'
        elif entry_type == 'value':
            filter_type = 'value'
        
        # Extract grains
        if entry_type == 'interval':
            # Check 'from' and 'to' grains
            if 'from' in value:
                from_grain = value['from'].get('grain')
                if from_grain:
                    grains.add(from_grain)
            if 'to' in value:
                to_grain = value['to'].get('grain')
                if to_grain:
                    grains.add(to_grain)
        elif entry_type == 'value':
            # Check single grain
            grain = value.get('grain')
            if grain:
                grains.add(grain)
        
        # Map Duckling grains to required grains (year, month, day, hour, minute)
        if 'year' in grains:
            grains.add('year')
        if 'month' in grains:
            grains.add('month')
            grains.add('year')  # Month implies year
        if 'day' in grains:
            grains.add('day')
            grains.add('month')
            grains.add('year')  # Day implies month and year
        if 'hour' in grains:
            grains.add('hour')
            grains.add('day')
            grains.add('month')
            grains.add('year')  # Hour implies day, month, year
        if 'minute' in grains:
            grains.add('minute')
            grains.add('hour')
            grains.add('day')
            grains.add('month')
            grains.add('year')  # Minute implies hour, day, month, year
    
    return grains, filter_type

# Example usage with the provided sample output
sample_queries = 
count=0
for sample in sample_queries:
    query = sample['query']
    output = sample['output']
    grains, filter_type = extract_grains_and_filter_type(output)
    print(f"QUERY: {sample["query"]}")
    print(f"Grains: {grains}")
    print(f"Filter Type: {filter_type}")
    print("-" * 50)
    count+=1

QUERY: Images taken during Diwali 2023
Grains: {'day', 'year', 'month'}
Filter Type: value
--------------------------------------------------
QUERY: Photos from Monsoon 2024
Grains: {'year', 'month'}
Filter Type: interval
--------------------------------------------------
QUERY: Images taken on a Sunday morning
Grains: {'day', 'year', 'hour', 'month'}
Filter Type: interval
--------------------------------------------------


In [26]:
import datetime
from typing import List, Dict, Any, Tuple
import dateutil.parser
import calendar

def parse_duckling_temporal(duckling_output: List[Dict[str, Any]]) -> Tuple[datetime.datetime, datetime.datetime]:
    """
    Parse Duckling temporal output and return a datetime interval.
    Returns (start_datetime, end_datetime) for the temporal range.
    """
    
    def parse_datetime(dt_str: str) -> datetime.datetime:
        """Parse ISO datetime string to datetime object."""
        return dateutil.parser.isoparse(dt_str)

    def get_year_end(year: int) -> datetime.datetime:
        """Return the last moment of the given year."""
        return datetime.datetime(year, 12, 31, 23, 59, 59, 999999)

    def get_day_end(dt: datetime.datetime) -> datetime.datetime:
        """Return the end of the given day."""
        return dt.replace(hour=23, minute=59, second=59, microsecond=999999)

    # Initialize variables to track the earliest start and latest end
    start_dt = None
    end_dt = None

    for item in duckling_output:
        if item["dim"] != "time":
            continue
            
        value = item["value"]
        value_type = value["type"]

        if value_type == "interval":
            # Direct interval from Duckling (e.g., Navratri)
            from_dt = parse_datetime(value["from"]["value"])
            to_dt = parse_datetime(value["to"]["value"])
            
            if start_dt is None or from_dt < start_dt:
                start_dt = from_dt
            if end_dt is None or to_dt > end_dt:
                end_dt = to_dt

        elif value_type == "value":
            grain = value.get("grain")
            dt = parse_datetime(value["value"])

            if grain == "year":
                # Year grain: set interval from Jan 1 to Dec 31
                year_start = dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
                year_end = get_year_end(dt.year)
                
                if start_dt is None or year_start < start_dt:
                    start_dt = year_start
                if end_dt is None or year_end > end_dt:
                    end_dt = year_end

            elif grain == "day":
                # Day grain: set interval for the entire day
                day_start = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                day_end = get_day_end(dt)
                
                if start_dt is None or day_start < start_dt:
                    start_dt = day_start
                if end_dt is None or day_end > end_dt:
                    end_dt = day_end

    # If multiple years are detected (e.g., 2022 or 2023), merge them
    if start_dt is None or end_dt is None:
        raise ValueError("No valid time dimension found in Duckling output")

    # Ensure start_dt is before end_dt
    if start_dt > end_dt:
        start_dt, end_dt = end_dt, start_dt

    return start_dt, end_dt

# Example usage and test cases
def test_parse_duckling_temporal():
    # Test case 1: "photos of Dhruv in 2023 or 2022"
    test1 = [
        {
            "body": "in 2022",
            "dim": "time",
            "end": 24,
            "latent": False,
            "start": 17,
            "value": {
                "grain": "year",
                "type": "value",
                "value": "2022-01-01T00:00:00.000-08:00",
                "values": [
                    {
                        "grain": "year",
                        "type": "value",
                        "value": "2022-01-01T00:00:00.000-08:00"
                    }
                ]
            }
        },
        {
            "body": "2023",
            "dim": "time",
            "end": 32,
            "latent": False,
            "start": 28,
            "value": {
                "grain": "year",
                "type": "value",
                "value": "2023-01-01T00:00:00.000-08:00",
                "values": [
                    {
                        "grain": "year",
                        "type": "value",
                        "value": "2023-01-01T00:00:00.000-08:00"
                    }
                ]
            }
        }
    ]
    
    # Test case 2: "during Diwali 2024"
    test2 = [
        {
            "body": "Diwali 2024",
            "dim": "time",
            "end": 35,
            "latent": False,
            "start": 24,
            "value": {
                "grain": "day",
                "holidayBeta": "Diwali",
                "type": "value",
                "value": "2024-10-31T00:00:00.000-07:00",
                "values": [
                    {
                        "grain": "day",
                        "type": "value",
                        "value": "2024-10-31T00:00:00.000-07:00"
                    }
                ]
            }
        }
    ]
    
    # Test case 3: "Yash doing garba during Navratri"
    test3 = [
        {
            "body": "Navratri 2023",
            "dim": "time",
            "end": 37,
            "latent": False,
            "start": 24,
            "value": {
                "from": {
                    "grain": "day",
                    "value": "2023-10-15T00:00:00.000-07:00"
                },
                "holidayBeta": "Navaratri",
                "to": {
                    "grain": "day",
                    "value": "2023-10-25T00:00:00.000-07:00"
                },
                "type": "interval",
                "values": [
                    {
                        "from": {
                            "grain": "day",
                            "value": "2023-10-15T00:00:00.000-07:00"
                        },
                        "to": {
                            "grain": "day",
                            "value": "2023-10-25T00:00:00.000-07:00"
                        },
                        "type": "interval"
                    }
                ]
            }
        }
    ]

    # Run tests
    print("Test 1 (2022 or 2023):", parse_duckling_temporal(test1))
    print("Test 2 (Diwali 2024):", parse_duckling_temporal(test2))
    print("Test 3 (Navratri 2023):", parse_duckling_temporal(test3))

if __name__ == "__main__":
    test_parse_duckling_temporal()

TypeError: can't compare offset-naive and offset-aware datetimes