In [31]:
import pandas as pd
import json
import re
from bs4 import BeautifulSoup
import os
from datetime import datetime

# Extra characters (accented French letters plus a few punctuation marks) that
# find_and_replace_ambiguous_unicode keeps in addition to ASCII letters,
# digits and whitespace. The string is escaped and spliced into a regex
# character class, so order does not matter.
french_chars = "@àâäçéèêëîïôùûüÿÀÂÄÇÉÈÊËÎÏÔÙÛÜŸ.:-/+’',"

def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

def find_and_replace_ambiguous_unicode(text, replacement=""):
    """Flatten line breaks/tabs and replace every character outside the whitelist.

    Newlines and tabs are first turned into single spaces. Then every character
    that is not an ASCII letter, a digit, whitespace, or listed in the
    module-level ``french_chars`` string is substituted with ``replacement``
    (removed by default).
    """
    flattened = text.replace('\n', ' ').replace('\t', ' ')
    # Negated character class: anything NOT in the whitelist is a match.
    disallowed = r'[^a-zA-Z0-9\s' + re.escape(french_chars) + r']'
    return re.sub(disallowed, replacement, flattened)

def clean_data(data, replacement=""):
    """Return a cleaned copy of ``data`` (an iterable of dict records).

    Every value is converted with ``str()``, stripped of HTML by
    ``remove_html_tags`` and then filtered by
    ``find_and_replace_ambiguous_unicode`` using ``replacement``. Because of
    the ``str()`` call all values in the result are strings (a float NaN, for
    example, becomes the text ``"nan"``). The input records are not modified.
    """
    def _scrub(value):
        # str() first: the HTML stripper and the regex both expect text.
        return find_and_replace_ambiguous_unicode(remove_html_tags(str(value)), replacement)

    return [
        {key: _scrub(value) for key, value in record.items()}
        for record in data
    ]

def is_valid_date(date_str):
    """Tell whether ``date_str`` parses as ``%Y-%m-%dT%H:%M:%S%z``.

    Returns True when ``datetime.strptime`` accepts the value, and False when
    it raises ValueError (wrong format) or TypeError (not a string).
    """
    try:
        datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z")
    except (ValueError, TypeError):
        return False
    else:
        return True

def parse_audience(audience_str):
    """Parse a free-text audience description into a structured dict.

    Args:
        audience_str: Audience text such as
            ``"Public enfants adultes. A partir de 7 ans."``. Empty, ``None``
            or non-string values are treated as "no audience information".

    Returns:
        A dict with:
          * ``"types"``: the known audience labels found in the text, in the
            fixed order of ``audience_types``; ``["nan"]`` when there is no
            audience information.
          * ``"age_range"`` (only when at least one age bound is found): a
            dict with ``"from"`` and/or ``"to"`` as integers.
    """
    # Known audience labels; matching is a case-sensitive substring test.
    audience_types = ["tout-petits", "enfants", "jeunes", "adultes", "Tout public"]

    # Missing or non-text input keeps the placeholder the rest of the
    # pipeline already uses for "unknown", instead of failing on `in`/regex.
    if not audience_str or not isinstance(audience_str, str):
        return {"types": ["nan"]}

    parsed_audience = {
        "types": [label for label in audience_types if label in audience_str]
    }

    # Accept "A partir de" / "À partir de" / "à partir de" and both the ASCII
    # and the typographic apostrophe in "Jusqu'à"; every spelling matched by
    # the stricter original patterns still matches.
    start_age_match = re.search(r"[aà] partir de (\d+) ans", audience_str, re.IGNORECASE)
    end_age_match = re.search(r"jusqu['’]à (\d+) ans", audience_str, re.IGNORECASE)

    age_range = {}
    if start_age_match:
        age_range["from"] = int(start_age_match.group(1))
    if end_age_match:
        age_range["to"] = int(end_age_match.group(1))

    # The key is only present when at least one bound was found.
    if age_range:
        parsed_audience["age_range"] = age_range

    return parsed_audience

def format_events_for_prompt(events):
    """Build the normalized event records used downstream.

    For every input event (a dict) a new dict is produced in which:
      * "Date de début" / "Date de fin" are converted to integer Unix
        timestamps in milliseconds, or None when ``is_valid_date`` rejects
        the value;
      * "Mots clés" is split on commas into a list of trimmed, non-empty
        strings;
      * "Audience" is the result of ``parse_audience`` applied to the
        lowercase ``'audience'`` key of the input;
      * all other listed fields are copied, defaulting to ''.

    Returns the list of new dicts; the input events are not modified.
    """
    date_format = "%Y-%m-%dT%H:%M:%S%z"

    def _epoch_millis(raw_date):
        # Only parse values that the shared validator accepts.
        if not is_valid_date(raw_date):
            return None
        return int(datetime.strptime(raw_date, date_format).timestamp() * 1000)

    def _split_keywords(raw_keywords):
        trimmed = (part.strip() for part in raw_keywords.split(','))
        return [part for part in trimmed if part]

    formatted_events = []
    for event in events:
        # Insertion order below fixes the key order of the serialized JSON.
        record = {key: event.get(key, '') for key in ("ID", "URL", "Titre", "Description")}
        record["Date de début"] = _epoch_millis(event.get('Date de début', ''))
        record["Date de fin"] = _epoch_millis(event.get('Date de fin', ''))
        for key in ("Nom du lieu", "Adresse du lieu", "Code postal"):
            record[key] = event.get(key, '')
        record["Mots clés"] = _split_keywords(event.get('Mots clés', ''))
        record["Audience"] = parse_audience(event.get('audience', ''))
        for key in ("Ville", "Coordonnées géographiques", "Type de prix"):
            record[key] = event.get(key, '')
        formatted_events.append(record)
    return formatted_events


def convert_and_clean_excel(input_file_path, output_file_path, formatted_output_file_path):
    """Read an Excel workbook and write two JSON files derived from it.

    Steps:
      1. Load ``input_file_path`` with ``pd.read_excel`` and turn the rows
         into a list of dicts.
      2. Clean them with ``clean_data`` and dump the result to
         ``output_file_path``.
      3. Reformat the cleaned records with ``format_events_for_prompt`` and
         dump them to ``formatted_output_file_path``.

    Both files are UTF-8, non-ASCII characters are written as-is, indent 2.
    Errors are not raised: a FileNotFoundError or any other exception is
    reported with ``print`` and the function returns None.
    """
    def _dump_json(payload, destination):
        with open(destination, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    try:
        records = pd.read_excel(input_file_path).to_dict(orient='records')
        cleaned_data = clean_data(records)

        _dump_json(cleaned_data, output_file_path)
        print(f"Excel file has been cleaned and saved to {output_file_path}")

        _dump_json(format_events_for_prompt(cleaned_data), formatted_output_file_path)
        print(f"Formatted events have been saved to {formatted_output_file_path}")
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found. Please check the file path and try again.")
    except Exception as exc:
        print(f"An error occurred: {str(exc)}")

# Input workbook and the two JSON outputs. The paths are relative, so they
# resolve against the notebook's current working directory.
input_file_path = os.path.join('DateGeniusAI', 'data', 'que-faire-a-paris-2.xlsx')
cleaned_output_file_path = os.path.join('DateGeniusAI', 'data', 'cleaned_xlsx_data.json')
formatted_output_file_path = os.path.join('DateGeniusAI', 'data', 'formatted_xls_events.json')

# Runs the conversion when the cell executes: writes the cleaned records first,
# then the formatted events.
convert_and_clean_excel(input_file_path, cleaned_output_file_path, formatted_output_file_path)


  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


Excel file has been cleaned and saved to DateGeniusAI/data/cleaned_xlsx_data.json
Formatted events have been saved to DateGeniusAI/data/formatted_xls_events.json


In [32]:
import json
import os

# Function to process "Mots clés" and ensure it's an array of strings
def process_mots_cles(events):
    """Normalize the "Mots clés" and "audience" fields of each event in place.

    For every event whose "Mots clés" value is still a string, the string is
    split on commas into a list of trimmed, non-empty keywords and the
    event's 'audience' value is replaced by ``parse_audience(...)`` of it
    (an empty string is parsed when the key is missing). Events whose
    "Mots clés" is not a string are left untouched.

    Args:
        events: list of event dicts; the dicts are mutated.

    Returns:
        The same ``events`` list.
    """
    for event in events:
        # Only raw (string) keyword fields are processed, so events that were
        # already converted are skipped.
        if isinstance(event.get('Mots clés'), str):
            event['Mots clés'] = [keyword.strip() for keyword in event['Mots clés'].split(',') if keyword.strip()]
            # No trailing comma after the call: with one, the parsed audience
            # was stored as a 1-tuple and serialized as a JSON list around
            # the dict.
            event['audience'] = parse_audience(event.get('audience', ''))
    return events

def create_new_json_with_processed_mots_cles(input_json_file, output_json_file):
    """Load events from a JSON file, run ``process_mots_cles``, and save them.

    The events are read from ``input_json_file`` (UTF-8), passed through
    ``process_mots_cles`` and written to ``output_json_file`` as UTF-8 JSON
    with non-ASCII characters preserved and an indent of 2.

    Errors are not raised: a FileNotFoundError or any other exception is
    reported with ``print`` and the function returns None.
    """
    try:
        # Read the events from whichever JSON file the caller passed in.
        with open(input_json_file, 'r', encoding='utf-8') as source:
            events = json.load(source)

        updated_events = process_mots_cles(events)

        with open(output_json_file, 'w', encoding='utf-8') as target:
            json.dump(updated_events, target, ensure_ascii=False, indent=2)
    except FileNotFoundError:
        print(f"Error: The file '{input_json_file}' was not found. Please check the file path and try again.")
    except Exception as exc:
        print(f"An error occurred: {str(exc)}")
    else:
        print(f"Processed events have been saved to {output_json_file}")

# File paths for input and output JSON files.
# NOTE(review): both paths point at cleaned_xlsx_data.json, so the call below
# rewrites the cleaned file in place instead of producing a separate file.
input_json_file = os.path.join('DateGeniusAI', 'data', 'cleaned_xlsx_data.json')
output_json_file = os.path.join('DateGeniusAI', 'data', 'cleaned_xlsx_data.json')

# Run the function to process the "Mots clés" field and save the result
create_new_json_with_processed_mots_cles(input_json_file, output_json_file)


Processed events have been saved to DateGeniusAI/data/cleaned_xlsx_data.json


In [8]:
import json

# Load the formatted events from the saved JSON file (formatted_xls_events.json)
# NOTE: this cell imports only json; `os` must already have been imported by an
# earlier cell for the next line to work.
formatted_output_file_path = os.path.join('DateGeniusAI', 'data', 'formatted_xls_events.json')

with open(formatted_output_file_path, 'r', encoding='utf-8') as json_file:
    formatted_events = json.load(json_file)

# Function to extract and save unique keywords
def extract_and_save_keywords(formatted_events, keywords_output_file_path):
    """Collect the distinct keywords of all events and save them as JSON.

    Args:
        formatted_events: iterable of event dicts. "Mots clés" may be a list
            of strings, a comma-separated string, or missing/None.
        keywords_output_file_path: destination of the JSON array (UTF-8,
            non-ASCII preserved, indent 2).

    The keywords are trimmed, de-duplicated and written in sorted order so
    that repeated runs produce the same file.
    """
    unique_keywords = set()

    for event in formatted_events:
        keywords = event.get('Mots clés') or []
        # A raw string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = keywords.split(',')
        for keyword in keywords:
            cleaned = keyword.strip()
            if cleaned:
                unique_keywords.add(cleaned)

    # Sets have no stable order; sort for a reproducible output file.
    unique_keywords_list = sorted(unique_keywords)

    with open(keywords_output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(unique_keywords_list, json_file, ensure_ascii=False, indent=2)

    print(f"Unique keywords have been saved to {keywords_output_file_path}")

# Path where the unique keywords JSON will be saved (relative to the current
# working directory)
keywords_output_file_path = os.path.join('DateGeniusAI', 'data', 'unique_keywords.json')

# Call the function to extract and save keywords from formatted_events; the
# file is written when this cell executes
extract_and_save_keywords(formatted_events, keywords_output_file_path)



Unique keywords have been saved to DateGeniusAI/data/unique_keywords.json


In [12]:
import os
import json

# Load the cleaned records from cleaned_xlsx_data.json.
# NOTE: despite their names, formatted_output_file_path and formatted_events
# hold the *cleaned* file here, and this rebinds the notebook-level
# formatted_events used by other cells.
formatted_output_file_path = os.path.join('DateGeniusAI', 'data', 'cleaned_xlsx_data.json')

with open(formatted_output_file_path, 'r', encoding='utf-8') as json_file:
    formatted_events = json.load(json_file)

# Function to extract and save unique audience strings
def extract_and_save_audience(formatted_events, audience_output_file_path):
    """Collect the distinct audience strings of all events and save them as JSON.

    Args:
        formatted_events: iterable of event dicts; the audience text is read
            from the lowercase ``'audience'`` key.
        audience_output_file_path: destination of the JSON array (UTF-8,
            non-ASCII preserved, indent 2).

    Only non-empty string values are collected: an audience that has already
    been parsed into a dict/list (as ``process_mots_cles`` does) is skipped
    instead of raising on ``.strip()``. The result is written in sorted order
    so that repeated runs produce the same file.
    """
    unique_audience = set()

    for event in formatted_events:
        audience = event.get('audience', '')  # lowercase key, as in the cleaned records
        if isinstance(audience, str):
            cleaned = audience.strip()
            if cleaned:
                unique_audience.add(cleaned)

    # Sets have no stable order; sort for a reproducible output file.
    unique_audience_list = sorted(unique_audience)

    with open(audience_output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(unique_audience_list, json_file, ensure_ascii=False, indent=2)

    print(f"Unique audience strings have been saved to {audience_output_file_path}")

# Path where the unique audience strings JSON will be saved (relative to the
# current working directory)
audience_output_file_path = os.path.join('DateGeniusAI', 'data', 'unique_audience.json')

# Call the function to extract and save audience strings; formatted_events is
# whatever the load step of this cell bound it to
extract_and_save_audience(formatted_events, audience_output_file_path)


Unique audience strings have been saved to DateGeniusAI/data/unique_audience.json


In [8]:
import json
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from dotenv import load_dotenv

In [9]:
# Load environment variables via python-dotenv, then build the module-level
# OpenAI client that find_event uses. os.environ.get returns None when
# OPENAI_API_KEY is not set.
load_dotenv()
llm = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [10]:
def clean_html_tags(text):
    """Replace every ``<...>`` tag in ``text`` with a single space.

    Non-string values are returned unchanged. Matching is non-greedy and
    stays within one line, because ``.`` does not match a newline here.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'<.*?>', ' ', text)

def prefilter_events(events, query):
    """Keep the events that share at least one term with ``query``.

    An event is kept when any of its keywords ("Mots clés"), or any
    whitespace-separated word of its "Titre" or "Description", occurs as a
    case-insensitive substring of the query.

    "Mots clés" may be a comma-separated string or a list of strings (the
    form written by ``format_events_for_prompt``). Keywords are trimmed and
    empty terms are ignored; an empty term is a substring of every query and
    would otherwise make every event match.

    Returns:
        The list of matching events, or ``events`` itself when nothing
        matches, so later stages always have candidates.
    """
    query_lower = query.lower()

    def _terms(event):
        keywords = event.get('Mots clés') or []
        if isinstance(keywords, str):
            keywords = keywords.split(',')
        for keyword in keywords:
            yield str(keyword).strip()
        # Missing/None title or description contributes no terms.
        yield from str(event.get('Titre') or '').split()
        yield from str(event.get('Description') or '').split()

    filtered_events = [
        event for event in events
        if any(term and term.lower() in query_lower for term in _terms(event))
    ]
    return filtered_events if filtered_events else events

def preprocess_events(events):
    """Attach a ``combined_text`` field to each event and fit a TF-IDF model.

    ``combined_text`` is "Titre", "Description" and "Mots clés" (empty string
    when missing) joined by spaces and passed through ``clean_html_tags``.
    The event dicts are mutated in place.

    Returns:
        ``(vectorizer, tfidf_matrix, events)``: the fitted TfidfVectorizer,
        the matrix returned by ``fit_transform`` over the combined texts (one
        entry per event, in order), and the same ``events`` list.
    """
    texts = []
    for event in events:
        raw_text = f"{event['Titre']} {event['Description']} {event.get('Mots clés', '')}"
        event['combined_text'] = clean_html_tags(raw_text)
        texts.append(event['combined_text'])

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return vectorizer, tfidf_matrix, events

def get_top_relevant_events(query, vectorizer, tfidf_matrix, events, top_n=20):
    """Return up to ``top_n`` events ranked by cosine similarity to ``query``.

    Args:
        query: the user's query text.
        vectorizer: fitted vectorizer whose ``transform`` maps text into the
            same feature space as ``tfidf_matrix``.
        tfidf_matrix: one row per entry of ``events``, in the same order.
        events: the candidate events.
        top_n: maximum number of events to return; values <= 0 yield ``[]``.

    Returns:
        The best-matching events, most similar first.
    """
    # Guard the slice below: with top_n == 0, `[-0:]` selects every element
    # rather than none.
    if top_n <= 0:
        return []
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_n indices and reverse them.
    relevant_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [events[idx] for idx in relevant_indices]

def generate_prompt(query, suggested_events):
    """Render the LLM prompt for ``query`` and the candidate events.

    Each event becomes one line listing its ID, title, start/end date and
    location fields; a missing field raises KeyError. The lines are embedded,
    together with the query, in a fixed English instruction template that
    asks for the best matching event.
    """
    event_lines = []
    for event in suggested_events:
        event_lines.append(
            f"ID: {event['ID']}, Title: {event['Titre']}, "
            f"Date: {event['Date de début']} to {event['Date de fin']}, "
            f"Location: {event['Nom du lieu']}, {event['Adresse du lieu']}, "
            f"{event['Code postal']}, {event['Ville']}"
        )
    events_text = "\n".join(event_lines)

    # The template is spelled out piece by piece so that its exact
    # whitespace (4-space indent, blank separator lines) is visible.
    return (
        "\n"
        "    You are an assistant that helps people find events in Paris. Based on the query below, suggest the best matching event from the list.\n"
        "    \n"
        f"    Query: {query}\n"
        "    \n"
        "    Events:\n"
        f"    {events_text}\n"
        "\n"
        "    Suggested Event:\n"
        "    "
    )

def find_event(query, events, vectorizer, tfidf_matrix, top_n=20):
    """Ask the LLM for the best event among the top TF-IDF matches.

    Args:
        query: the user's query text.
        events: candidate events aligned with the rows of ``tfidf_matrix``.
        vectorizer: fitted vectorizer for the query.
        tfidf_matrix: TF-IDF matrix of ``events``.
        top_n: number of candidates passed to the LLM.

    Returns:
        ``(response, relevant_events)``: the LLM's text answer and the
        candidate events that were listed in the prompt.
    """
    relevant_events = get_top_relevant_events(query, vectorizer, tfidf_matrix, events, top_n)
    prompt = generate_prompt(query, relevant_events)

    # The prompt is already fully rendered, so it goes to the model as-is; the
    # former "{prompt}" PromptTemplate returned its input unchanged.
    # invoke() replaces calling the LLM object directly, which LangChain has
    # deprecated.
    response = llm.invoke(prompt)

    return response, relevant_events

def load_formatted_events(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        events = json.load(handle)
    return events

def save_filtered_events(filtered_events, file_path):
    """Write ``filtered_events`` to ``file_path`` as UTF-8 JSON.

    Non-ASCII characters are written as-is and the output is indented by 2.
    An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(filtered_events, handle, indent=2, ensure_ascii=False)

def find_and_generate_prompt(user_query, events, top_n=20, filtered_events_path='filtered_events.json'):
    """Run the full suggestion pipeline for one user query.

    Steps: pre-filter ``events`` against the query, build the TF-IDF model
    over the remaining events, ask the LLM to pick among the ``top_n`` best
    matches, and save those candidates as JSON.

    Args:
        user_query: the user's query text.
        events: list of event dicts.
        top_n: number of candidates passed to the LLM.
        filtered_events_path: where the candidates are saved; defaults to the
            previously hard-coded 'filtered_events.json'.

    Returns:
        The LLM's answer.
    """
    filtered_events = prefilter_events(events, user_query)
    vectorizer, tfidf_matrix, processed_events = preprocess_events(filtered_events)
    suggested_event, relevant_events = find_event(user_query, processed_events, vectorizer, tfidf_matrix, top_n)

    # Keep the candidates shown to the LLM on disk for inspection.
    save_filtered_events(relevant_events, filtered_events_path)

    return suggested_event

# NOTE(review): this path starts at 'data/', while the earlier cells write to
# 'DateGeniusAI/data/'; they only refer to the same file if the working
# directory differs between cells — confirm.
formatted_events_file_path = 'data/formatted_xls_events.json'
formatted_events = load_formatted_events(formatted_events_file_path)

# Example usage: runs the whole pipeline (including the LLM call) for one query
user_query = "J'aimerai bien voir un truque sportive"
suggested_event = find_and_generate_prompt(user_query, formatted_events)
print("Suggested Event:", suggested_event)

Suggested Event: 
    ID: 56259, Title: Paris Sportives : activités football et marche sportive par LES ENFANTS DE LA GOUTTE D'OR, Date: 2024-01-01T00:00:00+00:00 to 2024-12-31T23:59:59+00:00, Location: Square Léon, 20 Rue des Gardes, 75018, Paris


In [7]:
# Optional: Integrate with a Flask application for deployment

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/chatbot', methods=['POST'])
def chatbot():
    """Handle POST /chatbot.

    Expects a JSON object with a non-empty string under ``"message"`` and
    answers ``{"response": ...}``. Any other body (not JSON, not an object,
    missing or blank message) gets a 400 response with an ``"error"`` field
    instead of reaching the response generator.
    """
    # silent=True returns None for a missing or invalid JSON body instead of
    # raising, so every bad-input case goes through the same check below.
    payload = request.get_json(silent=True)
    user_input = payload.get('message') if isinstance(payload, dict) else None
    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({'error': "Request body must be a JSON object with a non-empty 'message' string."}), 400

    # NOTE(review): generate_response is not defined in any visible cell of
    # this notebook; it must be defined or imported before the server starts.
    response = generate_response(user_input)
    return jsonify({'response': response})

# Starts the development server on port 5000; in a notebook this call blocks
# the cell until the server is interrupted.
if __name__ == '__main__':
    app.run(port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
