In [22]:
import pandas as pd
import json
import re

def find_and_replace_ambiguous_unicode(text, replacement=""):
    # Regex pattern to match non-ASCII characters
    non_ascii_pattern = re.compile(r'[^\x00-\x7F]')
    return non_ascii_pattern.sub(replacement, text)

def clean_data(data, replacement=""):
    cleaned_data = []
    for item in data:
        cleaned_item = {k: find_and_replace_ambiguous_unicode(str(v), replacement) for k, v in item.items()}
        cleaned_data.append(cleaned_item)
    return cleaned_data

def format_events_for_prompt(events):
    formatted_events = [
        {
            "ID": event.get('ID', ''),
            "URL": event.get('URL', ''),
            "Titre": event.get('Titre', ''),
            "Description": event.get('Description', ''),
            "Date de début": event.get('Date de début', ''),
            "Date de fin": event.get('Date de fin', ''),
            "Nom du lieu": event.get('Nom du lieu', ''),
            "Adresse du lieu": event.get('Adresse du lieu', ''),
            "Code postal": event.get('Code postal', ''),
            "Ville": event.get('Ville', ''),
            "Coordonnées géographiques": event.get('Coordonnées géographiques', '')
        }
        for event in events
    ]
    return formatted_events

def convert_and_clean_excel(input_file_path, output_file_path, formatted_output_file_path):
    # Load the Excel file
    df = pd.read_excel(input_file_path)

    # Convert the DataFrame to a dictionary
    data = df.to_dict(orient='records')

    # Clean the data to remove ambiguous Unicode characters
    cleaned_data = clean_data(data)

    # Save the cleaned data to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(cleaned_data, json_file, ensure_ascii=False, indent=2)

    print(f"Excel file has been cleaned and saved to {output_file_path}")

    # Format the events
    formatted_events = format_events_for_prompt(cleaned_data)

    # Save the formatted events to a new JSON file
    with open(formatted_output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(formatted_events, json_file, ensure_ascii=False, indent=2)

    print(f"Formatted events have been saved to {formatted_output_file_path}")

# File paths
input_file_path = 'que-faire-a-paris-.xlsx'
cleaned_output_file_path = 'cleaned_xlsx_data.json'
formatted_output_file_path = 'formatted_xls_events.json'

# Convert, clean, and format the Excel file
convert_and_clean_excel(input_file_path, cleaned_output_file_path, formatted_output_file_path)


ModuleNotFoundError: No module named 'sklearn'

In [4]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import os
from dotenv import load_dotenv

In [5]:
# Import configuration
load_dotenv()

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

In [9]:
# Load formatted events from the JSON file
def load_formatted_events(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
        
# Clean HTML tags from text
def clean_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, ' ', text) if isinstance(text, str) else text
    
# Preprocess events to create a TF-IDF matrix
def preprocess_events(events):
    for event in events:
        event['combined_text'] = f"{event['Titre']} {event['Description']} {event.get('Mots clés', '')}"
        event['combined_text'] = clean_html_tags(event['combined_text'])
    texts = [event['combined_text'] for event in events]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return vectorizer, tfidf_matrix, events

# Get top N relevant events based on query
def get_top_relevant_events(query, vectorizer, tfidf_matrix, events, top_n=10):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    relevant_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [events[idx] for idx in relevant_indices]

# Generate the prompt with limited events
def generate_prompt(query, relevant_events):
    events_text = "\n".join([
        f"ID: {event['ID']}, Title: {event['Titre']}, Date: {event['Date de début']} to {event['Date de fin']}, Location: {event['Nom du lieu']}, {event['Adresse du lieu']}, {event['Code postal']}, {event['Ville']}" 
        for event in relevant_events
    ])
    prompt = f"""
    You are an assistant that helps people find events in Paris. Based on the query below, suggest the best matching event from the list.
    
    Query: {query}
    
    Events:
    {events_text}

    Suggested Event:
    """
    return prompt

# Find the best matching event
def find_event(query, events, vectorizer, top_n=10):
    relevant_events = get_top_relevant_events(query, events, vectorizer, top_n)
    prompt = generate_prompt(query, relevant_events)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that helps people find events in Paris."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150
    )
    return response.choices[0].message.content

# Path to your formatted JSON file
formatted_events_file_path = 'formatted_xlsx_events.json'

# Load and preprocess the data
formatted_events = load_formatted_events(formatted_events_file_path)
vectorizer, tfidf_matrix, processed_events = preprocess_events(formatted_events)

# Example usage
user_query = "I'd like to go to an expoisition on modern art"
suggested_event = find_event(user_query, vectorizer, tfidf_matrix, processed_events)
print("Suggested Event:", suggested_event)

Suggested Event: Suggested Event:
ID: 63487, Title: "L'Anne d'avant" (The Year Before): an exhibition in the 10th arrondissement of Paris takes you right to the heart of French athletes' preparation., Date: 2024-07-26T02:00:00+02:00 to 2024-09-09T01:59:59+02:00, Location: TEP de la Grange-aux-Belles / Agnes Tirop, 8 rue Georg Friedrich Haendel, 75010, Paris

This event might not be specifically about modern art, but it involves an exhibition related to French athletes' preparation, which could also be of interest if you


In [None]:
# Optional: Integrate with a Flask application for deployment

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/chatbot', methods=['POST'])
def chatbot():
    user_input = request.json.get('message')
    response = generate_response(user_input)
    return jsonify({'response': response})

if __name__ == '__main__':
    app.run(port=5000)