In [1]:
import pandas as pd
import json
import re
from bs4 import BeautifulSoup

# Characters kept by find_and_replace_ambiguous_unicode in addition to ASCII
# letters, digits and whitespace: accented French letters plus a few
# punctuation marks.
french_chars = "àâäçéèêëîïôùûüÿÀÂÄÇÉÈÊËÎÏÔÙÛÜŸ.:-/+’',"

def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

def find_and_replace_ambiguous_unicode(text, replacement=""):
    """Replace every character outside the allowed set with ``replacement``.

    Newlines and tabs are first turned into spaces. The allowed set is ASCII
    letters, digits, whitespace and the characters in ``french_chars``.

    Args:
        text: String to sanitise.
        replacement: Text substituted for each disallowed character
            (default: remove it).

    Returns:
        The sanitised string.
    """
    flattened = text.replace('\n', ' ').replace('\t', ' ')
    # french_chars is escaped so that '-', '.', '+' etc. are literal inside the class.
    disallowed = r'[^a-zA-Z0-9\s' + re.escape(french_chars) + r']'
    return re.sub(disallowed, replacement, flattened)

def clean_data(data, replacement=""):
    """Strip HTML and disallowed characters from every value of every record.

    Args:
        data: Iterable of dicts (one per row / event).
        replacement: Text substituted for each disallowed character; passed
            through to ``find_and_replace_ambiguous_unicode``.

    Returns:
        A new list of dicts with the same keys. Every value is a string;
        missing values (``None``, NaN, NaT) become the empty string.
    """
    def _clean_value(value):
        # str(NaN) would otherwise write the literal text "nan" into the
        # output for every empty spreadsheet cell.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return find_and_replace_ambiguous_unicode(remove_html_tags(str(value)), replacement)

    return [
        {key: _clean_value(value) for key, value in item.items()}
        for item in data
    ]

def format_events_for_prompt(events):
    """Project each event onto the fixed set of fields used downstream.

    Args:
        events: Iterable of dicts.

    Returns:
        A list of dicts, one per event, containing exactly the fields listed
        below (in this order). A field absent from the event is filled
        with ''. Any other keys of the event are dropped.
    """
    prompt_fields = (
        "ID",
        "URL",
        "Titre",
        "Description",
        "Date de début",
        "Date de fin",
        "Nom du lieu",
        "Adresse du lieu",
        "Code postal",
        "Mots clés",
        "Ville",
        "Coordonnées géographiques",
    )
    return [
        {field: event.get(field, '') for field in prompt_fields}
        for event in events
    ]


def convert_and_clean_excel(input_file_path, output_file_path, formatted_output_file_path):
    """Convert an Excel sheet into two JSON files: cleaned rows and prompt-ready rows.

    Args:
        input_file_path: Excel file read with ``pd.read_excel``.
        output_file_path: Destination for all columns after ``clean_data``.
        formatted_output_file_path: Destination for the subset of fields
            selected by ``format_events_for_prompt``.

    Both files are written as UTF-8 JSON with non-ASCII characters kept
    as-is and a 2-space indent. A confirmation line is printed after each write.
    """
    def _dump_json(payload, path):
        # Shared writer so both outputs use identical JSON settings.
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    records = pd.read_excel(input_file_path).to_dict(orient='records')
    cleaned_records = clean_data(records)

    _dump_json(cleaned_records, output_file_path)
    print(f"Excel file has been cleaned and saved to {output_file_path}")

    _dump_json(format_events_for_prompt(cleaned_records), formatted_output_file_path)
    print(f"Formatted events have been saved to {formatted_output_file_path}")

# Input/output locations, relative to the notebook's working directory.
input_file_path = 'data/que-faire-a-paris-2.xlsx'
cleaned_output_file_path = 'data/cleaned_xlsx_data.json'
formatted_output_file_path = 'data/formatted_xls_events.json'
# Run the Excel -> cleaned JSON -> prompt-ready JSON pipeline; writes both output files.
convert_and_clean_excel(input_file_path, cleaned_output_file_path, formatted_output_file_path)


  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


Excel file has been cleaned and saved to data/cleaned_xlsx_data.json
Formatted events have been saved to data/formatted_xls_events.json


In [2]:
import json
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from dotenv import load_dotenv

In [3]:
# Load environment variables via python-dotenv before the key lookup below.
load_dotenv()
# The API key is read from the environment rather than hard-coded;
# os.environ.get returns None when OPENAI_API_KEY is not set.
llm = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

  warn_deprecated(


In [6]:
def clean_html_tags(text):
    """Replace each ``<...>`` tag in ``text`` with a single space.

    Non-string inputs are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Non-greedy so that each tag is matched on its own rather than the
    # whole span from the first '<' to the last '>'.
    return re.sub('<.*?>', ' ', text)

def prefilter_events(events, query):
    """Keep the events that share at least one term with the query.

    An event's terms are its comma-separated 'Mots clés' entries plus the
    whitespace-separated words of its 'Titre' and 'Description'. A term
    matches when its lower-cased form occurs as a substring of the
    lower-cased query, so very short words match many queries.

    Args:
        events: List of event dicts.
        query: Free-text user query.

    Returns:
        The matching events, or ``events`` unchanged when nothing matches.
    """
    query_lower = query.lower()

    def _terms(event):
        # `or ''` tolerates a missing key or an explicit None value.
        for keyword in (event.get('Mots clés') or '').split(','):
            # Entries after a comma usually carry a leading space.
            yield keyword.strip()
        yield from (event.get('Titre') or '').split()
        yield from (event.get('Description') or '').split()

    filtered_events = [
        event for event in events
        # Skip empty terms: '' is a substring of every query, so an event with
        # no keywords would otherwise always pass the filter.
        if any(term and term.lower() in query_lower for term in _terms(event))
    ]
    return filtered_events if filtered_events else events

def preprocess_events(events):
    """Attach a searchable text to each event and fit a TF-IDF model on them.

    For every event, 'Titre', 'Description' and 'Mots clés' (empty if absent)
    are joined with spaces, passed through ``clean_html_tags`` and stored
    under the new key 'combined_text'. Note that the event dicts are modified
    in place.

    Returns:
        ``(vectorizer, tfidf_matrix, events)``: the fitted TfidfVectorizer,
        the matrix from ``fit_transform`` over the combined texts (one per
        event, in order), and the same ``events`` list.
    """
    corpus = []
    for event in events:
        raw_text = f"{event['Titre']} {event['Description']} {event.get('Mots clés', '')}"
        event['combined_text'] = clean_html_tags(raw_text)
        corpus.append(event['combined_text'])

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return vectorizer, tfidf_matrix, events

def get_top_relevant_events(query, vectorizer, tfidf_matrix, events, top_n=20):
    """Return the ``top_n`` events most similar to ``query``, best first.

    Args:
        query: Free-text user query.
        vectorizer: Fitted vectorizer whose ``transform`` maps text into the
            same space as ``tfidf_matrix``.
        tfidf_matrix: Matrix with one row per entry of ``events``.
        events: Events, indexable by the row positions of ``tfidf_matrix``.
        top_n: Maximum number of events to return.

    Returns:
        Up to ``top_n`` events ordered by descending cosine similarity;
        an empty list when ``top_n`` is zero or negative.
    """
    # Guard: with top_n == 0 the slice [-0:] below would select *every* row,
    # and a negative top_n would slice from an arbitrary offset.
    if top_n <= 0:
        return []
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_n indices, then reverse for best-first.
    relevant_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [events[idx] for idx in relevant_indices]

def generate_prompt(query, suggested_events):
    """Build the LLM prompt listing the candidate events for ``query``.

    Each event contributes one line with its ID, title, start/end date and
    location (venue name, address, postal code, city). Every one of those
    keys must be present in the event dict; a missing key raises KeyError.

    Returns:
        The prompt text: instructions, the query, the event lines, and a
        trailing "Suggested Event:" cue.
    """
    event_lines = []
    for event in suggested_events:
        event_lines.append(
            f"ID: {event['ID']}, Title: {event['Titre']}, Date: {event['Date de début']} to {event['Date de fin']}, Location: {event['Nom du lieu']}, {event['Adresse du lieu']}, {event['Code postal']}, {event['Ville']}"
        )
    events_text = "\n".join(event_lines)
    return f"""
    You are an assistant that helps people find events in Paris. Based on the query below, suggest the best matching event from the list.
    
    Query: {query}
    
    Events:
    {events_text}

    Suggested Event:
    """

def find_event(query, events, vectorizer, tfidf_matrix, top_n=20):
    """Rank events against ``query`` and ask the LLM to pick the best one.

    Args:
        query: Free-text user query.
        events: Events aligned with the rows of ``tfidf_matrix``.
        vectorizer: Fitted vectorizer used to embed the query.
        tfidf_matrix: TF-IDF matrix of the events.
        top_n: Number of top-ranked events offered to the LLM.

    Returns:
        ``(response, relevant_events)``: the LLM's reply and the ranked
        candidate events that were included in the prompt.
    """
    relevant_events = get_top_relevant_events(query, vectorizer, tfidf_matrix, events, top_n)
    prompt = generate_prompt(query, relevant_events)

    # Calling the LLM object directly (`llm(prompt)`) is deprecated in
    # LangChain; `invoke` is the supported entry point. The prompt is already
    # fully rendered by generate_prompt, so no template step is needed.
    response = llm.invoke(prompt)

    return response, relevant_events

def load_formatted_events(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def save_filtered_events(filtered_events, file_path):
    """Write ``filtered_events`` to ``file_path`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by two spaces. An existing file is overwritten.
    """
    json_options = {"ensure_ascii": False, "indent": 2}
    with open(file_path, 'w', encoding='utf-8') as handle:
        json.dump(filtered_events, handle, **json_options)

def find_and_generate_prompt(user_query, events, top_n=20, output_path='filtered_events.json'):
    """Run the full suggestion pipeline for one user query.

    Steps: keyword pre-filter -> TF-IDF fit on the remaining events ->
    similarity ranking -> LLM pick among the ``top_n`` candidates.

    Args:
        user_query: Free-text user query.
        events: List of event dicts.
        top_n: Number of ranked candidates offered to the LLM.
        output_path: Where the ranked candidates are saved as JSON.
            Defaults to 'filtered_events.json' in the working directory;
            pass ``None`` to skip writing the file.

    Returns:
        The LLM's suggested event text.
    """
    filtered_events = prefilter_events(events, user_query)
    vectorizer, tfidf_matrix, processed_events = preprocess_events(filtered_events)
    suggested_event, relevant_events = find_event(user_query, processed_events, vectorizer, tfidf_matrix, top_n)

    if output_path is not None:
        save_filtered_events(relevant_events, output_path)

    return suggested_event

# Load the prompt-ready events from the JSON file at this path.
formatted_events_file_path = 'data/formatted_xls_events.json'
formatted_events = load_formatted_events(formatted_events_file_path)

# Example usage: one end-to-end query. This calls the LLM and also writes
# 'filtered_events.json' (the ranked candidates) to the working directory.
user_query = "J'aimerai bien voir un truque sportive"
suggested_event = find_and_generate_prompt(user_query, formatted_events)
print("Suggested Event:", suggested_event)

  warn_deprecated(


Suggested Event: 
    ID: 56259, Title: Paris Sportives : activités football et marche sportive par LES ENFANTS DE LA GOUTTE D'OR, Date: 2024-01-01T00:00:00+00:00 to 2024-12-31T23:59:59+00:00, Location: Square Léon, 20 Rue des Gardes, 75018, Paris


In [7]:
# Optional: Integrate with a Flask application for deployment

from flask import Flask, request, jsonify

# Flask application object; routes are registered on it below.
app = Flask(__name__)

@app.route('/chatbot', methods=['POST'])
def chatbot():
    """Answer a chat message with an event suggestion.

    Expects a JSON body of the form ``{"message": "<user query>"}``.

    Returns:
        ``{"response": <suggestion>}`` on success, or
        ``{"error": ...}`` with HTTP 400 when the body is not a JSON object
        containing a non-empty string 'message'.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON body.
    payload = request.get_json(silent=True)
    user_input = payload.get('message') if isinstance(payload, dict) else None
    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty 'message' string."}), 400

    # Use the pipeline defined in this notebook; `generate_response` is not
    # defined in any cell, so calling it raises NameError on a fresh kernel.
    response = find_and_generate_prompt(user_input, formatted_events)
    return jsonify({'response': response})

# Starts Flask's built-in server on port 5000. In a notebook this call blocks
# the cell until the server is interrupted.
if __name__ == '__main__':
    app.run(port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
