In [1]:
import os

import mlflow

# Point MLflow at the tracking server and select the experiment that every run
# in this notebook is logged under. The server address can be overridden with
# the MLFLOW_TRACKING_URI environment variable; the previously hard-coded
# address is kept as the default so existing setups behave the same.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://3.145.180.106:5000"))
mlflow.set_experiment("Cincinnati Crime Chatbot")

# Not used directly in this cell; importing them here surfaces a missing
# installation before any data or model is loaded.
import transformers
import huggingface_hub

print("✅ All libraries imported successfully!")


✅ All libraries imported successfully!


In [2]:
# --- Imports ---
import pandas as pd
import time
from transformers import pipeline
from huggingface_hub import hf_hub_download
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)


# --- Load Cincinnati crime data from Hugging Face ---
def load_crime_data(repo_id="mlsystemsg1/cincinnati-crime-data",
                    filename="calls_for_service_latest.csv"):
    """Download a CSV from a Hugging Face dataset repository and load it.

    Args:
        repo_id: Hub dataset repository to download from. Defaults to the
            Cincinnati crime dataset this notebook was written for.
        filename: Name of the CSV file inside that repository.

    Returns:
        pandas.DataFrame holding the CSV contents, with every column label
        converted to lower case.
    """
    csv_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=filename,
    )
    crime_df = pd.read_csv(csv_path, low_memory=False)

    # The rest of the notebook looks columns up by lower-case name.
    crime_df.columns = crime_df.columns.str.lower()
    return crime_df

# --- Filter relevant rows based on question ---
def _longest_mention(values, text):
    """Return the longest string in `values` that occurs in `text`, or None."""
    best = None
    for value in values:
        # Skip non-text and blank entries: a blank string is "in" every question.
        if not isinstance(value, str) or not value.strip():
            continue
        if value.lower() in text and (best is None or len(value) > len(best)):
            best = value
    return best


def get_relevant_rows(question, df, num_rows=10):
    """Select the rows of `df` that best match a free-text question.

    The question is compared, case-insensitively, against the distinct values
    of the 'sna_neighborhood' and 'incident_type_desc' columns (when present).
    For each column the longest value mentioned in the question is used, so
    "east westwood" is not shadowed by the shorter "westwood". Rows are kept
    when the column contains that value as a literal substring.

    If the question contains "last", "latest" or "recent" and the
    'create_time_incident' column exists, rows are ordered newest first using
    parsed timestamps; values that cannot be parsed are placed last.

    Args:
        question: User question as a string.
        df: DataFrame to search. It is not modified.
        num_rows: Maximum number of rows to return.

    Returns:
        A new DataFrame with at most `num_rows` rows. Because the result is
        truncated, anything computed from it (for example a row count) reflects
        at most `num_rows` rows.
    """
    q = question.lower()
    # Boolean-mask filtering below already yields new frames, so the input
    # does not need to be copied up front.
    filtered = df

    for column in ("sna_neighborhood", "incident_type_desc"):
        if column not in df.columns:
            continue
        mention = _longest_mention(df[column].dropna().unique(), q)
        if mention is None:
            continue
        # regex=False: the value comes from the data and may contain regex
        # metacharacters such as '.', '(' or '/'.
        mask = filtered[column].str.contains(mention, case=False, na=False, regex=False)
        filtered = filtered[mask]

    wants_recent = any(word in q for word in ("last", "latest", "recent"))
    if wants_recent and "create_time_incident" in df.columns:
        # Sort on parsed timestamps; comparing the raw strings would order
        # e.g. "12/01/2023" after "01/05/2024".
        filtered = filtered.sort_values(
            by="create_time_incident",
            ascending=False,
            key=lambda col: pd.to_datetime(col, errors="coerce"),
        )

    # Copy the (small) result so callers never hold a view into `df`.
    return filtered.head(num_rows).copy()


# --- Summarize or extract information ---
def generate_summary(question, filtered_df):
    """Build a short text summary of `filtered_df` tailored to the question.

    The checks are applied in order:
      1. An empty frame yields a fixed "no records" message.
      2. A question containing "how many", "number of" or "count" yields the
         number of rows in `filtered_df`.
      3. A question containing "most common" yields the most frequent
         'incident_type_desc' value and its count, provided the column exists
         and has at least one non-missing value.
      4. Otherwise, one sentence per row is produced for the first 10 rows.

    Args:
        question: User question as a string.
        filtered_df: DataFrame of candidate rows.

    Returns:
        The summary as a string; example sentences are newline-separated.
    """
    q = question.lower()

    if filtered_df.empty:
        return "No matching records found."

    if any(phrase in q for phrase in ("how many", "number of", "count")):
        return f"{len(filtered_df)} incidents matched your query."

    if "most common" in q and 'incident_type_desc' in filtered_df.columns:
        counts = filtered_df['incident_type_desc'].value_counts()
        # value_counts() drops missing values, so it is empty when every
        # description is missing; fall through to the examples in that case
        # instead of letting idxmax() raise.
        if not counts.empty:
            return f"The most common crime is {counts.idxmax()} with {counts.max()} incidents."

    # Fallback: describe up to ten example incidents. Columns absent from the
    # frame are reported as 'N/A'.
    examples = []
    for _, row in filtered_df.head(10).iterrows():
        date = row.get('create_time_incident', 'N/A')
        offense = row.get('incident_type_desc', 'N/A')
        neighborhood = row.get('sna_neighborhood', 'N/A')
        incident = row.get('event_number', 'N/A')
        examples.append(f"On {date}, a {offense} occurred in {neighborhood} (Incident #{incident}).")
    return "\n".join(examples)

# --- Use the LLM to generate an answer and log with MLflow ---
def answer_with_llm(question, data_rows, model, model_name="google/flan-t5-base",
                    prompt_version="v3", max_new_tokens=150):
    """Answer a question with the LLM and record the exchange in MLflow.

    A prompt is built from the summary of `data_rows` and the question, sent
    to `model`, and the stripped generated text is returned. Each call that
    reaches the model opens one MLflow run and logs the parameters, the prompt,
    the answer, the answer length and the generation latency.

    Args:
        question: User question as a string.
        data_rows: DataFrame of rows used as context. If it is empty, a fixed
            apology is returned and nothing is logged.
        model: Callable invoked as `model(prompt, max_new_tokens=...)`; the
            first item of its result must expose a 'generated_text' key.
        model_name: Label logged as the "model_name" parameter.
        prompt_version: Label logged as the "prompt_version" parameter.
        max_new_tokens: Generation length limit passed to `model` and logged.

    Returns:
        The generated answer with surrounding whitespace removed, or the
        apology string when `data_rows` is empty.
    """
    if data_rows.empty:
        return "Sorry, I couldn't find any data matching that question."

    context = generate_summary(question, data_rows)
    prompt = f"""
You are a helpful assistant analyzing Cincinnati crime data.

Here are some relevant data points:
{context}

Now answer this question based on the above:
{question}
    """.strip()

    with mlflow.start_run():
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("max_new_tokens", max_new_tokens)
        mlflow.log_param("num_rows_context", len(data_rows))
        mlflow.log_param("prompt_version", prompt_version)
        mlflow.log_param("question", question)
        mlflow.log_text(prompt, "prompt.txt")

        # Time only the generation call so the metric is not inflated by run
        # creation and the logging round-trips to the tracking server.
        started = time.perf_counter()
        result = model(prompt, max_new_tokens=max_new_tokens)[0]['generated_text']
        latency = time.perf_counter() - started
        answer = result.strip()

        mlflow.log_text(answer, "answer.txt")
        mlflow.log_metric("response_length", len(answer))
        mlflow.log_metric("latency_sec", latency)

        return answer

# --- Chatbot loop ---
def run_single_chatbot_turn(df, model, question):
    """Handle one user message and return the bot's reply.

    "exit" and "quit" (in any letter case) end the conversation with a
    farewell; any other message is answered from the rows of `df` selected
    for it, using `model`.
    """
    wants_to_leave = question.lower() in ('exit', 'quit')
    if not wants_to_leave:
        matching_rows = get_relevant_rows(question, df)
        return answer_with_llm(question, matching_rows, model)
    return "Goodbye!"

import ipywidgets as widgets
from IPython.display import display

def ask_question_with_widget(df, model):
    """Display a text box and an output area for chatting about the data.

    Each submitted question is answered with `run_single_chatbot_turn` and the
    reply is printed into the output area, replacing the previous reply.

    Args:
        df: DataFrame passed through to `run_single_chatbot_turn`.
        model: Model callable passed through to `run_single_chatbot_turn`.
    """
    input_box = widgets.Text(
        placeholder='Ask a question about crime in Cincinnati...',
        description='Question:',
        layout=widgets.Layout(width='100%'),
        # Without this, `value` changes on every keystroke and each typed
        # character would trigger a full model call. With it, the value is
        # only published when the user presses Enter or leaves the field.
        continuous_update=False,
    )

    output = widgets.Output()

    def on_submit(change):
        question = change['new'].strip()
        if not question:
            # Clearing the box also changes `value`; there is nothing to answer.
            return
        output.clear_output()
        # Run the turn inside the capture so anything it prints, including
        # error tracebacks, shows up in the output area.
        with output:
            answer = run_single_chatbot_turn(df, model, question)
            print("Bot:", answer)

    input_box.observe(on_submit, names='value')
    display(input_box, output)


# --- Load data and model ---
# Runs when the cell executes: fetches the dataset into `df` and builds the
# text2text-generation pipeline in `llm_model`. Both are notebook-level names
# that are handed to the widget below.
print("Loading dataset and model...")
df = load_crime_data()
print("Columns in dataset:", df.columns.tolist())  # shows the lower-cased column names the filters rely on
# NOTE: the model id here is the same string as the `model_name` default that
# answer_with_llm logs to MLflow; keep the two in sync when changing models.
llm_model = pipeline("text2text-generation", model="google/flan-t5-base")

# --- Launch chatbot widget ---
# Displays the question box and output area; answers are produced by the
# callback registered inside ask_question_with_widget.
print("\n Chatbot is ready! Ask your question below:")
ask_question_with_widget(df, llm_model)


Loading dataset and model...
Columns in dataset: ['address_x', 'agency', 'create_time_incident', 'disposition_text', 'event_number', 'incident_type_id', 'incident_type_desc', 'priority', 'priority_color', 'arrival_time_primary_unit', 'closed_time_incident', 'dispatch_time_primary_unit', 'beat', 'district', 'cpd_neighborhood', 'community_council_neighborhood', 'latitude_x', 'longitude_x', 'sna_neighborhood']


Device set to use cpu



 Chatbot is ready! Ask your question below:


Text(value='', description='Question:', layout=Layout(width='100%'), placeholder='Ask a question about crime i…

Output()

In [3]:
# Sanity check: show which tracking server and experiment the runs go to.
print("Tracking to:", mlflow.get_tracking_uri())

exp = mlflow.get_experiment_by_name("Cincinnati Crime Chatbot")
if exp is None:
    # The lookup yields None when no experiment has this name on the server.
    print("Experiment 'Cincinnati Crime Chatbot' was not found on the tracking server.")
else:
    print("Experiment ID:", exp.experiment_id)
    print("Artifact Location:", exp.artifact_location)


Tracking to: http://3.145.180.106:5000
Experiment ID: 599682738248006782
Artifact Location: mlflow-artifacts:/599682738248006782
