In [None]:
from googletrans import Translator
import pandas as pd
from tqdm import tqdm
import logging
from transformers import pipeline


# 1 Translation Pipeline

In [None]:


# Translation stage: read the raw activities, translate the name and the
# description of each one to English, and store a ready-to-classify prompt.
#
# NOTE(review): this cell assumes the synchronous googletrans API, i.e. that
# `translator.translate(...)` returns an object exposing `.text`. Newer
# googletrans releases reportedly expose `translate` as a coroutine — confirm
# the installed version before re-running.

# Initialize Google Translator
translator = Translator()


def _build_prompt(name, description):
    """Return the classification prompt for one activity.

    This is the single place where the prompt wording lives, so the translated
    and the untranslated (fallback) prompts can never drift apart.
    Values are interpolated as-is; a missing cell (NaN) therefore renders as
    'nan', exactly as it did in the previous fallback path.
    """
    return (f"This is an event called '{name}', part of a broader effort during Mobility Week "
            f"to promote sustainable mobility by local governments. Its complete description is: '{description}'")


def _translate_or_keep(text, row_label, field):
    """Translate `text` to English, falling back to the original value.

    Args:
        text: Cell value to translate. Non-string or blank values (e.g. NaN for
            a missing cell) are returned untouched without calling the service.
        row_label: DataFrame index label of the row, used only in the error message.
        field: Column name, used only in the error message.

    Returns:
        The translated string, or `text` unchanged when translation is skipped
        or fails.
    """
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        return translator.translate(text, dest='en').text
    except Exception as e:
        # Best-effort: keep the original text. tqdm.write keeps the progress
        # bar intact, unlike a bare print inside the loop.
        tqdm.write(f"Error translating {field} in row {row_label}: {e}")
        return text


# Step 1: Load the CSV file
df = pd.read_csv('../data/activities.csv')

# Step 2: Build the 'prompt' column. Each field is translated independently so
# that a failure on one field does not throw away the other field's translation.
prompts = []
for i, row in tqdm(df.iterrows(), total=len(df), desc="Translating and creating prompts"):
    translated_name = _translate_or_keep(row['activity_name'], i, 'activity_name')
    translated_desc = _translate_or_keep(row['activity_description'], i, 'activity_description')
    prompts.append(_build_prompt(translated_name, translated_desc))

# Step 3: Add the 'prompt' column to the DataFrame
df['prompt'] = prompts

# Step 4: Save the updated DataFrame to 'activities_t.csv'
df.to_csv('../data/activities_t.csv', index=False)


# 2 Initial 0-Shot Classifier

In [2]:

# Ask the root logger for INFO-level output.
logging.basicConfig(level=logging.INFO)

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
# Each dictionary below maps a natural-language hypothesis (passed to the
# classifier as a candidate label) to the short category name that is stored
# in the results.

# First level: primary categories.
primary_category_dict = {
    "This activity is a cycling march or bicycle route, part of a broader effort to promote sustainable mobility and an active lifestyle.":
        "Cycling Event",
    "This activity is a walking tour, guided tour, or organized walk":
        "Walking Tour",
    "This activity is a road safety course, driving workshop or road safety conference. It aims to educate participants on traffic safety and improve their driving or cycling skills.":
        "Road Safety Workshop",
    "This activity is a Car-Free Day event that involves traffic closures or restrictions to promote the use of sustainable mobility options":
        "Car-Free Day",
    "This activity is a Parking Day event where road area or parking spaces are transformed or occupied to promote revitalization of urban space.":
        "Parking Day",
    "This activity is an exhibition, fair, or display related to mobility, such as showcasing new public transport fleets, alternative vehicles, or urban planning solutions.":
        "Exhibition",
    "This activity is an inauguration or public presentation of new infrastructure, such as new bike lanes, pedestrian zones, or public spaces dedicated to sustainable mobility.":
        "Infrastructure Inauguration",
    "This activity is a conference, workshop, talk, or public presentation focused on mobility or related topics, aiming to raise awareness, educate, or discuss mobility practices.":
        "Conference",
}

# Second level: target audience of a 'Road Safety Workshop'.
road_safety_subcategory_dict = {
    "This road safety workshop is targeted at children or schools, aiming to teach traffic safety and traffic awareness from an early age.":
        "Schools",
    "This road safety workshop is designed for cyclists or alternative mobility users to improve their safety awareness and skills on the road.":
        "Cyclists",
    "This road safety workshop is directed at automobile drivers to enhance their knowledge of traffic laws and safe driving techniques.":
        "Drivers",
}

# Second level: target audience of a 'Conference'.
conference_subcategory_dict = {
    "This conference, workshop, or talk is designed for the general public to discuss mobility issues, raise awareness, and encourage sustainable practices.":
        "General Audience",
    "This conference, workshop, or talk is specifically targeted at children or schools, focusing on education and promoting safe and sustainable mobility among younger audiences.":
        "Children/Schools",
    "This conference, workshop, or talk is aimed at professionals or policymakers in the mobility sector, discussing regulations, innovations, or infrastructure planning for sustainable mobility.":
        "Professionals/Policymakers",
}

# Classification stage: give every activity a primary category and, for the
# primary categories that have one, a subcategory. Results go to
# 'activities_tc.csv'.

# Primary categories that get a second classification pass, mapped to the
# hypothesis -> label dictionary used for that pass. Every other primary
# category has no subcategory.
subcategory_dicts_by_primary = {
    'Road Safety Workshop': road_safety_subcategory_dict,
    'Conference': conference_subcategory_dict,
}


def _classify_top(prompt, category_dict):
    """Classify `prompt` against the hypotheses of `category_dict`.

    Args:
        prompt: Text to classify.
        category_dict: Maps each candidate hypothesis to its short label.

    Returns:
        A tuple (raw_result, top_label, top_score) where `raw_result` is the
        classifier's output and the other two are taken from its first
        'labels' / 'scores' entry.
    """
    raw_result = classifier(prompt, candidate_labels=list(category_dict.keys()))
    return raw_result, category_dict[raw_result['labels'][0]], raw_result['scores'][0]


# Step 1: Read the activities_t.csv file (which contains the prompts built in the translation step)
df = pd.read_csv('../data/activities_t.csv')

# List to store classification results
results = []
# Index labels of rows that raised during classification; these rows are
# omitted from the output file and reported at the end.
failed_rows = []

# Step 2: Primary classification, followed by subclassification where applicable
for i, activity in tqdm(df.iterrows(), total=len(df), desc="Classifying activities"):
    try:
        # Use the 'prompt' column for classification
        prompt = activity['prompt']

        result, top_primary_category, top_primary_score = _classify_top(prompt, primary_category_dict)

        subcategory_dict = subcategory_dicts_by_primary.get(top_primary_category)
        if subcategory_dict is not None:
            subclass_result, top_subcategory, top_subcategory_score = _classify_top(prompt, subcategory_dict)
        else:
            # For all other categories, no subclassification is needed
            subclass_result = None
            top_subcategory = None
            top_subcategory_score = None

        # Append classification result to results list, including scores
        results.append({
            "ID": activity['ID'],
            "city": activity['city'],
            "activity_name": activity['activity_name'],
            "activity_description": activity['activity_description'],
            "primary_result": result,
            "primary_classification": top_primary_category,
            "primary_score": top_primary_score,
            "secondary_result": subclass_result,
            "secondary_classification": top_subcategory,
            "secondary_score": top_subcategory_score,
        })

    except Exception as e:
        # Best-effort: skip the row but remember it. Only the row label is
        # reported (not the whole row), and tqdm.write keeps the progress bar intact.
        failed_rows.append(i)
        tqdm.write(f"Error classifying row {i}: {e}")

# Step 3: Save the classification results to a new CSV file
res_df = pd.DataFrame(results)
res_df.to_csv('../data/activities_tc.csv', index=False)

print("Classification completed and saved to 'activities_tc.csv'.")
if failed_rows:
    # Make dropped rows visible instead of letting them vanish from the output.
    print(f"{len(failed_rows)} of {len(df)} activities could not be classified "
          f"and were omitted from the output; row indices: {failed_rows}")






Classifying activities: 100%|██████████| 1039/1039 [2:24:28<00:00,  8.34s/it] 


Classification completed and saved to 'activities_tc.csv'.
