In [97]:
# Run helper.ipynb first

import os
import pickle
import re
import string
import time

import ipywidgets as widgets
import joblib
import numpy as np
import pandas as pd
from IPython.display import display, clear_output
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.sparse import load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [76]:
# Paths of the files this notebook reads, relative to the working directory.
VECTORIZER_FILE = "tfidf_vectorizer.joblib"  # loaded with joblib
MATRIX_FILE = "tfidf_matrix.npz"             # loaded with scipy.sparse.load_npz
CITY_ORDER_FILE = "city_order.pkl"           # loaded with pickle
CSV_FILE_PATH = "Cities_data.csv"            # loaded with pandas.read_csv

In [77]:
# Load the fitted vectorizer, the TF-IDF matrix and the list of city names.
# These files are presumably written by helper.ipynb (see the note at the top
# of the notebook) -- TODO confirm.
# NOTE(security): joblib.load and pickle.load can execute arbitrary code while
# deserialising; only open files that come from a trusted source.
vectorizer = joblib.load(VECTORIZER_FILE)
tfidf_matrix = load_npz(MATRIX_FILE)
with open(CITY_ORDER_FILE, 'rb') as f:
    city_order = pickle.load(f)

# The scoring code pairs one similarity score per matrix row with one entry of
# city_order, so the two must have the same length. Fail early with a clear
# message instead of a shape error (or mislabelled results) further down.
if tfidf_matrix.shape[0] != len(city_order):
    raise ValueError(
        f"TF-IDF matrix has {tfidf_matrix.shape[0]} rows but city_order has "
        f"{len(city_order)} entries; regenerate both files together."
    )

print("Successfully loaded vectorizer, TF-IDF matrix, and city order.")
print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}, Cities loaded: {len(city_order)}")

Successfully loaded vectorizer, TF-IDF matrix, and city order.
TF-IDF Matrix shape: (50, 17846), Cities loaded: 50


In [78]:
# Read the per-city table; the 'City' column holds the city names.
df_cities = pd.read_csv(CSV_FILE_PATH)
print(f"Loaded CSV data. Shape: {df_cities.shape}")

# Index the table by city name so rows can be looked up with .loc[city, column].
df_cities = df_cities.set_index('City')

# Report cities that the text model knows about but the CSV does not contain.
# After reindexing they become all-NaN rows rather than lookup errors.
missing_cities = [city for city in city_order if city not in df_cities.index]
if missing_cities:
    print(f"Warning: {len(missing_cities)} cities in city_order are missing from the CSV: {missing_cities}")

# Actually perform the alignment announced by the message below: keep exactly
# the cities in city_order, in that order (CSV rows for other cities are dropped).
df_cities = df_cities.reindex(city_order)
print(f"Aligned CSV data with city order. New shape: {df_cities.shape}")

Loaded CSV data. Shape: (50, 23)
Aligned CSV data with city order. New shape: (50, 22)


In [79]:
# CSV columns that get a weight slider and take part in the numeric scoring.
# The order here is the order in which the sliders are created.
numerical_cols_for_sliders = [
    # Size and economy
    "Population (Urban) in Mil",
    "GDP pc (in thousand USD)",
    # Climate
    "Average Highs in Peak Summer",
    "Average Lows in Peak Winter",
    "Average Yearly Rainfall (in m)",
    "Sunshine (annual hours)",
    # Living conditions
    "Unemployment Rate",
    "Cost of Living (in thousand USD)",
    "HDI",
    "Life Expectancy",
    "PM2.5",
    "CPI",
    "Crime Index",
    "Metro Length (km)",
]

In [80]:
# Rescale every slider column to the 0-1 range so that columns with different
# units can be combined into a single score. Non-slider columns are left as-is.
scaler = MinMaxScaler()
scaler.fit(df_cities[numerical_cols_for_sliders])

df_normalized = df_cities.copy()
df_normalized[numerical_cols_for_sliders] = scaler.transform(
    df_cities[numerical_cols_for_sliders]
)

print("Normalized numerical columns (0-1 range).")
display(df_normalized[numerical_cols_for_sliders].head())

Normalized numerical columns (0-1 range).


Unnamed: 0_level_0,Population (Urban) in Mil,GDP pc (in thousand USD),Average Highs in Peak Summer,Average Lows in Peak Winter,Average Yearly Rainfall (in m),Sunshine (annual hours),Unemployment Rate,Cost of Living (in thousand USD),HDI,Life Expectancy,PM2.5,CPI,Crime Index,Metro Length (km)
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Honolulu,0.04439,0.462087,0.548837,0.83871,0.159624,0.766812,0.088496,0.566186,0.819231,0.574803,0.009091,0.603774,0.683652,0.012515
San Francisco,0.208385,1.0,0.065116,0.477419,0.234742,0.778166,0.212389,0.740498,0.773077,0.574803,0.027273,0.603774,0.961783,0.152125
Dubai,0.225647,0.283614,1.0,0.677419,0.0,1.0,0.132743,0.445609,0.780769,0.858268,0.409091,0.584906,0.0,0.064447
Los Angeles,0.745993,0.576576,0.423256,0.506452,0.131455,0.862009,0.309735,0.585845,0.773077,0.574803,0.064646,0.603774,0.798301,0.125872
Perth,0.134402,0.281783,0.548837,0.474194,0.305164,0.848035,0.20354,0.369594,0.903846,0.937008,0.026263,0.716981,0.543524,0.0


In [81]:
# Maps a phrase the user may type to (CSV column, direction).
# A positive direction means "prefer high values of that column", a negative
# one means "prefer low values"; the magnitude is added to the column's weight.
# Every column name must match an entry of numerical_cols_for_sliders exactly,
# otherwise the phrase has no effect on the ranking.
keyword_csv_mapping = {
    # Climate
    'low rainfall': ('Average Yearly Rainfall (in m)', -1.0),
    'high rainfall': ('Average Yearly Rainfall (in m)', 1.0),
    'dry': ('Average Yearly Rainfall (in m)', -0.8),
    'wet': ('Average Yearly Rainfall (in m)', 0.8),
    'sunny': ('Sunshine (annual hours)', 1.0),
    'lots of sun': ('Sunshine (annual hours)', 1.0),
    'cloudy': ('Sunshine (annual hours)', -1.0),
    'less sun': ('Sunshine (annual hours)', -1.0),
    'warm winter': ('Average Lows in Peak Winter', 1.0),
    'mild winter': ('Average Lows in Peak Winter', 1.0),
    'cold winter': ('Average Lows in Peak Winter', -1.0),
    'snowy winter': ('Average Lows in Peak Winter', -1.0),
    'hot summer': ('Average Highs in Peak Summer', 1.0),
    'warm summer': ('Average Highs in Peak Summer', 0.8),
    'mild summer': ('Average Highs in Peak Summer', -1.0),
    'cool summer': ('Average Highs in Peak Summer', -1.0),

    # Economy / Living
    'affordable': ('Cost of Living (in thousand USD)', -1.0),
    'cheap': ('Cost of Living (in thousand USD)', -1.0),
    'low cost living': ('Cost of Living (in thousand USD)', -1.0),
    'expensive': ('Cost of Living (in thousand USD)', 1.0),
    'high cost living': ('Cost of Living (in thousand USD)', 1.0),
    'high gdp': ('GDP pc (in thousand USD)', 1.0),
    # Fixed: the column name was corrupted, so this phrase was silently ignored.
    'strong economy': ('GDP pc (in thousand USD)', 1.0),
    'rich': ('GDP pc (in thousand USD)', 1.0),
    'low unemployment': ('Unemployment Rate', -1.0),
    'jobs': ('Unemployment Rate', -0.7),
    'high unemployment': ('Unemployment Rate', 1.0),

    # Safety / Environment
    'safe': ('Crime Index', -1.0),
    'low crime': ('Crime Index', -1.0),
    'high crime': ('Crime Index', 1.0),
    'dangerous': ('Crime Index', 1.0),
    'clean air': ('PM2.5', -1.0),
    'low pollution': ('PM2.5', -1.0),
    'polluted': ('PM2.5', 1.0),
    'smog': ('PM2.5', 1.0),

    # Infrastructure / Size
    'good metro': ('Metro Length (km)', 1.0),
    'subway': ('Metro Length (km)', 1.0),
    'public transport': ('Metro Length (km)', 0.8),
    'big city': ('Population (Urban) in Mil', 1.0),
    'large population': ('Population (Urban) in Mil', 1.0),
    'small city': ('Population (Urban) in Mil', -1.0),
    'low population': ('Population (Urban) in Mil', -1.0),
}

print(f"Defined keyword mapping for {len(keyword_csv_mapping)} phrases.")

Defined keyword mapping for 42 phrases.


In [82]:
# Shared NLP resources used by preprocess_text.
# English stop words are kept in a set for fast membership tests.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [83]:
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic, that are English stop words, or that are punctuation are
    dropped, and every remaining token is lemmatised.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    kept_tokens = (
        tok
        for tok in word_tokenize(text.lower())
        if tok.isalpha() and tok not in stop_words and tok not in string.punctuation
    )
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [84]:
# Free-text box in which the user types the keywords to search for.
keyword_input = widgets.Text(
    description="Keywords:",
    placeholder="Enter keywords (e.g., beaches history safe affordable)...",
    value="",
    layout=widgets.Layout(width="95%"),
    # 'initial' lets the label take its natural width instead of being cut off.
    style={"description_width": "initial"},
)

In [93]:
# (text to find, shorter replacement) pairs used to abbreviate column names
# into slider labels. They are applied in this order.
label_abbreviations = [
    ("(in thousand USD)", "k$"),
    ("(in m)", "m"),
    ("(annual hours)", "hrs"),
    ("(Urban) in Mil", "M pop"),
    ("Average ", "Avg "),
    ("Peak ", ""),
    ("Yearly ", "Yrly "),
]

# One weight slider per numeric column: -1 prefers low values, +1 prefers high
# values, 0 ignores the column. `sliders` maps column name -> widget, and
# `slider_widgets` holds the same widgets in creation order for the layout.
sliders = {}
slider_widgets = []
for col in numerical_cols_for_sliders:
    desc = col
    for long_text, short_text in label_abbreviations:
        desc = desc.replace(long_text, short_text)

    weight_slider = widgets.FloatSlider(
        description=f"{desc}:",
        value=0.0,
        min=-1.0,
        max=1.0,
        step=0.1,
        readout=True,
        readout_format=".1f",
        orientation="horizontal",
        disabled=False,
        # Only report the value once the user releases the handle.
        continuous_update=False,
        layout=widgets.Layout(width="95%", height="auto"),
        style={"description_width": "initial"},
    )
    sliders[col] = weight_slider
    slider_widgets.append(weight_slider)

In [94]:
# Area that receives everything printed while a ranking is being computed.
output_area = widgets.Output()

# Button that starts the ranking; its click handler is attached in a later cell.
calculate_button = widgets.Button(
    description="Find Matching Cities",
    icon="search",
    button_style="info",
    tooltip="Click to calculate city rankings based on keywords and slider weights",
)

In [95]:
# Relative contribution of the two score components to the final score.
TEXT_SIMILARITY_WEIGHT = 0.4
CSV_CRITERIA_WEIGHT = 0.6

# Number of ranked cities printed after each calculation.
NUM_RESULTS_TO_SHOW = 15
# Combined column weights whose magnitude is at or below this are treated as neutral.
MIN_ACTIVE_WEIGHT = 0.01


def _rescale_to_unit_range(scores):
    """Min-max scale a 1-D score array to [0, 1]; all zeros if every score is equal."""
    if np.ptp(scores) > 0:
        return MinMaxScaler().fit_transform(scores.reshape(-1, 1)).flatten()
    return np.zeros(len(city_order))


def _compute_text_scores(keywords_raw):
    """Return normalised TF-IDF cosine similarities between the query and each city.

    Returns an all-zero array when there is no usable query or when
    vectorisation/similarity fails (the error is printed, not raised).
    """
    text_scores_normalized = np.zeros(len(city_order))
    if not keywords_raw:
        print("No keywords entered, using only slider criteria.")
        return text_scores_normalized

    processed_query = preprocess_text(keywords_raw)
    if not processed_query:
        print("Keywords yielded no processable terms.")
        return text_scores_normalized

    try:
        query_vector = vectorizer.transform([processed_query])
        text_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
        text_scores_normalized = _rescale_to_unit_range(text_scores)
        print(f"Processed Keywords: '{processed_query}' -> Text scores calculated.")
    except Exception as e:
        # Best effort: a failure here must not prevent the slider-based ranking.
        print(f"Error during text vectorization/similarity: {e}")
    return text_scores_normalized


def _implied_weights_from_keywords(keywords_raw):
    """Translate phrases found in the query into per-column weight adjustments.

    Returns:
        (implied_weights, matched_details): a dict column -> summed direction,
        and a list of human-readable strings describing each match.
    """
    implied_weights = {col: 0.0 for col in numerical_cols_for_sliders}
    matched_details = []
    if not keywords_raw:
        return implied_weights, matched_details

    # Collapse runs of whitespace so multi-word phrases match "low   crime" too.
    query = " ".join(keywords_raw.lower().split())
    for phrase, (col, direction) in keyword_csv_mapping.items():
        # Require the phrase to begin at a word boundary: a plain substring test
        # also fired for "unsafe" (safe), "zurich" (rich) or "laundry" (dry).
        # The end is left open so inflected forms such as "subways" still match.
        if not re.search(r'\b' + re.escape(phrase), query):
            continue
        if col not in implied_weights:
            # Surface mapping mistakes instead of silently dropping the phrase.
            print(f"Warning: keyword '{phrase}' maps to unknown column '{col}'; ignored.")
            continue
        implied_weights[col] += direction
        matched_details.append(f"'{phrase}' -> {direction:+.1f} for {col}")
    return implied_weights, matched_details


def _compute_csv_scores(slider_weights, implied_weights):
    """Score every city in city_order against the weighted numeric criteria.

    For each column the slider weight and the keyword-implied weight are added
    and clipped to [-1, 1]. A positive weight rewards high normalised values,
    a negative weight rewards low ones. Returns the raw (unscaled) scores.
    """
    # The combined weight depends only on the column, so compute it once
    # instead of once per city.
    active_weights = {}
    for col in numerical_cols_for_sliders:
        total_weight = slider_weights.get(col, 0.0) + implied_weights.get(col, 0.0)
        total_weight = np.clip(total_weight, -1.0, 1.0)
        if abs(total_weight) > MIN_ACTIVE_WEIGHT:
            active_weights[col] = total_weight

    csv_scores = np.zeros(len(city_order))
    for i, city in enumerate(city_order):
        city_csv_score = 0
        for col, total_weight in active_weights.items():
            try:
                normalized_value = df_normalized.loc[city, col]

                if pd.isna(normalized_value):
                    print(f"Warning: NaN value found for {city}, {col} after normalization/imputation.")
                    continue

                if total_weight > 0:
                    city_csv_score += total_weight * normalized_value
                else:
                    # Preferring low values: invert the 0-1 normalised value.
                    city_csv_score += abs(total_weight) * (1.0 - normalized_value)
            except KeyError:
                print(f"Warning: Could not find city '{city}' or column '{col}' during CSV scoring.")
            except Exception as e:
                print(f"Error calculating CSV score for {city}, {col}: {e}")
        csv_scores[i] = city_csv_score
    return csv_scores


def _print_ranking(ranked_cities):
    """Print the best-scoring cities with their cost of living and sunshine hours."""
    print("\n--- Top Matching Cities ---")
    if not ranked_cities:
        print("No results found.")
    else:
        for i, (city, score) in enumerate(ranked_cities[:NUM_RESULTS_TO_SHOW]):
            line = f"{i+1}. {city:<15} (Score: {score:.4f})"
            try:
                # Fixed: this used to read `df_cities_raw`, which is never defined
                # in this notebook; the un-normalised values live in `df_cities`.
                cost = df_cities.loc[city, 'Cost of Living (in thousand USD)']
                sun = df_cities.loc[city, 'Sunshine (annual hours)']
                line = f"{line} [Cost: {cost:.2f}k$, Sun: {sun:.0f}hrs]"
            except KeyError:
                # City or column missing from the CSV: show the score only.
                pass
            except Exception as e:
                print(f"Error displaying extra info for {city}: {e}")
            print(line)

        if len(ranked_cities) > NUM_RESULTS_TO_SHOW:
            print("...")
    print("-" * 30)


def on_calculate_button_clicked(b):
    """Click handler: rank the cities from the keywords and slider weights.

    The final score of a city is
    TEXT_SIMILARITY_WEIGHT * text score + CSV_CRITERIA_WEIGHT * criteria score,
    where both components are min-max scaled to [0, 1] across cities.
    All messages and the ranking are written to `output_area`.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with output_area:
        clear_output(wait=True)
        print("Calculating...")
        time.sleep(0.5)

        keywords_raw = keyword_input.value
        slider_weights = {col: slider.value for col, slider in sliders.items()}

        text_scores_normalized = _compute_text_scores(keywords_raw)

        implied_weights, matched_kws_details = _implied_weights_from_keywords(keywords_raw)
        if matched_kws_details:
            print("Implied CSV weights from keywords:")
            for detail in matched_kws_details:
                print(f"  - {detail}")

        csv_scores = _compute_csv_scores(slider_weights, implied_weights)
        csv_scores_normalized = _rescale_to_unit_range(csv_scores)
        print("CSV criteria scores calculated.")

        final_scores = (TEXT_SIMILARITY_WEIGHT * text_scores_normalized) + \
                       (CSV_CRITERIA_WEIGHT * csv_scores_normalized)
        ranked_cities = sorted(
            zip(city_order, final_scores), key=lambda item: item[1], reverse=True
        )

        _print_ranking(ranked_cities)


calculate_button.on_click(on_calculate_button_clicked)

In [96]:
# Stack the whole interface vertically: title, keyword box, slider caption,
# the sliders themselves, the button, and finally the results area.
ui = widgets.VBox(
    children=[
        widgets.HTML("<h2>City Matchmaker</h2>"),
        keyword_input,
        widgets.HTML(
            "<h4>Adjust Weights for Criteria (-1: Prefer Low, 0: Neutral, 1: Prefer High):</h4>"
        ),
        widgets.VBox(children=slider_widgets),
        calculate_button,
        output_area,
    ]
)

display(ui)

VBox(children=(HTML(value='<h2>City Matchmaker</h2>'), Text(value='', description='Keywords:', layout=Layout(w…