In [1]:
import nltk


In [4]:
# NLTK data packages for the tokenizer, stop-word list and lemmatizer that
# are imported further down in this notebook.
nltk_resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab']
download_results = [nltk.download(resource) for resource in nltk_resources]
# Echo the status of the final download, as the cell's last expression.
download_results[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np
import string
import wikipedia
import time
import pickle
import json
import joblib
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import ipywidgets as widgets
from IPython.display import display, clear_output

In [7]:
# Input: CSV file read below for its 'City' column.
CSV_FILE_PATH = "Cities_data.csv"

# Outputs: the fetched article text is written once as a pickle and once as
# JSON.
OUTPUT_PICKLE_FILE = "wiki_city_content.pkl"
OUTPUT_JSON_FILE = "wiki_city_content.json"

In [8]:
# Query the English-language Wikipedia for every lookup made below.
wikipedia.set_lang('en')

# Read the city table and keep each distinct city name once, in order of
# first appearance.
df = pd.read_csv(CSV_FILE_PATH)
cities = df['City'].unique().tolist()

city_count = len(cities)
print(f"Found {city_count} unique city names in {CSV_FILE_PATH}.")

Found 50 unique city names in Cities_data.csv.


In [10]:
# Download the article text for every city.
#
# A missing or ambiguous title must not abort the whole crawl: such cities
# are recorded in `failed_cities` and skipped, so they are simply absent from
# `wiki_content`. Any later step that pairs cities with their text therefore
# has to work from the keys of `wiki_content`, not from the raw CSV list.
wiki_content = {}
failed_cities = []
for city in cities:
    try:
        # auto_suggest=False looks up the exact title; redirect=True follows
        # redirects to the canonical article.
        page = wikipedia.page(city, auto_suggest=False, redirect=True)
    except wikipedia.exceptions.DisambiguationError:
        failed_cities.append(city)
        print(f"Skipping {city}: the title resolves to a disambiguation page.")
        continue
    except wikipedia.exceptions.PageError:
        failed_cities.append(city)
        print(f"Skipping {city}: no Wikipedia page found for this title.")
        continue

    wiki_content[city] = page.content
    print(f"Successfully fetched content for {city}.")

if failed_cities:
    print(f"\nCould not fetch {len(failed_cities)} of {len(cities)} cities: {failed_cities}")

Successfully fetched content for Honolulu.
Successfully fetched content for San Francisco.
Successfully fetched content for Dubai.
Successfully fetched content for Los Angeles.
Successfully fetched content for Perth.
Successfully fetched content for Melbourne.
Successfully fetched content for Singapore.
Successfully fetched content for Sydney.
Successfully fetched content for Miami.
Successfully fetched content for Copenhagen.
Successfully fetched content for Lisbon.
Successfully fetched content for Dallas.
Successfully fetched content for Madrid.
Successfully fetched content for Luxembourg.
Successfully fetched content for Geneva.
Successfully fetched content for Frankfurt.
Successfully fetched content for Christchurch.
Successfully fetched content for Amsterdam.
Successfully fetched content for Munich.
Successfully fetched content for Barcelona.
Successfully fetched content for Wellington.
Successfully fetched content for Auckland.
Successfully fetched content for Helsinki.
Successfu

In [11]:
# Persist the fetched articles only when at least one city was downloaded.
if wiki_content:
    # Binary pickle copy of the {city: article text} mapping.
    with open(OUTPUT_PICKLE_FILE, mode='wb') as pickle_out:
        pickle.dump(wiki_content, pickle_out)
        print(f"\nSuccessfully saved fetched content to {OUTPUT_PICKLE_FILE}")

    # UTF-8 JSON copy of the same mapping; non-ASCII characters are written
    # as-is rather than escaped.
    with open(OUTPUT_JSON_FILE, mode='w', encoding='utf-8') as json_out:
        json.dump(wiki_content, json_out, ensure_ascii=False, indent=4)
        print(f"\nSuccessfully saved fetched content to {OUTPUT_JSON_FILE}")


Successfully saved fetched content to wiki_city_content.pkl

Successfully saved fetched content to wiki_city_content.json


In [5]:
# Input: pickle holding the raw article text per city.
INPUT_PICKLE_FILE = "wiki_city_content.pkl"

# Outputs: the cleaned text is written once as a pickle and once as JSON.
OUTPUT_PROCESSED_PICKLE_FILE = "wiki_preprocessed.pkl"
OUTPUT_PROCESSED_JSON_FILE = "wiki_preprocessed.json"

In [6]:
# English stop words, held in a set so the per-token membership tests in the
# cleaning code are cheap.
stop_words = set(stopwords.words('english'))

# One shared lemmatizer instance, reused for the articles and for user queries.
lemmatizer = WordNetLemmatizer()

In [7]:
# Reload the raw article text from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open(INPUT_PICKLE_FILE, mode='rb') as pickle_in:
    wiki_content = pickle.load(pickle_in)

In [8]:
# Normalise every article: lower-case, tokenise, keep only purely alphabetic
# tokens that are neither stop words nor punctuation, lemmatise what is left
# and join the result back into one space-separated string.
preprocessed_content = {}
for city, text in wiki_content.items():
    lemmatized_tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words and word not in string.punctuation
    ]
    preprocessed_content[city] = " ".join(lemmatized_tokens)
    print(f"  Processed '{city}' ({len(preprocessed_content[city])} chars)")

  Processed 'Honolulu' (23757 chars)
  Processed 'San Francisco' (64445 chars)
  Processed 'Dubai' (57019 chars)
  Processed 'Los Angeles' (44233 chars)
  Processed 'Perth' (39187 chars)
  Processed 'Melbourne' (47340 chars)
  Processed 'Singapore' (59396 chars)
  Processed 'Sydney' (64380 chars)
  Processed 'Miami' (40438 chars)
  Processed 'Copenhagen' (54130 chars)
  Processed 'Lisbon' (43196 chars)
  Processed 'Dallas' (58581 chars)
  Processed 'Madrid' (59589 chars)
  Processed 'Luxembourg' (35672 chars)
  Processed 'Geneva' (45095 chars)
  Processed 'Frankfurt' (79882 chars)
  Processed 'Christchurch' (45254 chars)
  Processed 'Amsterdam' (57163 chars)
  Processed 'Munich' (53404 chars)
  Processed 'Barcelona' (44856 chars)
  Processed 'Wellington' (41842 chars)
  Processed 'Auckland' (37688 chars)
  Processed 'Helsinki' (41299 chars)
  Processed 'Stockholm' (43206 chars)
  Processed 'Nice' (26793 chars)
  Processed 'Oslo' (38653 chars)
  Processed 'Rome' (54023 chars)
  Processe

In [21]:
# Binary pickle copy of the {city: cleaned text} mapping.
with open(OUTPUT_PROCESSED_PICKLE_FILE, mode='wb') as pickle_out:
    pickle.dump(preprocessed_content, pickle_out)
    print(f"\nSuccessfully saved preprocessed content to {OUTPUT_PROCESSED_PICKLE_FILE}")

# UTF-8 JSON copy of the same mapping; non-ASCII characters are written as-is
# rather than escaped.
with open(OUTPUT_PROCESSED_JSON_FILE, mode='w', encoding='utf-8') as json_out:
    json.dump(preprocessed_content, json_out, ensure_ascii=False, indent=4)
    print(f"Successfully saved preprocessed content to {OUTPUT_PROCESSED_JSON_FILE}")


Successfully saved preprocessed content to wiki_preprocessed.pkl
Successfully saved preprocessed content to wiki_preprocessed.json


In [9]:
# Input: pickle holding the cleaned text per city.
INPUT_PROCESSED_PICKLE_FILE = "wiki_preprocessed.pkl"

# Outputs: fitted vectorizer, sparse TF-IDF matrix, and the list of city names
# saved alongside them.
OUTPUT_VECTORIZER_FILE = "tfidf_vectorizer.joblib"
OUTPUT_MATRIX_FILE = "tfidf_matrix.npz"
OUTPUT_CITY_ORDER_FILE = "city_order.pkl"

In [10]:
# Reload the cleaned article text from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open(INPUT_PROCESSED_PICKLE_FILE, mode='rb') as pickle_in:
    preprocessed_content = pickle.load(pickle_in)

In [11]:
# Build the row labels and the documents from the SAME filtered list so they
# can never drift apart. Row i of the TF-IDF matrix fitted on `texts_in_order`
# must describe `city_order[i]`; if a city from the CSV has no preprocessed
# text, keeping it in `city_order` while skipping its text would shift every
# following label by one and mislabel the similarity results.
city_order = [
    city
    for city in df['City'].unique().tolist()
    if city in preprocessed_content
]
texts_in_order = [preprocessed_content[city] for city in city_order]

# Cheap invariant check on the pairing of labels and documents.
assert len(city_order) == len(texts_in_order)

In [12]:
# TF-IDF over unigrams, bigrams and trigrams. Terms must occur in at least
# three documents to be kept (min_df=3); max_df=1.0 applies no upper cutoff.
vectorizer = TfidfVectorizer(
    max_df=1.0,
    min_df=3,
    ngram_range=(1, 3),
)
# One matrix row per entry of `texts_in_order`.
tfidf_matrix = vectorizer.fit_transform(texts_in_order)

print(
    "TF-IDF matrix created successfully.",
    f"Shape of TF-IDF matrix: {tfidf_matrix.shape}",
    sep="\n",
)

TF-IDF matrix created successfully.
Shape of TF-IDF matrix: (50, 17846)


In [27]:
# Fitted vectorizer (vocabulary and IDF weights).
joblib.dump(vectorizer, OUTPUT_VECTORIZER_FILE)
print(f"Vectorizer saved to {OUTPUT_VECTORIZER_FILE}")

# Sparse document-term matrix.
save_npz(OUTPUT_MATRIX_FILE, tfidf_matrix)
print(f"TF-IDF matrix saved to {OUTPUT_MATRIX_FILE}")

# City names, stored in the same sequence as `city_order`.
with open(OUTPUT_CITY_ORDER_FILE, mode='wb') as order_out:
    pickle.dump(city_order, order_out)
print(f"City order saved to {OUTPUT_CITY_ORDER_FILE}")

Vectorizer saved to tfidf_vectorizer.joblib
TF-IDF matrix saved to tfidf_matrix.npz
City order saved to city_order.pkl


In [13]:
# File names of the saved vectorizer, TF-IDF matrix and city order.
VECTORIZER_FILE = "tfidf_vectorizer.joblib"
MATRIX_FILE = "tfidf_matrix.npz"
CITY_ORDER_FILE = "city_order.pkl"

In [14]:
def find_similar_cities(user_query):
    """Rank the known cities by TF-IDF cosine similarity to a keyword query.

    The query is lower-cased and tokenised, non-alphabetic tokens and English
    stop words are dropped, and the remaining tokens are lemmatised. The
    result is projected with the fitted ``vectorizer`` and compared against
    every row of ``tfidf_matrix``.

    Relies on these notebook-level names being defined: ``stop_words``,
    ``lemmatizer``, ``vectorizer``, ``tfidf_matrix`` and ``city_order``.

    Args:
        user_query: Free-text keywords describing the desired city.

    Returns:
        A list of ``(city, score)`` tuples sorted by descending similarity,
        or ``None`` (after printing an explanation) when the query is empty
        or whitespace-only, contains no significant words, or shares no term
        with the fitted vocabulary.

    Raises:
        ValueError: If ``city_order`` and ``tfidf_matrix`` disagree on the
            number of cities, since scores could then be attached to the
            wrong city names.
    """
    # Whitespace-only input carries no keywords either, so reject it up front
    # instead of running the whole pipeline on it.
    if not user_query or not user_query.strip():
        print("Error: No keywords entered.")
        return None

    # isalpha() already rejects punctuation tokens, so no separate
    # punctuation test is needed.
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(user_query.lower())
        if token.isalpha() and token not in stop_words
    ]
    processed_query = " ".join(lemmatized_tokens)

    if not processed_query:
        print("Query does not have any significant words.")
        return None

    print(f"Processed query: '{processed_query}'")

    query_vector = vectorizer.transform([processed_query])
    print(f"\nQuery transformed to vector shape: {query_vector.shape}")

    # No stored entries means no query term is in the fitted vocabulary; every
    # similarity would be 0 and the "ranking" would just be the city order.
    if query_vector.nnz == 0:
        print("None of the query words appear in the city vocabulary.")
        return None

    # zip() below would silently truncate on a length mismatch and pair scores
    # with the wrong cities, so fail loudly instead.
    if len(city_order) != tfidf_matrix.shape[0]:
        raise ValueError(
            f"city_order has {len(city_order)} entries but tfidf_matrix has "
            f"{tfidf_matrix.shape[0]} rows; rebuild them from the same city list."
        )

    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    city_scores = list(zip(city_order, cosine_similarities))
    ranked_cities = sorted(city_scores, key=lambda item: item[1], reverse=True)

    print("\nRanking complete.")
    return ranked_cities

In [15]:
# Ask for keywords, rank the cities against them and list the ten best.
user_keywords = input("Enter keywords describing your ideal city (e.g., beaches nightlife history tech): ")
results = find_similar_cities(user_keywords)
if results:
    print("\n--- Top Matching Cities ---")
    top_matches = results[:10]
    for rank, (city, score) in enumerate(top_matches, start=1):
        print(f"{rank}. {city} (Similarity Score: {score:.4f})")

Enter keywords describing your ideal city (e.g., beaches nightlife history tech):  


Error: No keywords entered.
