In [43]:
import pandas as pd
# City table; the 'merged_description' and 'code' columns are read from it further down.
cities = pd.read_csv(filepath_or_buffer='static/data/merged_description_city_coded.csv')


In [44]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine


In [45]:


# Tokenizer and encoder must come from the same pretrained checkpoint,
# so the checkpoint name is spelled once and shared by both loaders.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to generate embeddings
def get_embedding(text):
    """Embed text with BERT by mean-pooling the last hidden layer.

    Parameters
    ----------
    text : str or list of str
        Text to embed. Inputs longer than 512 tokens are truncated.

    Returns
    -------
    numpy.ndarray
        Pooled embedding. A single string yields a 1-D vector of the model's
        hidden size; a list of several strings yields one row per string.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. With padding=True a batch is padded to its
    # longest member, and a plain mean over the sequence axis would let the
    # padding positions dilute the embeddings of the shorter texts. For a single
    # string the mask is all ones, so this equals the ordinary mean.
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = ((hidden * mask).sum(dim=1) / token_counts).squeeze().numpy()
    return embeddings

# Free-text description of every city, in the row order of `cities`
city_descriptions = cities['merged_description']

# Embed the descriptions one at a time, preserving that order
city_embeddings = list(map(get_embedding, city_descriptions))



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
# Stack the per-city vectors into a table: one row per city, one column per embedding dimension.
city_embeddings_df = pd.DataFrame(data=city_embeddings)

In [47]:
# Attach each city's code to its embedding row (values are matched on the row index).
city_embeddings_df = city_embeddings_df.assign(code=cities['code'])

In [51]:
# Make the city code the row index. Guarded so the cell can be re-run: once
# 'code' has been moved into the index, a second set_index('code') would raise
# KeyError because the column no longer exists.
if city_embeddings_df.index.name != 'code':
    city_embeddings_df = city_embeddings_df.set_index('code')

In [52]:
# Persist the embeddings; the frame's index is written as the first CSV column.
city_embeddings_df.to_csv(path_or_buf='static/data/city_embeddings.csv')

In [53]:
# Reload the saved embeddings, using the first CSV column as the row index.
# Note: this rebinds `city_embeddings` from the list of vectors to a DataFrame.
city_embeddings = pd.read_csv(
    filepath_or_buffer='static/data/city_embeddings.csv',
    index_col=0,
)


In [31]:
import numpy as np
# Search phrase to rank cities against, embedded the same way as the descriptions
keyword = "snow ski"
keyword_embedding = get_embedding(text=keyword)



In [33]:
# Score every city by cosine similarity between its embedding and the keyword's.
# Only the embedding columns are scored, so re-running this cell after the
# 'similarities' column exists does not feed the previous scores into cosine()
# (which would fail on the resulting length mismatch).
embedding_columns = city_embeddings.columns.drop('similarities', errors='ignore')
city_embeddings['similarities'] = city_embeddings[embedding_columns].apply(
    lambda row: 1 - cosine(keyword_embedding, row),
    axis=1,
)


In [42]:
# Index labels of the 25 highest-scoring cities, best match first
(
    city_embeddings
    .sort_values(by='similarities', ascending=False)
    .head(25)
    .index
    .to_list()
)

[676,
 2462,
 44454,
 286,
 44455,
 5253,
 187,
 463,
 219,
 3545,
 253,
 2064,
 722,
 308,
 223,
 402,
 12530,
 104,
 2407,
 1040,
 460,
 303,
 1721,
 483,
 516]

In [21]:
# Display the embeddings table.
# NOTE(review): another cell adds a 'similarities' column to this same frame in
# place, so what is shown here depends on the order the cells were executed in.
city_embeddings

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,758,759,760,761,762,763,764,765,766,767
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1104,0.175322,0.437558,0.140124,0.461252,-0.234324,0.032008,0.838617,-0.069208,-0.110414,-0.000841,...,-0.449304,0.003853,-0.025804,-0.234508,-0.130609,-0.303932,0.04001,-0.063186,0.176697,-0.040141
2064,0.055229,0.487108,0.048498,0.368506,-0.272875,0.071283,0.627371,-0.097154,-0.035154,-0.116663,...,-0.234182,0.0491,0.016712,-0.080577,0.043411,-0.169311,0.03134,0.20793,0.274243,0.096757
134,0.040242,0.221386,-0.023149,0.296203,-0.203646,0.046839,0.785888,-0.076233,-0.106592,0.095847,...,-0.270293,-0.040958,-0.047416,-0.080162,0.015577,-0.263088,-0.052037,0.014709,0.146094,-0.122073
483,0.036532,0.367573,0.118084,0.324273,-0.219936,0.032383,0.738275,0.039142,-0.077213,-0.015166,...,-0.353147,0.000537,-0.02402,-0.188291,-0.13892,-0.12091,0.078323,0.127545,0.21479,-0.183081
379,0.062523,0.283632,0.039939,0.329195,-0.213253,-0.052423,0.754945,-0.069369,-0.037502,0.040162,...,-0.297609,-0.099142,-0.023447,-0.094637,-0.115365,-0.352239,0.046675,0.068279,0.173714,0.018277
180,0.011043,0.220019,-0.027051,0.300562,-0.215455,0.053548,0.739994,-0.100688,-0.049847,0.065658,...,-0.18714,0.034781,0.069798,-0.201953,0.072505,-0.378911,-0.040745,0.137571,0.094542,0.12147
4525,0.085926,0.251231,-0.015099,0.637847,-0.290382,0.022901,0.677157,-0.205869,-0.012656,-0.002982,...,-0.35872,-0.02224,-0.174385,-0.09922,-0.085109,-0.409745,0.150262,0.080681,0.056565,-0.150239
1214,-0.011152,0.207852,0.042684,0.345841,-0.177395,-0.020999,0.727459,-0.138272,0.038645,0.113587,...,-0.357348,-0.081264,-0.077566,-0.210174,-0.092284,-0.405696,0.007309,-0.009146,0.114918,0.039606
516,-0.060626,0.31913,0.06324,0.226288,-0.239687,-0.020211,0.687354,-0.050603,-0.05981,0.155143,...,-0.299566,-0.073818,-0.05105,-0.112447,-0.055977,-0.236143,-0.088891,0.175321,0.26584,-0.118655
55,0.10254,0.346064,0.024088,0.34459,-0.18803,-0.104371,0.703368,-0.052482,-0.182726,0.046743,...,-0.348552,0.010344,0.01969,-0.152819,-0.03802,-0.127079,-0.063093,0.086893,0.196722,0.009434


In [7]:
# NOTE(review): rebinds `cities` to a different CSV than the one read in the
# first cell ('static/data/merged_description_city_coded.csv'). On a
# top-to-bottom run this replaces that frame — confirm which source is intended.
cities = pd.read_csv('city_description.csv')

In [6]:
import pandas as pd

In [12]:
# NOTE(review): no cell visible in this notebook assigns a variable named
# `similarities` (only a DataFrame column of that name is created), so this
# presumably relies on leftover kernel state — confirm or remove.
similarities[24]

0.5853464007377625

In [16]:
# NOTE(review): `similarities` is not assigned as a variable in any cell visible
# in this notebook; this lookup presumably depends on leftover kernel state —
# confirm or remove.
similarities[5]

0.4178243577480316

In [55]:
# Display the city descriptions.
# NOTE(review): this echoes every description in full; consider showing a small slice instead.
city_descriptions

["If you're looking for a getaway that's fun for the whole family and easy on your pocket, look no further than Raleigh. With more than 20 free attractions, this stately and energetic capital city offers an affordable way to enjoy history, culture and the good old outdoors. Spend the day hopping between the three state museums of art, history and natural sciences: all offer free admission. Visitors are free to wander around the grounds of the State Capitol, where they can also observe the legislative process. Stroll past the graves of Civil War generals at a historic cemetery or meander along cobblestone streets of City Market. Shop for the greenest broccoli, reddest apples and tastiest homemade treats around at the State Farmers Market. With over 150 parks, lakes and green ways in Raleigh alone, there is ample opportunity to enjoy golf, biking, swimming, boating and other outdoor adventures.",
 'Providence seamlessly blends the old and the new with historic attractions, museums and th