# US Attraction EDA

In [54]:
import pandas as pd
import psycopg2
import psycopg2.extras as extras
import math
import openai 
from pgvector.psycopg2 import register_vector
import os

In [14]:
# Take the OpenAI key from the environment so no secret is stored in the notebook.
# The lookup yields None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [24]:
# Load the cleaned attractions data; the CSV's first column becomes the index.
df = pd.read_csv(
    "../data/cleaned_data_USA.csv",
    index_col=0,
)

In [25]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,name,main_category,rating,reviews,categories,address,city,country,state,zipcode,broader_category,Weighted_Score,Weighted_Average,All_Cities
0,Forsyth Park,Park,4.8,16538.0,"Park, Tourist attraction","Forsyth Park, Savannah, GA 31401",Savannah,USA,GA,,Nature,79382.4,4.67,"Atlanta, Augusta, Chattanooga, Savannah"
1,The Cathedral Basilica of St. John the Baptist,Catholic cathedral,4.8,5911.0,"Catholic cathedral, Catholic church, Tourist a...",The Cathedral Basilica of St. John the Baptist...,Savannah,USA,GA,,Religious,28372.8,4.8,"Atlanta, Augusta, Chattanooga, Savannah"
2,Fort Pulaski National Monument,Monument,4.8,5221.0,"Monument, Historical place, Historical landmar...","Fort Pulaski National Monument, 101 Fort Pulas...",Savannah,USA,GA,,Cultural,25060.8,4.53,"Atlanta, Augusta, Chattanooga, Savannah"
3,Fountain at Forsyth Park,Historical landmark,4.8,4234.0,"Historical landmark, Tourist attraction","Fountain at Forsyth Park, 1 W Gaston St, Savan...",Savannah,USA,GA,,Cultural,20323.2,4.53,"Atlanta, Augusta, Chattanooga, Savannah"
4,Wormsloe State Historic Site,Historical place museum,4.5,3615.0,"Historical place museum, Museum, Park, State park","Wormsloe State Historic Site, 7601 Skidaway Rd...",Savannah,USA,GA,,Cultural,16267.5,4.53,"Atlanta, Augusta, Chattanooga, Savannah"


In [55]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Every setting can be overridden through the standard libpq environment
# variables; the fallbacks are the values this notebook was written against,
# so nothing changes when the variables are unset.
conn = psycopg2.connect(
    dbname=os.getenv("PGDATABASE", "postgresdb"),
    user=os.getenv("PGUSER", "postgres"),
    password=os.getenv("PGPASSWORD", "postgres_password"),
    host=os.getenv("PGHOST", "host.docker.internal"),  # e.g., "localhost"
    port=os.getenv("PGPORT", "5433"),  # not the PostgreSQL default, which is 5432
)

# Sanity check: a successful round trip prints the server version.
cursor = conn.cursor()
cursor.execute("SELECT version();")
print(cursor.fetchone())


('PostgreSQL 16.10 (Debian 16.10-1.pgdg12+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14+deb12u1) 12.2.0, 64-bit',)


In [27]:
# Turn the CSV-derived index into an explicit "id" column.
# The guard keeps this cell re-runnable: a second reset_index call would raise
# because an "id" column already exists.
if "id" not in df.columns:
    df = df.reset_index(names=["id"])
df.head()

Unnamed: 0,id,name,main_category,rating,reviews,categories,address,city,country,state,zipcode,broader_category,Weighted_Score,Weighted_Average,All_Cities
0,0,Forsyth Park,Park,4.8,16538.0,"Park, Tourist attraction","Forsyth Park, Savannah, GA 31401",Savannah,USA,GA,,Nature,79382.4,4.67,"Atlanta, Augusta, Chattanooga, Savannah"
1,1,The Cathedral Basilica of St. John the Baptist,Catholic cathedral,4.8,5911.0,"Catholic cathedral, Catholic church, Tourist a...",The Cathedral Basilica of St. John the Baptist...,Savannah,USA,GA,,Religious,28372.8,4.8,"Atlanta, Augusta, Chattanooga, Savannah"
2,2,Fort Pulaski National Monument,Monument,4.8,5221.0,"Monument, Historical place, Historical landmar...","Fort Pulaski National Monument, 101 Fort Pulas...",Savannah,USA,GA,,Cultural,25060.8,4.53,"Atlanta, Augusta, Chattanooga, Savannah"
3,3,Fountain at Forsyth Park,Historical landmark,4.8,4234.0,"Historical landmark, Tourist attraction","Fountain at Forsyth Park, 1 W Gaston St, Savan...",Savannah,USA,GA,,Cultural,20323.2,4.53,"Atlanta, Augusta, Chattanooga, Savannah"
4,4,Wormsloe State Historic Site,Historical place museum,4.5,3615.0,"Historical place museum, Museum, Park, State park","Wormsloe State Historic Site, 7601 Skidaway Rd...",Savannah,USA,GA,,Cultural,16267.5,4.53,"Atlanta, Augusta, Chattanooga, Savannah"


In [28]:
# Inspect the column dtypes; the table definition further down is written to match them.
df.dtypes

id                    int64
name                 object
main_category        object
rating              float64
reviews             float64
categories           object
address              object
city                 object
country              object
state                object
zipcode             float64
broader_category     object
Weighted_Score      float64
Weighted_Average    float64
All_Cities           object
dtype: object

# Postgres Upload

In [29]:
# Create the target table with basic types that match the DataFrame columns,
# plus the pgvector column that the embedding step writes to.
# NOTE(review): zipcode is INTEGER, so leading zeros cannot be kept, and the
# VARCHAR(250) columns reject longer values - confirm both fit the data.
create_table_query = '''
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS us_attractions (
    id INTEGER PRIMARY KEY,
    name VARCHAR(250),
    main_category VARCHAR(250),
    rating REAL,
    reviews REAL,
    categories VARCHAR(250),
    address VARCHAR(250),
    city VARCHAR(250),
    country VARCHAR(250),
    state VARCHAR(250),
    zipcode INTEGER,
    broader_category VARCHAR(250),
    weighted_score REAL,
    weighted_average REAL,
    all_cities VARCHAR(250),
    embedding VECTOR(1536)
);

-- A table created before the embedding column was part of the definition
-- is left untouched by CREATE TABLE IF NOT EXISTS, so add the column here.
ALTER TABLE us_attractions ADD COLUMN IF NOT EXISTS embedding VECTOR(1536);
'''
cursor.execute(create_table_query)
conn.commit()

In [15]:
# Count the DataFrame's columns.
df.shape[1]

14

In [30]:
def clean_tuple_for_insert(tup):
    """Return ``tup`` as a tuple in which every float NaN is replaced by None.

    Non-float values (including strings such as "nan") pass through unchanged.
    """
    cleaned = []
    for value in tup:
        if isinstance(value, float) and math.isnan(value):
            cleaned.append(None)
        else:
            cleaned.append(value)
    return tuple(cleaned)


def load_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` via ``execute_values``.

    Column names are the lower-cased DataFrame column names, and float NaN
    cells are sent as SQL NULL. The insert is committed on success and rolled
    back on any error.

    Args:
        conn: Open psycopg2 connection.
        df: DataFrame whose (lower-cased) column names exist in the table.
        table: Target table name. It is interpolated into the SQL text
            unescaped, as are the column names, so pass trusted values only.

    Returns:
        None on success; 1 if the insert failed (the error is printed).
    """
    # itertuples yields native Python scalars column by column. Going through
    # df.to_numpy() would upcast mixed numeric columns to a single dtype and
    # can hand psycopg2 numpy integers, which it cannot adapt.
    tuples = [
        clean_tuple_for_insert(row)
        for row in df.itertuples(index=False, name=None)
    ]
    cols = ','.join(name.lower() for name in df.columns)
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # Runs on both paths, including when the rollback itself raises.
        cursor.close()

In [31]:
# Upload the DataFrame rows into the us_attractions table.
load_values(conn=conn, df=df, table='us_attractions')

In [56]:
## Create vector embeddings of existing records

EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # texts sent per API request

register_vector(conn)

with conn.cursor() as cur:
    # Fetch the text fields of every record.
    cur.execute("SELECT id, name, categories, address, country, broader_category FROM us_attractions")
    rows = cur.fetchall()

    # Only records without an embedding need an API call, which keeps re-runs
    # cheap. To re-embed everything (e.g. after changing the model), set the
    # embedding column to NULL first.
    cur.execute("SELECT id FROM us_attractions WHERE embedding IS NULL")
    pending_ids = {record[0] for record in cur.fetchall()}
    pending_rows = [row for row in rows if row[0] in pending_ids]

    for start in range(0, len(pending_rows), EMBEDDING_BATCH_SIZE):
        batch = pending_rows[start:start + EMBEDDING_BATCH_SIZE]
        # NOTE(review): NULL fields are rendered as the literal text "None"
        # here - confirm that is intended.
        texts = [','.join(map(str, row[1:])) for row in batch]

        # One request embeds the whole batch instead of one request per row.
        response = openai.embeddings.create(
            input=texts,
            model=EMBEDDING_MODEL
        )

        # Pair each embedding with its record through the `index` field the
        # API reports, rather than relying on response ordering.
        for item in response.data:
            attraction_id = batch[item.index][0]
            cur.execute(
                "UPDATE us_attractions SET embedding = %s WHERE id = %s ",
                (item.embedding, attraction_id)
            )

        # Commit per batch so a later failure does not discard finished work.
        conn.commit()

In [48]:
# Scratch check: embed a single record to inspect what the API returns.
row = rows[0]
attraction_id = row[0]  # named so the builtin id() is not shadowed in the kernel
text = ','.join(map(str, row[1:]))
# Call OpenAI embedding API (example)
response = openai.embeddings.create(
    input=[text],
    model="text-embedding-3-small"
)
embedding = response.data[0].embedding

In [50]:
# Length (dimensionality) of the embedding returned for the sample record.
len(embedding)

1536

In [57]:
# Release the database resources: the cursor first, then the connection.
for resource in (cursor, conn):
    resource.close()