In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
###########
#FUNCTIONS#
###########

def tokenize_clean(text):
    """Tokenize ``text`` with NLTK and keep only purely alphabetic tokens.

    Any token for which ``str.isalpha()`` is false (numbers, punctuation,
    tokens containing digits or apostrophes) is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the alphabetic tokens, in their original order.
    """
    alphabetic_tokens = []
    for token in word_tokenize(text):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [3]:
##############
#READ IN DATA#
##############

# Load the posts and comments CSV exports for each city as separate
# DataFrames (posts first, then comments).
pdx_posts_df, pdx_comments_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Portland_posts.csv', 'Portland_comments.csv')
)

sd_posts_df, sd_comments_df = (
    pd.read_csv(csv_name)
    for csv_name in ('SanDiego_posts.csv', 'SanDiego_comments.csv')
)

In [12]:
# Portland Communities

# Place names for the Portland metro area. The three groups below are
# concatenated in order into one flat list of strings.
portland_metro = (
    # Selected Portland neighborhoods
    [
        "Arbor Lodge", "Overlook", "St. Johns", "Kenton", "Beaumont-Wilshire",
        "Hollywood", "Rose City Park", "Parkrose", "Hazelwood", "Woodland Park",
        "Northwest District", "Pearl", "Nob Hill", "Slabtown", "Hawthorne",
        "Brentwood-Darlington", "Sellwood-Moreland", "Mt. Tabor", "Lents",
        "Southwest Hills", "Marshall Park", "Collins View", "Far Southwest",
        "Goose Hollow", "Hillside", "Arlington Heights", "Arnold Creek",
        "Ashcreek", "Crestwood", "Multnomah", "Mt. Scott-Arleta", "Lair Hill",
        "Creston-Kenilworth", "Buckman", "Irvington", "Kerns", "Sabin",
        "Lloyd District",
    ]
    # Incorporated cities in Multnomah, Washington, and Clackamas Counties, OR
    + [
        "Portland", "Gresham", "Fairview", "Troutdale", "Wood Village",
        "Beaverton", "Hillsboro", "Tigard", "Tualatin", "Sherwood",
        "King City", "Cornelius", "Forest Grove", "North Plains",
        "Lake Oswego", "West Linn", "Oregon City", "Gladstone",
        "Milwaukie", "Happy Valley", "Wilsonville", "Canby",
        "Sandy", "Estacada", "Johnson City", "Damascus",
    ]
    # Incorporated cities in Clark County, WA
    + [
        "Vancouver", "Battle Ground", "Camas", "La Center",
        "Ridgefield", "Washougal", "Yacolt",
    ]
)

In [None]:
# San Diego Communities

# Place names for the San Diego area as one flat list of strings.
# Each place appears exactly once so that any per-place tally built from this
# list does not count a place twice.
san_diego_area = [
    # San Diego neighborhoods / communities
    "Allied Gardens", "Alta Vista", "Balboa Park", "Bankers Hill", "Barrio Logan",
    "Bay Ho", "Bay Park", "Bay Terraces", "Birdland", "Black Mountain Ranch",
    "Broadway Heights", "Carmel Mountain Ranch", "Carmel Valley", "Chollas View",
    "City Heights", "Colina del Sol", "Corridor", "Encanto", "Fairmount Park",
    "Gaslamp Quarter", "Golden Hill", "Hillcrest", "Kensington", "La Jolla",
    "La Jolla Shores", "La Jolla Village", "La Jolla Mesa", "La Jolla Heights",
    "Linda Vista", "Little Italy", "Mission Hills", "Mission Valley", "Normal Heights",
    "North Park", "Old Town", "Point Loma", "Pacific Beach", "Mission Beach",
    "Ocean Beach", "Serra Mesa", "San Carlos", "Tierrasanta", "University Heights",
    "East Village", "Downtown", "Marston Hills", "Middletown", "Shelter Island",
    "Sunset Cliffs", "El Cerrito", "Mira Mesa",

    # Cities / towns / unincorporated communities in San Diego County
    "San Diego", "Chula Vista", "Coronado", "Del Mar", "El Cajon", "Encinitas",
    "Escondido", "Carlsbad", "National City", "Vista", "San Marcos", "Imperial Beach",
    "La Mesa", "Poway", "Oceanside", "Santee", "Solana Beach", "Bonita",
    "Alpine", "Bonsall", "Borrego Springs", "Boulevard", "Campo", "Casa de Oro-Mount Helix",
    "Camp Pendleton South", "Crest", "De Luz", "Descanso", "Dulzura", "Fallbrook",
    "Jamul", "Julian", "Lakeside", "Ramona", "Rancho Santa Fe", "San Ysidro",
    "Spring Valley", "Tecate", "Winter Gardens", "Unincorporated San Diego County"
]

In [4]:
####################
#COMBINE DATAFRAMES#
####################

# Reduce each raw frame to a single "text" column (post titles and comment
# bodies) so posts and comments share one schema and can be stacked.
def _select_text_column(frame, source_col):
    """Return a one-column DataFrame named "text" taken from ``source_col``.

    The cell below rebinds the raw frame names to their reduced versions, so
    on a second run of the cell the frames no longer contain ``source_col``.
    In that case the existing "text" column is returned unchanged, which
    keeps the cell safe to re-run without reloading the CSVs.

    Args:
        frame: DataFrame holding either ``source_col`` or an already-renamed
            "text" column.
        source_col: Name of the column that carries the text.

    Returns:
        A DataFrame with exactly one column, "text".

    Raises:
        KeyError: If the frame has neither ``source_col`` nor "text".
    """
    if source_col in frame.columns:
        return frame[[source_col]].rename(columns={source_col: "text"})
    return frame[["text"]]


pdx_posts_df    = _select_text_column(pdx_posts_df, "title")
pdx_comments_df = _select_text_column(pdx_comments_df, "body")

sd_posts_df     = _select_text_column(sd_posts_df, "title")
sd_comments_df  = _select_text_column(sd_comments_df, "body")

# Stack posts on top of comments into a single frame per subreddit,
# renumbering the rows from 0.
pdx_df = pd.concat([pdx_posts_df, pdx_comments_df], ignore_index=True)
sd_df  = pd.concat([sd_posts_df, sd_comments_df], ignore_index=True)

In [5]:
#####################################
#TEXT PREPROCESSING AND TOKENIZATION#
#####################################

# Apply the same normalization and tokenization to both subreddit frames.
for frame in (pdx_df, sd_df):
    # Missing text becomes an empty string, then everything is lowercased.
    frame["text"] = frame["text"].fillna("").str.lower()
    # One list of alphabetic tokens per row, produced by tokenize_clean.
    frame["tokens"] = frame["text"].apply(tokenize_clean)

In [6]:
# View Portland text data: the combined posts + comments frame with its
# lowercased "text" column and the derived "tokens" column.
pdx_df

Unnamed: 0,text,tokens
0,"next portland is relaunching, with your help","[next, portland, is, relaunching, with, your, ..."
1,gluten free beer,"[gluten, free, beer]"
2,saw your car get hit at south waterfront ohsu ...,"[saw, your, car, get, hit, at, south, waterfro..."
3,"amid housing crunch, portland’s inner eastside...","[amid, housing, crunch, portland, s, inner, ea..."
4,"damn koin, that’s a grim prediction","[damn, koin, that, s, a, grim, prediction]"
...,...,...
10252,make the cops annoyed by ice too. there has be...,"[make, the, cops, annoyed, by, ice, too, there..."
10253,certainly not without consequence 😅,"[certainly, not, without, consequence]"
10254,about 35 percent.,"[about, percent]"
10255,the cessna is ppb.,"[the, cessna, is, ppb]"


In [7]:
# View San Diego text data: the combined posts + comments frame with its
# lowercased "text" column and the derived "tokens" column.
sd_df

Unnamed: 0,text,tokens
0,"hi, long-term san diego residence here that ar...","[hi, san, diego, residence, here, that, are, p..."
1,where are the best freshly baked pretzels in s...,"[where, are, the, best, freshly, baked, pretze..."
2,seasonal hiring is starting,"[seasonal, hiring, is, starting]"
3,florida street hostage situation currently hap...,"[florida, street, hostage, situation, currentl..."
4,spring 2025 transfer 3.5 gpa,"[spring, transfer, gpa]"
...,...,...
12934,you’re nuts.\n\nvons cost double what ralphs c...,"[you, re, nuts, vons, cost, double, what, ralp..."
12935,"aldi, walmart, costco \n\neven if you don’t bu...","[aldi, walmart, costco, even, if, you, don, t,..."
12936,aldi always,"[aldi, always]"
12937,i don't know what kind of stuff you're shoppin...,"[i, do, know, what, kind, of, stuff, you, shop..."


In [8]:
#######
#MODEL#
#######

# Each row's token list is treated as one training sentence.
pdx_sentences = pdx_df['tokens'].tolist()
sd_sentences  = sd_df['tokens'].tolist()

# Hyperparameters shared by both models so the two cities are trained
# with identical settings.
w2v_params = dict(
    vector_size=100,  # dimensionality of the word vectors
    window=5,         # max distance between current and predicted word
    min_count=5,      # ignore words with total frequency below this
    workers=4,        # number of training threads
    sg=1,             # 1 = skip-gram
)

# Portland Model
pdx_model = Word2Vec(pdx_sentences, **w2v_params)
pdx_model.save("portland_word2vec.model")

# San Diego Model
sd_model = Word2Vec(sd_sentences, **w2v_params)
sd_model.save("sandiego_word2vec.model")

In [9]:
# Compare the nearest neighbours of one query word in each city's model.
word = "downtown"

for label, model in (("Portland:", pdx_model), ("San Diego:", sd_model)):
    # Prints the 10 most similar words as (word, similarity) pairs.
    print(label, model.wv.most_similar(word, topn=10))

Portland: [('south', 0.9343104958534241), ('waterfront', 0.9287877678871155), ('flying', 0.9147329926490784), ('next', 0.9033794403076172), ('late', 0.9033511281013489), ('north', 0.8933099508285522), ('flew', 0.8929670453071594), ('circling', 0.8874107599258423), ('neighborhood', 0.8866150379180908), ('hours', 0.8856825232505798)]
San Diego: [('coast', 0.9271609783172607), ('east', 0.9150948524475098), ('gaslamp', 0.914539635181427), ('escondido', 0.9133751392364502), ('idli', 0.9108096957206726), ('convoy', 0.9099150896072388), ('university', 0.9088196158409119), ('west', 0.9067992568016052), ('international', 0.9035874009132385), ('hills', 0.9014480113983154)]
