In [47]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import pandas as pd
from nltk.tokenize import word_tokenize

In [48]:
###########
#FUNCTIONS#
###########

def tokenize_clean(text):
    """Tokenize ``text`` and keep only the purely alphabetic tokens.

    The string is split with NLTK's ``word_tokenize``; every resulting token
    for which ``str.isalpha()`` is false (punctuation, numbers, and tokens
    containing digits, hyphens or apostrophes) is dropped. Case is left
    untouched, so callers must lowercase beforehand if they need that.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the alphabetic tokens, in their original order.
    """
    alphabetic_tokens = []
    for token in word_tokenize(text):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [49]:
##############
#READ IN DATA#
##############

# Load the posts and comments CSV exports for each subreddit.
# Each pair is read in (posts, comments) order from the working directory.
pdx_posts_df, pdx_comments_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Portland_posts.csv', 'Portland_comments.csv')
)

sd_posts_df, sd_comments_df = (
    pd.read_csv(csv_name)
    for csv_name in ('SanDiego_posts.csv', 'SanDiego_comments.csv')
)

In [50]:
####################
#COMBINE DATAFRAMES#
####################

# Reduce each frame to a single "text" column: post titles for the posts
# frames, comment bodies for the comments frames.
# NOTE: these assignments overwrite the frames loaded above, so re-running this
# cell without re-running the load cell raises a KeyError (the "title"/"body"
# columns no longer exist).
pdx_posts_df    = pdx_posts_df["title"].to_frame(name="text")
pdx_comments_df = pdx_comments_df["body"].to_frame(name="text")

sd_posts_df     = sd_posts_df["title"].to_frame(name="text")
sd_comments_df  = sd_comments_df["body"].to_frame(name="text")

# Stack posts on top of comments, one frame per subreddit, with a fresh 0..n-1 index
pdx_frames = [pdx_posts_df, pdx_comments_df]
sd_frames  = [sd_posts_df, sd_comments_df]

pdx_df = pd.concat(pdx_frames, ignore_index=True)
sd_df  = pd.concat(sd_frames, ignore_index=True)

In [51]:
#####################################
#TEXT PREPROCESSING AND TOKENIZATION#
#####################################

# Apply the same cleaning to both subreddit frames (modified in place):
#   1. replace missing text with "" and lowercase it,
#   2. tokenize the cleaned text into a new "tokens" column.
for subreddit_df in (pdx_df, sd_df):
    cleaned_text = subreddit_df["text"].fillna("").str.lower()
    subreddit_df["text"] = cleaned_text
    subreddit_df['tokens'] = subreddit_df['text'].apply(tokenize_clean)

In [None]:
# Display the combined Portland frame (lowercased "text" next to its "tokens"
# list) as a visual sanity check of the preprocessing step.
pdx_df

Unnamed: 0,text,tokens
0,"next portland is relaunching, with your help","[next, portland, is, relaunching, with, your, ..."
1,gluten free beer,"[gluten, free, beer]"
2,saw your car get hit at south waterfront ohsu ...,"[saw, your, car, get, hit, at, south, waterfro..."
3,"amid housing crunch, portland’s inner eastside...","[amid, housing, crunch, portland, s, inner, ea..."
4,"damn koin, that’s a grim prediction","[damn, koin, that, s, a, grim, prediction]"
...,...,...
1495,you’re the one arguing for the rights of peopl...,"[you, re, the, one, arguing, for, the, rights,..."
1496,hey bro i know you just moved here last year a...,"[hey, bro, i, know, you, just, moved, here, la..."
1497,"true, our tax dollars fund the many handouts t...","[true, our, tax, dollars, fund, the, many, han..."
1498,"""...but it’s also meant to encourage people li...","[but, it, s, also, meant, to, encourage, peopl..."


In [None]:
# Display the combined San Diego frame (lowercased "text" next to its "tokens"
# list) as a visual sanity check of the preprocessing step.
sd_df

Unnamed: 0,text,tokens
0,"hi, long-term san diego residence here that ar...","[hi, san, diego, residence, here, that, are, p..."
1,where are the best freshly baked pretzels in s...,"[where, are, the, best, freshly, baked, pretze..."
2,seasonal hiring is starting,"[seasonal, hiring, is, starting]"
3,florida street hostage situation currently hap...,"[florida, street, hostage, situation, currentl..."
4,spring 2025 transfer 3.5 gpa,"[spring, transfer, gpa]"
...,...,...
1495,"no permits at all required on goldfinch, not s...","[no, permits, at, all, required, on, goldfinch..."
1496,mostly...,[mostly]
1497,all the og stoners see this laughing cause you...,"[all, the, og, stoners, see, this, laughing, c..."
1498,"hi, welcome/welcome back to the new and return...","[hi, back, to, the, new, and, returning, colle..."
