# Text Mining

## Load Packages

In [11]:
# Import packages to use later
import pandas as pd
import nltk
import time
import json

## Load & Process Relationship Types

In [12]:
# Why should we set up the relationships data like this?
# 1) Prevents need for stemming each word in each review (time-consuming)
# 2) Allows arbitrary groupings of relationships beyond stemming equivalence
#    (e.g. "roommate" and "housemate", "children" and "kids", etc.)

with open(r"C:/Users/caraca/Downloads/yelp-relationships-geography-main/relationships/Relationships_ATUS_custom_v2.txt") as f:
    relationships = [line.strip().lower() for line in f.readlines()]

relationships_dict = dict()  ## From relationship category to all relevant relationship words, e.g. "spouse" --> ["spouse", "partner"]
relationships_dict_reverse = dict()  ## From any relationship word to its category, e.g. "partner" --> "spouse"

for line in relationships:
    relevant_words = line.split(",")
    category = relevant_words[0]
    relationships_dict[category] = relevant_words.copy()
    for word in relevant_words:
        relationships_dict_reverse[word] = category

full_relationship_set = set()
for relationship_list in relationships_dict.values():
    full_relationship_set.update(relationship_list)
        
print(relationships)
#print()
#print(relationships_dict)
#print()
#print(relationships_dict_reverse)

words_need_our = ["child", "children", "kid", "kids", "son", "sons", "daughter", "daughters"]

full_relationship_set_new = set()
for word in full_relationship_set:
    full_relationship_set_new.add("my_" + word)
    if word in words_need_our:
        full_relationship_set_new.add("our_" + word)
print(full_relationship_set_new)

['child,children,kid,kids', 'daughter,daughters', 'son,sons', 'parent,parents', 'mother,mom', 'father,dad', 'brother,brothers', 'sister,sisters', 'siblings', 'aunt,aunts', 'uncle,uncles', 'niece,nieces', 'nephew,nephews', 'cousin,cousins', 'grandchild,grandchildren', 'grandmother,grandma', 'grandfather,grandpa', 'grandparents', 'spouse', 'partner', 'husband', 'wife', 'bff', 'relationship', 'date', 'boo,bae,sweetheart', 'fiancee,fiance', 'girlfriend,gf', 'boyfriend,bf', 'friend,friends,buddy,buddies,pal,pals', 'housemate,housemates,roommate,roommates,flatmate,flatmates', 'neighbor,neighbors', 'classmate,classmates', 'professor,professors', 'teacher,teachers', 'coworker,coworkers,colleague,colleagues', 'client,clients', 'boss']
{'my_kids', 'our_kid', 'our_kids', 'my_grandparents', 'my_son', 'my_relationship', 'my_roommates', 'my_neighbors', 'my_housemates', 'our_children', 'our_daughters', 'my_husband', 'my_kid', 'my_girlfriend', 'my_pals', 'my_boss', 'my_professor', 'my_brothers', 'my_b

## Text Mining
### Note that for the purposes of this exercise, we're just going to use the Boulder data set. No need to iterate through the metro areas, so we're going to explicitly read in the Boulder csv.

In [13]:


#metros = ['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder']

# metro = 'Vancouver'
# metro_reviews = pd.read_csv("small_reviews/yelp_academic_dataset_reviews_" + metro + ".csv")
# metro_reviews = pd.read_csv("small_reviews_urbcomp/yelp_academic_dataset_reviews_" + metro + ".csv")
#metro_reviews.head(3)
metro_reviews = pd.read_csv(r"C:/Users/caraca/Downloads/yelp-relationships-geography-main/small_reviews_urbcomp/yelp_academic_dataset_reviews_Boulder.csv")

In [14]:
t0 = time.time()

# Clean review text by making everything lowercase
metro_reviews["text_clean"] = metro_reviews["text"].str.lower()

#note that the words that cue a relationship are ---my--- & ---our---
metro_reviews["text_clean"] = metro_reviews["text_clean"].str.replace("my ", "my_")
metro_reviews["text_clean"] = metro_reviews["text_clean"].str.replace("our ", "our_")

#print(metro_reviews[metro_reviews["review_id"] == "k9vlSSUStwY2DcjM8Rinnw"].iloc[0, -1])

# Apply tokenizer and get rid of punctuation
tokenizer = nltk.RegexpTokenizer(r"\w+")
metro_reviews["text_clean"] = metro_reviews["text_clean"].fillna("0")
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(tokenizer.tokenize)

# Remove duplicate words in each review
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(set)

# Join tokens with spaces into strings for easy word counting
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(" ".join)

t1 = time.time()
print(t1-t0, "sec")

4.882421255111694 sec


In [15]:
metro_reviews.head(3)

Unnamed: 0.1,Unnamed: 0,funny,useful,review_id,text,business_id,stars,date,user_id,cool,datetime,text_clean
0,4,0,0,sjm_uUcQVxab_EeLCqsYLg,The food is always great here. The service fro...,8zehGz9jnxPqXtOc7KaJxA,4.0,2011-07-28 18:05:01,0kA0PAJ8QFMeveQWHFqz2A,0,2011-07-28 18:05:01,if only can of from you is service back patio ...
1,53,0,0,eiAeuhR3kurAO8rAt_rhlg,Brasserie zero zero... we were hoping for a te...,8zehGz9jnxPqXtOc7KaJxA,2.0,2018-08-04 20:52:32,7zEJt0NVl-lMiMwkCsvteg,0,2018-08-04 20:52:32,of disappointing even size mustard tough start...
2,185,0,0,tanL1f9UuVPE2veDDHAL-w,Excellent subs at a reasonable price. A huge v...,i9BDFBYcl_PGqrLbQUdMvg,4.0,2016-06-22 05:13:56,h_kjYBpubAmYVxvNaID7Lw,0,2016-06-22 05:13:56,can decide of ranch those and reasonable huge ...


In [16]:
t0 = time.time()

relationship_categories = sorted(relationships_dict.keys())
relationship_categories_ungroup = sorted(relationships_dict_reverse.keys())

df_rows = []
df_rows_ungroup = []

for i, business_id in enumerate(metro_reviews.business_id.unique()):
    reviews_subset = metro_reviews[metro_reviews["business_id"] == business_id]
    reviews_subset_counts = reviews_subset.text_clean.str.split().explode().value_counts().reset_index()
    x = reviews_subset_counts[reviews_subset_counts["index"].isin(full_relationship_set_new)]
#     print(x)
#     break
    df_row = [business_id, len(reviews_subset)] + [0 for key in relationship_categories]
    df_row_ungroup = [business_id, len(reviews_subset)] + [0 for key in relationship_categories_ungroup]

    for row in x.itertuples():
        key = row.index.split("_")[-1]
        
        df_row_idx = 2 + relationship_categories.index(relationships_dict_reverse[key])
        df_row[df_row_idx] += row.text_clean
        
        df_row_idx_ungroup = 2 + relationship_categories_ungroup.index(key)
        df_row_ungroup[df_row_idx_ungroup] += row.text_clean
        
    df_rows.append(df_row)
    df_rows_ungroup.append(df_row_ungroup)

relationship_df = pd.DataFrame(df_rows,columns = ["business_id", "num_reviews"] + relationship_categories)
relationship_df['num_relationship_words'] = relationship_df[relationship_categories].sum(axis=1)
relationship_df = relationship_df[["business_id", "num_reviews", "num_relationship_words"] + relationship_categories]

relationship_df_ungroup = pd.DataFrame(df_rows_ungroup,columns = ["business_id", "num_reviews"] + relationship_categories_ungroup)
relationship_df_ungroup['num_relationship_words'] = relationship_df_ungroup[relationship_categories_ungroup].sum(axis=1)
relationship_df_ungroup = relationship_df_ungroup[["business_id", "num_reviews", "num_relationship_words"] + relationship_categories_ungroup]

t1 = time.time()
print(t1-t0, "sec")

18.887014627456665 sec


In [20]:
relationship_df.to_csv(r"C:/Users/caraca/Downloads/yelp-relationships-geography-main/output_urbcomp/699counts_v2.csv", index=False)
relationship_df_ungroup.to_csv(r"C:/Users/caraca/Downloads/yelp-relationships-geography-main/output_urbcomp/699counts_ungrouped_v2.csv", index=False)
# relationship_df.to_csv("output/" + metro + "_counts_v2.csv", index=False)
# relationship_df_ungroup.to_csv("output/" + metro + "_counts_ungrouped_v2.csv", index=False)