In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from igraph import Graph, plot, RainbowPalette

In [None]:
def read_chunks(file, cols=None, city=None, chunk_size=500):
    """Load a JSON-lines file chunk by chunk and return it as one DataFrame.

    Parameters
    ----------
    file : str
        Path of the JSON-lines file (one JSON object per line). Also used
        as the label of the progress bar.
    cols : list of str, optional
        Columns to keep from every chunk. ``None`` keeps all columns.
    city : str, optional
        When truthy, only rows whose ``'city'`` column equals this value are
        kept. The file must then contain a ``'city'`` column.
    chunk_size : int, default 500
        Number of lines parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        All kept rows with a fresh ``0..n-1`` index. If the file yields no
        chunks at all, an empty frame (with ``cols`` as columns, when given)
        is returned instead of raising.
    """
    kept_chunks = []

    # With `chunksize` set, read_json returns a lazy reader rather than a
    # DataFrame. Using it as a context manager guarantees the underlying file
    # handle is closed even if parsing or filtering raises part-way through.
    with pd.read_json(path_or_buf=file, chunksize=chunk_size, lines=True) as reader:
        for chunk in tqdm(reader, desc=file):
            if city:
                chunk = chunk[chunk['city'] == city]
            kept_chunks.append(chunk if cols is None else chunk[cols])

    if not kept_chunks:
        # pd.concat raises ValueError on an empty list (e.g. an empty file).
        return pd.DataFrame(columns=cols)

    return pd.concat(kept_chunks, ignore_index=True)

In [None]:
# Everything below is narrowed to the businesses of a single city.
city_of_interest = 'Santa Barbara'


def _only_city_businesses(frame):
    """Return the rows of `frame` whose business_id is in `business_ids`."""
    belongs_to_city = frame['business_id'].isin(business_ids)
    return frame[belongs_to_city]


# The business file is the only one filtered by city while it is being read;
# its ids then drive the filtering of every other table.
business_data = read_chunks(
    'yelp_dataset/yelp_academic_dataset_business.json',
    city=city_of_interest,
)
business_ids = business_data['business_id'].unique()

# Check-ins, reviews and tips are read in full and then cut down to the
# businesses selected above.
checkin_data = _only_city_businesses(
    read_chunks('yelp_dataset/yelp_academic_dataset_checkin.json')
)
review_data = _only_city_businesses(
    read_chunks('yelp_dataset/yelp_academic_dataset_review.json')
)
tip_data = _only_city_businesses(
    read_chunks('yelp_dataset/yelp_academic_dataset_tip.json')
)

# A user is "of interest" if they wrote a review or a tip for one of the
# selected businesses.
users_of_interest = set(review_data['user_id']) | set(tip_data['user_id'])
user_data = read_chunks('yelp_dataset/yelp_academic_dataset_user.json')
user_data = user_data[user_data['user_id'].isin(users_of_interest)]

In [None]:
# At this point `friends` still holds one ', '-separated string per user (the
# split into lists only happens in a later cell), and exploding a string
# column is a no-op. Split into a temporary Series first, without touching
# `user_data`, so the later cell still sees the original strings. The
# isinstance guard keeps this cell correct if it is re-run after that split.
friend_lists = user_data['friends'].map(
    lambda value: value.split(', ') if isinstance(value, str) else value
)
user_friends = user_data.assign(friends=friend_lists).explode('friends')

# Create nodes DataFrame: one row per distinct user. The exploded frame
# repeats each user once per friend, hence the de-duplication.
nodes_df = (
    user_friends[['user_id']]
    .drop_duplicates()
    .rename(columns={'user_id': 'id'})
    .assign(**{
        'media': 'user',
        'media.type': 0,
        'type.label': 'user',
        'audience.size': 1,  # Assuming all users have an audience size of 1
    })
)

# Create links DataFrame: one row per (user, friend) pair. Only the two
# endpoint columns are kept so the remaining user attributes are not carried
# along on every edge.
links_df = (
    user_friends[['user_id', 'friends']]
    .rename(columns={'user_id': 'from', 'friends': 'to'})
    .assign(weight=1, type='friendship')
)

# 'None' is the placeholder string filtered out by the graph-building cell as
# well; drop it here too, then remove repeated pairs.
links_df = links_df[links_df['to'] != 'None'].drop_duplicates()

In [None]:
# Turn the ', '-separated `friends` string of every user into a list.
# `assign` builds a new frame instead of writing into `user_data`, which is a
# filtered slice (in-place column assignment there triggers pandas'
# SettingWithCopyWarning). The isinstance guard makes the cell safe to re-run:
# values that are already lists (or missing) are passed through unchanged
# rather than being turned into NaN by a second `.str.split`.
user_data = user_data.assign(
    friends=user_data['friends'].map(
        lambda value: value.split(', ') if isinstance(value, str) else value
    )
)

# One row per (user, friend) pair; all other user columns are repeated.
user_friends = user_data.explode('friends')

In [None]:
# Create nodes DataFrame: one row per distinct user in the exploded frame.
nodes_df = (
    user_friends[['user_id']]
    .drop_duplicates()
    .rename(columns={'user_id': 'id'})
    .assign(**{'type.label': 'user', 'audience.size': 1})
)
print(nodes_df)

# Create links DataFrame: one row per (user, friend) pair.
user_friends = user_friends[['user_id', 'friends']]
links_df = (
    user_friends
    .rename(columns={'user_id': 'from', 'friends': 'to'})
    .assign(weight=1, type='friendship')
)

# 'None' is a placeholder string in the friends column, not a user id.
links_df = links_df[links_df['to'] != 'None']

# Friend lists can reference users that are not in `nodes_df`. With
# `use_vids=False` and an explicit `vertices` table, Graph.DataFrame rejects
# edges whose endpoints are missing from that table, so keep only friendships
# between known users. This also drops missing (NaN) targets.
# NOTE(review): this yields the friendship graph *among* the selected users;
# if outside friends should appear as vertices, add them to nodes_df instead.
links_df = links_df[links_df['to'].isin(nodes_df['id'])].drop_duplicates()

# Create graph: the first two link columns are the edge endpoints; the first
# node column holds the vertex names those endpoints refer to.
g = Graph.DataFrame(links_df, directed=True, use_vids=False, vertices=nodes_df)
plot(g)
