In [31]:
import os
import pandas as pd
import numpy as np
import heapq

# %pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

N_SPLIT = 10
# ROOT_PATH = '/Users/ruby0322/Projects/112-1/IRTM/term-project/irtm-final-project'



In [41]:
def read_network(input_filepath) -> dict[list[str]]:
    G = {}
    with open(input_filepath) as file:
        while (line := file.readline()):
            if line:
                n1, n2 = line.split()
                if n1 in G:
                    G[n1].append(n2)
                else:
                    G[n1] = [n2]
                if n2 in G:
                    G[n2].append(n1)
                else:
                    G[n2] = [n1]
    return G

def dijkstra_unweighted(G, start):
    """
    Compute shortest paths from the start node to all other nodes in an unweighted graph.

    :param G: A dictionary representing the adjacency list of the graph.
                  Each key is a node, and its value is a list of its neighbors.
    :param start: The starting node
    :return: A dictionary of shortest distances from the start node to each other node.
    """
    # Initialize distances as infinity and distance to start node as 0
    distances = {node: float('infinity') for node in G}
    distances[start] = 0

    # Priority queue to hold nodes and their current distances
    pq = [(0, start)]

    while pq:
        current_distance, current_node = heapq.heappop(pq)

        # Explore neighbors
        for neighbor in G[current_node]:
            distance = current_distance + 1  # Each edge has a weight of 1

            # Update distance if a shorter path is found
            if distance < distances[neighbor]:
                distances[neighbor] = distance
                heapq.heappush(pq, (distance, neighbor))

    return distances

def get_subnetwork(G, mu, k=1):
    distances = dijkstra_unweighted(G, mu)
    return [list(filter(lambda node: distances[node] == dep, distances.keys())) for dep in range(k)]

def split_reviews(input_filepath, output_folder) -> None:
    with open(input_filepath) as file:
        s = file.read()
        s = s.split('\n')
        slen = len(s)
        n_reviews = (slen // N_SPLIT) + 1
        for i in range(N_SPLIT):
            ending = slen if i == N_SPLIT - 1 else n_reviews*(i+1)
            ss = s[n_reviews*i:ending]
            if i == 0:
              ss = ss[1:]
            print(f'[split-reviews] Parsing reviews split {i}...')
            reviews = dict()
            exec('\n'.join(ss), { 'reviews': reviews })
            # print(reviews)
            print(f'[split-reviews] Saving reviews split {i} into "reviews-{i}.csv"...')
            pd.DataFrame(reviews).transpose().dropna().to_csv(f'{output_folder}/reviews-{i}.csv', index=False)

def iterate_over_split_reviews(func):
    """
    func: a function that returns a list of boolean that indexes the reviews
    """
    aggr = pd.DataFrame()
    for i in range(N_SPLIT):
        df = pd.read_csv(f'/content/reviews/reviews-{i}.csv')
        df = df[func(df)]
        aggr = pd.concat([aggr, df])
    return aggr

def get_network_reviews(network: list[str]) -> pd.DataFrame:
    """
    network: list of users in the network
    """
    def f(df: pd.DataFrame):
        return df['user'].isin(network)
    return iterate_over_split_reviews(f)

def get_subnetwork_reviews_by_mu(G, mu, k=1):
    network = get_subnetwork(G, mu, k)
    aggr = pd.DataFrame()
    for dep, subnetwork in enumerate(network):
        df = get_network_reviews(subnetwork)
        df['depth'] = [dep] * df.shape[0]
        aggr = pd.concat([aggr, df])
    return aggr

def split_graph(mu, subgraph, time_threshold):

    def user_time_threshold(mu, subgraph, threshold=0.8): # subgraph type = pd.Dataframe
        user_time_list = sorted(list(subgraph[subgraph['user'] == mu]['unixtime']))
        return user_time_list[int(len(user_time_list)*threshold)]

    def pre_post_split(mu, subgraph, time_threshold=0.8):
        post = subgraph.loc[((subgraph['user'] == mu) & (subgraph['unixtime'] > user_time_threshold(mu, subgraph, time_threshold)))].copy(deep=True)
        pre = subgraph.loc[~subgraph.index.isin(post.index)]
        post.drop('unixtime', inplace=True, axis=1)
        pre.drop('unixtime', inplace=True, axis=1)
        return pre, post

    pre, post = pre_post_split(mu, subgraph)
    def pre_mu_split(mu, pre, post):
        pre_mu = pre.loc[pre['user'] == mu]
        pre_not_mu = pre.loc[~pre.index.isin(pre_mu.index)]

        post = post[post['work'].isin(pre_not_mu['work'])]
        pre_mu.drop('user', inplace=True, axis=1)
        pre_not_mu.drop('user', inplace=True, axis=1)
        post.drop('user', inplace=True, axis=1)
        return pre_mu, pre_not_mu, post

    pre_mu, pre_not_mu, post = pre_mu_split(mu, pre, post)

    ITEM_LIST = list(pre_not_mu['work'].unique())
    ITEM_NUM = len(ITEM_LIST)
    new_post = []

    for i, work in enumerate(ITEM_LIST):
        if(work in list(post['work'])):
            new_post.append(list(post.loc[post['work'] == work].iloc[0].copy(deep=True)))
        else:
            new_post.append([])
            new_post[i].append(work)
            for j in range(len(post.columns) - 1):
                new_post[i].append(0)
    post = pd.DataFrame(np.array(new_post), columns=list(post.columns))

    return pre_mu, pre_not_mu, post

def get_pre_notmu_with_trust(pre_mu, pre_notmu):
    def encode(x):
      return model.encode(x, convert_to_tensor=True)

    pre_mu['comment'] = pre_mu['comment'].apply(encode)
    pre_notmu['comment'] = pre_notmu['comment'].apply(encode)

    def f(cmt):
      s = 0.
      for mu_cmt in pre_mu['comment']:
        s += float(util.pytorch_cos_sim(cmt, mu_cmt)[0][0])
      return s

    pre_notmu['trust'] = pre_notmu['comment'].apply(f)
    pre_notmu['trust'] = pre_notmu['trust']/pre_mu.shape[0]

    return pre_notmu


In [33]:
# !mkdir ~/.kaggle
# !touch ~/.kaggle/kaggle.json

# api_token = {"username":"ruby0322", "key":"e523eb566351e1b4a0a1695201669d6f"}

# import json

# with open('/root/.kaggle/kaggle.json', 'w') as file:
#     json.dump(api_token, file)

# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d pypiahmad/social-recommendation-data

In [34]:
# split_reviews('/content/raw-data/reviews.txt', '/content/reviews')

In [35]:
mu = 'slash'

G = read_network(f'/content/raw-data/edges.txt')
print(len(G[mu]))
# print(get_network_reviews(G['slash'] + ['slash']))
# print(pd.DataFrame(list(get_subnetwork(G, 'carterchristian1', 4).items())))
# print(get_subnetwork(G, 'slash', 2))

15


In [None]:
subgraph = get_subnetwork_reviews_by_mu(G, mu, 2)
subgraph

In [39]:
subgraph

Unnamed: 0,comment,nhelpful,unixtime,work,flags,user,stars,time,depth
63993,Inhaltsangabe:\nDie Dmonen aus der Hlle haben ...,1,1203552000,3550867,[],slash,3.5,"Feb 21, 2008",0
76531,Inhaltsangabe:\nEdward Munrow ist ein Lebemann...,0,1165708800,1750962,[],slash,3.0,"Dec 10, 2006",0
127286,"Erbe des Adelstitels stellt sich dumm, damit s...",0,1168732800,26326,[],slash,4.0,"Jan 14, 2007",0
6597,Inhaltsangabe:\nXeras ist ein Adliger und eine...,0,1220659200,3737595,[],slash,2.5,"Sep 6, 2008",0
48230,Inhaltsangabe:\nBei dem Buch handelt es sich u...,0,1205020800,60309,[],slash,2.5,"Mar 9, 2008",0
84173,Die Geschichte spielt im 17. Jh. und die Haupt...,0,1174694400,2241980,[],slash,4.0,"Mar 24, 2007",0
95253,Inhaltsangabe:\nCouplings fasst die beiden zun...,0,1220400000,7887921,[],slash,3.5,"Sep 3, 2008",0
118464,"Inhaltsangabe:\nChay ist Tierarzt, Werwolf und...",1,1189382400,2044763,[],slash,1.5,"Sep 10, 2007",0
133476,Inhaltsangabe:\nWir befinden uns in der Zeit N...,0,1209427200,2046797,[],slash,5.0,"Apr 29, 2008",0
136607,Zwei russische Soldaten und eine Gruppe deutsc...,0,1163635200,1729337,[],slash,3.0,"Nov 16, 2006",0


In [42]:
pre_mu, pre_notmu, post = split_graph(mu, subgraph, 0.8)
print(pre_mu)
print(pre_notmu)
print(post)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre.drop('unixtime', inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_not_mu.drop('user', inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post.drop('user', inplace=True, axis=1)


ValueError: ignored

In [None]:
pre_notmu_with_trust = get_pre_notmu_with_trust(pre_mu, pre_notmu)
print(pre_notmu_with_trust)