In [6]:
import os
os.chdir('..')

In [31]:
 
import pandas as pd
edge_df = pd.read_csv('./data/movie_lens/edges.txt', header=None, usecols=[1, 2, 3])
# edge_df[['user', 'movie']
#             ] = edge_df[0].str.split(';', expand=True)

# edge_df = edge_df \
#     .drop(columns=[0]) \
#     .rename(columns={1: 'weight'}) \
#     .assign(user=lambda x: x['user'].apply(float).apply(int)) \
#     .assign(movie=lambda x: x['movie'].apply(float).apply(int)) \
#     .assign(weight=lambda x: x['weight'].apply(float))

In [32]:
edge_df

Unnamed: 0,1,2,3
0,98,2019,Action|Drama
1,98,1276,Comedy|Drama
2,98,3429,Animation|Comedy
3,98,2918,Comedy
4,98,1208,Drama|War
...,...,...,...
18693,5465,398,Documentary
18694,5465,582,Comedy
18695,5465,2214,Thriller
18696,5465,651,Comedy


In [36]:
pd.read_csv('./data/movie_lens/ratings.txt')

Unnamed: 0,4,3468,5,978294008
0,4,1210,3,978293924
1,4,2951,4,978294282
2,4,1214,4,978294260
3,4,1036,4,978294282
4,4,260,5,978294199
...,...,...,...,...
4131,6038,1387,2,956707005
4132,6038,2700,1,956715051
4133,6038,2716,3,956707604
4134,6038,3396,3,956706827


In [7]:

import numpy as np
import networkx as nx

# gMission files
G_MISSION_FILES = {
    "edges": "edges.txt",
    "tasks": "tasks.txt",
    "reduced_tasks": "reduced_tasks.txt",
    "reduced_workers": "reduced_workers.txt"
}

# Movie lens files
MOVIE_LENS_FILES = {
    "movies": "movies.txt",
    "users": "users.txt",
    "edges": "edges.txt",
    "ratings": "ratings.txt",
    "feature_weights": "feature_weights.txt"
}

gMission_folder = "data/g_mission/"
gMission_edges = gMission_folder + "edges.txt"
gMission_tasks = gMission_folder + "tasks.txt"
gMission_reduced_tasks = gMission_folder + "reduced_tasks.txt"
gMission_reduced_workers = gMission_folder + "reduced_workers.txt"

# MovieLense files
movie_lens_folder = "data/movie_lens/"
movie_lense_movies = movie_lens_folder + "movies.txt"
movie_lense_users = movie_lens_folder + "users.txt"
movie_lense_edges = movie_lens_folder + "edges.txt"
movie_lense_ratings = movie_lens_folder + "ratings.txt"
movie_lense_feature_weights = movie_lens_folder + "feature_weights.txt"

def parse_movie_lense_dataset():
    f_edges = open(movie_lense_edges, "r")
    f_movies = open(movie_lense_movies, "r")
    f_users = open(movie_lense_users, "r")
    f_feature_weights = open(movie_lense_feature_weights, "r")
    num_genres = 15
    gender_map = {"M": 0, "F": 1}
    age_map = {"1": 0, "18": 1, "25": 2, "35": 3, "45": 4, "50": 5, "56": 6}
    genre_map = {
        "Action": 0,
        "Adventure": 1,
        "Animation": 2,
        "Children's": 3,
        "Comedy": 4,
        "Crime": 5,
        "Documentary": 6,
        "Drama": 7,
        "Film-Noir": 8,
        "Horror": 9,
        "Musical": 10,
        "Romance": 11,
        "Sci-Fi": 12,
        "Thriller": 13,
        "War": 14,
    }
    users = {}
    movies = {}
    edges = {}
    feature_weights = {}
    user_ids = []
    popularity = {}
    for u in f_users:
        info = u.split(",")[:4]
        info[1] = float(gender_map[info[1]])
        info[2] = float(age_map[info[2]]) / 6.0
        info[3] = float(info[3]) / 21.0
        users[info[0]] = info[1:4]
        user_ids.append(int(info[0]))
    user_ids.sort()
    for i, u in enumerate(user_ids):
        users[str(u)].append(i)

    for m in f_movies:
        info = m.split("::")
        genres = info[2].split("|")
        genres[-1] = genres[-1].split("\n")[0]  # remove "\n" character
        genres_id = np.array(list(map(lambda g: genre_map[g], genres)))
        one_hot_encoding = np.zeros(num_genres)
        one_hot_encoding[genres_id] = 1.0
        movies[info[0]] = list(one_hot_encoding)
        popularity[info[0]] = 0

    for e in f_edges:
        info = e.split(",")
        genres = info[3].split("|")
        genres[-1] = genres[-1].split("\n")[0]  # remove "\n" character
        edges[(info[2], info[1])] = list(map(lambda g: genre_map[g], genres))
        popularity[info[2]] += 1

    for w in f_feature_weights:
        feature = w.split(",")
        feature[-1] = feature[-1].split("\n")[0]  # remove "\n" character
        if feature[1] not in feature_weights:
            feature_weights[feature[1]] = [0.0] * num_genres
        feature_weights[feature[1]][genre_map[feature[0]]] = float(feature[2]) / 5.0
    return users, movies, edges, feature_weights, popularity




def generate_movie_lense_graph(
    u, v, users, edges, movies, sampled_movies, weight_features, seed, vary_fixed=False
):
    np.random.seed(seed)
    G = nx.Graph()
    G = add_nodes_with_bipartite_label(G, u, v)

    G.name = f"movielense_random_graph({u},{v})"

    movies_id = np.array(list(movies.keys())).flatten()
    users_id = np.array(list(users.keys())).flatten()

    if vary_fixed:
        sampled_movies = list(np.random.choice(movies_id, size=u, replace=False))
    movies_features = list(map(lambda m: movies[m], sampled_movies))

    users_features = []
    user_freq_dic = {}  # {v_id: freq}, used for the IPsolver
    sampled_users_dic = {}  # {user_id: v_id}
    # edge_vector_dic = {u: movies_features[u] for u in range(len(sampled_movies))}

    for i in range(v):
        # construct the graph
        j = 0
        while j == 0:
            sampled_user = np.random.choice(users_id)
            user_info = list(weight_features[sampled_user]) + users[sampled_user]
            for w in range(len(sampled_movies)):
                movie = sampled_movies[w]
                edge = (movie, sampled_user)
                if edge in edges and (w, i + u) not in G.edges:
                    G.add_edge(w, i + u)
                    j += 1

        # collect data for the IP solver
        if sampled_user in sampled_users_dic:
            k = sampled_users_dic[sampled_user]
            user_freq_dic[k].append(i)
        else:
            sampled_users_dic[sampled_user] = i
            user_freq_dic[i] = [i]

        # append user features for the model
        users_features.append(user_info)

    # print('r_v: ', user_freq_dic)
    # print('movies_features: ', movies_features)
    # construct the preference matrix, used by the IP solver
    # print("G: \n", nx.adjacency_matrix(G).todense())
    preference_matrix = np.zeros(
        (len(sampled_users_dic), 15)
    )  # 15 is the number of genres
    # print('sampled_users_dic: ', sampled_users_dic)
    adjacency_matrix = np.ndarray((len(sampled_users_dic), u))
    i = 0
    graph = nx.adjacency_matrix(G).todense()
    for user_id in sampled_users_dic:
        preference_matrix[i] = weight_features[user_id]
        v_id = sampled_users_dic[user_id]
        # print('v_id: ', v_id)
        adjacency_matrix[i] = graph[u + v_id, :u]
        i += 1

    # user_freq = list(map(lambda id: user_freq_dic[id], user_freq_dic)) + [0] * (v - (len(user_freq_dic)))
    # print('adj_matrix: \n', adjacency_matrix)
    return (
        G,
        np.array(movies_features),
        np.array(users_features),
        adjacency_matrix,
        user_freq_dic,
        movies_features,
        preference_matrix,
    )


In [33]:
sers, movies, edges, feature_weights, popularity = parse_movie_lense_dataset()

In [34]:
edges

{('2019', '98'): [0, 7],
 ('1276', '98'): [4, 7],
 ('3429', '98'): [2, 4],
 ('2918', '98'): [4],
 ('1208', '98'): [7, 14],
 ('1358', '98'): [7, 13],
 ('1303', '98'): [1],
 ('1131', '98'): [7],
 ('1362', '98'): [7],
 ('2940', '98'): [8],
 ('1419', '98'): [7],
 ('1414', '98'): [4],
 ('1960', '98'): [7, 14],
 ('180', '98'): [4],
 ('2594', '98'): [7, 11, 12],
 ('2565', '98'): [10],
 ('1206', '98'): [12],
 ('36', '98'): [7],
 ('2318', '98'): [4],
 ('608', '98'): [5, 7, 13],
 ('2236', '98'): [7],
 ('2970', '98'): [1, 7],
 ('778', '98'): [7],
 ('538', '98'): [7],
 ('523', '98'): [7],
 ('848', '98'): [7],
 ('587', '98'): [4, 11, 13],
 ('1127', '98'): [0, 1, 12, 13],
 ('2863', '98'): [4, 10],
 ('635', '98'): [4, 7],
 ('3643', '98'): [0, 7, 14],
 ('2344', '98'): [0, 1, 7, 13],
 ('3270', '98'): [7],
 ('124', '98'): [7],
 ('3197', '98'): [0],
 ('68', '98'): [4, 11],
 ('170', '98'): [0, 5, 13],
 ('1811', '98'): [7],
 ('251', '98'): [0],
 ('1835', '98'): [11],
 ('3221', '98'): [7],
 ('252', '98'): [

In [35]:
popularity

{'36': 194,
 '68': 200,
 '124': 200,
 '170': 200,
 '180': 200,
 '192': 200,
 '251': 200,
 '252': 200,
 '273': 199,
 '398': 200,
 '404': 200,
 '410': 199,
 '523': 200,
 '538': 199,
 '550': 200,
 '582': 200,
 '587': 200,
 '608': 184,
 '635': 200,
 '651': 200,
 '657': 200,
 '725': 200,
 '757': 200,
 '778': 199,
 '829': 200,
 '843': 200,
 '848': 200,
 '852': 197,
 '1127': 187,
 '1128': 200,
 '1131': 199,
 '1206': 200,
 '1208': 197,
 '1276': 197,
 '1303': 199,
 '1358': 197,
 '1362': 199,
 '1414': 200,
 '1419': 199,
 '1434': 200,
 '1436': 200,
 '1487': 199,
 '1489': 200,
 '1567': 200,
 '1582': 200,
 '1622': 200,
 '1681': 199,
 '1780': 200,
 '1801': 200,
 '1811': 200,
 '1829': 200,
 '1835': 200,
 '1883': 200,
 '1911': 193,
 '1960': 199,
 '1992': 200,
 '2011': 199,
 '2019': 197,
 '2035': 200,
 '2042': 200,
 '2214': 200,
 '2236': 199,
 '2318': 200,
 '2344': 199,
 '2415': 200,
 '2448': 200,
 '2473': 199,
 '2565': 200,
 '2594': 200,
 '2833': 200,
 '2850': 200,
 '2863': 199,
 '2885': 200,
 '2906':