# Import Modules

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec
from tqdm import tqdm

In [2]:
import pickle

# Pre-bind both names so they exist (as None) even if a load below fails.
node_travel_data = node_information_data = None

# Load the two per-node dictionaries produced by the feature-selection steps.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../feature_selection_method1/node_travel_data.pkl', 'rb') as travel_file:
    node_travel_data = pickle.load(travel_file)

with open('../feature_selection_method2/node_information_data.pkl', 'rb') as info_file:
    node_information_data = pickle.load(info_file)
    

In [3]:
# Spot-check a single node: print its entry from each of the two loaded dicts.
sample_node_id = '004f1ba8a43c7b1196efcaa6267be599cc2713a784e2335cb10a78ac4ab7b1c4'
print(node_travel_data[sample_node_id])
print(node_information_data[sample_node_id])

{'avg_locations_per_day': 1.1774193548387097, 'avg_distance_per_day': 8.63456106653452}
{'age': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8), 'gender': array([0, 1, 0], dtype=uint8), 'score': 117.70510586611638}


In [5]:
# Merge the travel statistics and the demographic information of every node
# into one flat feature dict per node, keyed by node id.
final_features = {}
for key, travel in tqdm(node_travel_data.items()):
    locations_per_day = travel['avg_locations_per_day']
    distance_per_day = travel['avg_distance_per_day']

    info = node_information_data[key]
    score = info['score']

    # 'gender' is indexed positionally: slot 0 is read as the female flag and
    # slot 1 as the male flag; any further slots are not used here.
    gender = info['gender']
    female = 1 if gender[0] == 1 else 0
    male = 1 if gender[1] == 1 else 0

    # Collapse the 'age' indicator array to the index of its first set entry
    # (raises IndexError if no entry is set).
    age = np.where(info['age'] == True)[0][0]

    final_features[key] = {
        'avg_locations_per_day': locations_per_day,
        'avg_distance_per_day': distance_per_day,
        'age': age,
        'score': score,
        'female': female,
        'male': male,
    }


100%|██████████| 62369/62369 [00:00<00:00, 119643.12it/s]


In [6]:
# Display the merged feature dict of one sample node (bare expression, so the
# notebook renders the value as this cell's output).
final_features['fc488e5949734a3198d955cf913324df695912a48a45c17d8d9da089e6c1a6f2']

{'avg_locations_per_day': 0.14516129032258066,
 'avg_distance_per_day': 0.2424202495972863,
 'age': 6,
 'score': 147.5079478399443,
 'female': 0,
 'male': 1}

In [7]:
# Normalize avg_locations_per_day, avg_distance_per_day and score across all
# nodes to the range [-1, 1]; age / female / male are left untouched.
# Ref: https://stats.stackexchange.com/questions/178626/how-to-normalize-data-between-1-and-1
def _rescale_to_minus_one_one(values):
    """Min-max rescale a 1-D array to the range [-1, 1].

    The smallest value maps to -1 and the largest to 1.

    Args:
        values: 1-D array-like of numbers.

    Returns:
        A float ndarray of the same length. An empty input is returned as an
        empty array, and a constant input (max == min) maps to all zeros
        rather than the NaNs a 0/0 division would produce.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    # ndarray.min()/max() are vectorized and evaluated once each, unlike the
    # Python builtins which iterate the array element by element.
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant feature: every node gets the midpoint of the target range.
        return np.zeros_like(values)
    return (2 * (values - lowest) / span) - 1


# Fix the key order once so that array position idx always refers to keys[idx].
keys = list(final_features.keys())

avg_locations_per_day_list = np.array([final_features[key]['avg_locations_per_day'] for key in keys])
avg_distance_per_day_list = np.array([final_features[key]['avg_distance_per_day'] for key in keys])
score_list = np.array([final_features[key]['score'] for key in keys])

# goal : range [-1, 1]
avg_locations_per_day_list_norm = _rescale_to_minus_one_one(avg_locations_per_day_list)
avg_distance_per_day_list_norm = _rescale_to_minus_one_one(avg_distance_per_day_list)
score_list_norm = _rescale_to_minus_one_one(score_list)

# Write the normalized values back into the per-node feature dicts (in place).
for idx, key in enumerate(keys):
    final_features[key]['avg_locations_per_day'] = avg_locations_per_day_list_norm[idx]
    final_features[key]['avg_distance_per_day'] = avg_distance_per_day_list_norm[idx]
    final_features[key]['score'] = score_list_norm[idx]

In [8]:
# Display the same sample node again, now that the normalization cell has
# overwritten avg_locations_per_day, avg_distance_per_day and score.
final_features['fc488e5949734a3198d955cf913324df695912a48a45c17d8d9da089e6c1a6f2']

{'avg_locations_per_day': -0.9911209766925638,
 'avg_distance_per_day': -0.9973329488074967,
 'age': 6,
 'score': -0.9000446211268642,
 'female': 0,
 'male': 1}

In [9]:
import pickle

# Persist the per-node feature dict next to the notebook for later steps.
with open('final_node_features.pkl', 'wb') as out_file:
    pickle.Pickler(out_file).dump(final_features)
    print('dictionary saved successfully to file')

dictionary saved successfully to file
