In [16]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec

## Load in the data

In [17]:
# Load the raw records from the tab-separated export.
df = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')

# Row filter parameters: date window (both ends inclusive) and minimum dwell.
dwell_time_threshold = 60 # in minutes
start_date = pd.to_datetime('2020-07-01')
end_date = pd.to_datetime('2020-07-06')

# Parse the date column once and reuse it for both bounds instead of
# converting the full column twice.
utc_dates = pd.to_datetime(df['utc_date'])
filtered_df = df[utc_dates.between(start_date, end_date) & (df['dwell'] >= dwell_time_threshold)]

# Preview the unfiltered frame.
df.head()

Unnamed: 0,persistentid,venueid,utc_date,utc_hour,local_date,local_hour,gender,age,full_panel_reweighted_sag_score,dwell,home_zip
0,5903d26cdcecbd13590c8fe594de785f19b16004e19156...,40b52f80f964a52051001fe3,2020-07-18,22,2020-07-18,17,Female,Age_45_to_49,159.851478,60,77060.0
1,cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...,40b52f80f964a52051001fe3,2020-07-18,22,2020-07-18,17,Female,Age_25_to_29,223.827293,1011,75024.0
2,21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...,40b52f80f964a52051001fe3,2020-07-25,0,2020-07-24,19,Male,Age_35_to_39,110.797268,1058,
3,e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...,40b52f80f964a52051001fe3,2020-07-25,18,2020-07-25,13,,,0.0,30,
4,63164c43b459b4260a338979948ead113855ddaafb9d52...,40b52f80f964a52051001fe3,2020-07-29,18,2020-07-29,13,Male,Age_35_to_39,137.821169,763,78660.0


In [18]:
# Columns retained as candidate per-device features.
potential_features = [
    'persistentid',
    'gender',
    'age',
    'full_panel_reweighted_sag_score',
]

In [19]:
# Keep only the columns named in potential_features; everything else is dropped.
unused_columns = filtered_df.columns.difference(potential_features)
filtered_df = filtered_df.drop(columns=unused_columns)
filtered_df

Unnamed: 0,persistentid,gender,age,full_panel_reweighted_sag_score
5,ad9adea8e7d63428e9372e0d670244e5033d4d2988e554...,Female,Age_30_to_34,144.987245
10,b8ea7697c88dc1f4a1f9b9bbe60d07c304ffc1539ee806...,Male,Age_30_to_34,151.982086
14,d7891bc5540d1352f5149f902922ffa923fe1d27e06be8...,Female,Age_30_to_34,167.096863
17,fccee91660f332ee4b571403661ecc3aed3d29fbc89792...,Female,Age_50_to_54,131.421249
18,ff8ff5da9fbb2a65d408a333fa1359616c899634f246d4...,Female,Age_30_to_34,163.291871
...,...,...,...,...
608114,064695660cc27a6dec49fec9ce78c9a2d3a660e3d7455f...,Male,Age_40_to_44,100.835158
608126,e4689cd484fbbae0557ece20a8b865962f9a3ee82e7c02...,Female,Age_40_to_44,98.168887
608139,7a8daccdf93e1e1ebd632d049d50c246b0482850b5fdf1...,Male,Age_30_to_34,142.966973
608143,126ebcca5cbb1d17bf4dbeb234abb23177a5c9d1088653...,Male,Age_35_to_39,106.696138


In [20]:
# Placeholder used for missing entries in each feature column:
# a 'Missing' category for the categorical columns, 0 for the score.
fill_values = {
    'gender': 'Missing',
    'age': 'Missing',
    'full_panel_reweighted_sag_score': 0,
}
for column_name, placeholder in fill_values.items():
    filtered_df[column_name] = filtered_df[column_name].fillna(placeholder)

In [21]:
# Empty per-device lookup tables, keyed by persistentid once populated.
id_to_gender, id_to_age, id_to_scores = {}, {}, {}

# Distinct device ids present in the filtered frame.
devices_set = set(filtered_df['persistentid'])

In [22]:
# Spot check: is this particular device id among the ids collected in devices_set?
'004f1ba8a43c7b1196efcaa6267be599cc2713a784e2335cb10a78ac4ab7b1c4' in devices_set

True

In [23]:
# Collapse the row-level frame into one record per device without re-scanning
# the whole frame for every device id (the per-device boolean mask was quadratic).

# First row seen for each device supplies its age and gender.
# Rows with a missing persistentid are skipped: they cannot be keyed.
first_rows = (
    filtered_df
    .dropna(subset=['persistentid'])
    .drop_duplicates(subset='persistentid')
    .set_index('persistentid')
)
id_to_gender.update(first_rows['gender'].to_dict())
id_to_age.update(first_rows['age'].to_dict())

# The score is averaged over all of a device's rows.
mean_scores = filtered_df.groupby('persistentid')['full_panel_reweighted_sag_score'].mean()
id_to_scores.update(mean_scores.to_dict())

In [24]:
# Spot check: show the gender value stored for one device id.
id_to_gender['004f1ba8a43c7b1196efcaa6267be599cc2713a784e2335cb10a78ac4ab7b1c4']

'Male'

In [25]:
def create_one_hot_encoding_df(id_to_feature: dict, feature: str) -> pd.DataFrame:
    """One-hot encode a mapping of id -> categorical value.

    Args:
        id_to_feature: Mapping whose keys become the row index and whose
            values are the category to encode.
        feature: Name of the feature; used as the column-name prefix, so the
            resulting columns are named ``<feature>_<category>``.

    Returns:
        A DataFrame indexed by the mapping's keys with one indicator column
        per distinct category.
    """
    single_column = pd.DataFrame.from_dict(
        id_to_feature,
        orient='index',
        columns=[feature],
    )
    return pd.get_dummies(single_column, prefix=feature)

In [26]:
# One-hot encode the per-device categorical lookups.
gender_df = create_one_hot_encoding_df(id_to_feature=id_to_gender, feature='gender')
age_df = create_one_hot_encoding_df(id_to_feature=id_to_age, feature='age')

In [27]:
# Assemble the final per-device record: one-hot age vector, one-hot gender
# vector, and the mean score.
feature_map = {}
for device in devices_set:
    # Each frame is looked up by label on its own index. The earlier version
    # filtered age_df with a mask built from gender_df.index, which was only
    # correct while both frames happened to share the same row order.
    feature_map[device] = {
        'age': age_df.loc[device].values,
        'gender': gender_df.loc[device].values,
        'score': id_to_scores[device],
    }



In [28]:
# Spot check: display the assembled feature record for one device id.
feature_map['9f7432ad73deb4ab9f30b04222922e25e928429576e33763eb2c4027ee14c0fa']

{'age': array([False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False, False]),
 'gender': array([ True, False, False]),
 'score': 111.07597379585306}

In [29]:
# Serialize the per-device feature map to disk.
with open('node_information_data.pkl', 'wb') as fp:
    pickle.dump(feature_map, fp)

# Report success only after the with-block has closed (and flushed) the file.
print('dictionary saved successfully to file')

dictionary saved successfully to file
