In [29]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec

## Load in the data

In [30]:
# Load the raw records (tab-separated) into a DataFrame.
df = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')

# Row filters: keep records inside the date window whose dwell meets the threshold.
dwell_time_threshold = 60 # in minutes
start_date = pd.to_datetime('2020-07-01')
# NOTE(review): end_date equals start_date, so only rows whose utc_date parses
# to exactly 2020-07-01 pass the filter — confirm a single-day window is intended.
end_date = pd.to_datetime('2020-07-01')

# Parse the date column once instead of once per comparison.
utc_dates = pd.to_datetime(df['utc_date'])
in_date_window = utc_dates.between(start_date, end_date)  # inclusive on both ends
dwell_long_enough = df['dwell'] >= dwell_time_threshold

# .copy() makes filtered_df an independent frame rather than a slice of df.
filtered_df = df[in_date_window & dwell_long_enough].copy()
df.head()

Unnamed: 0,persistentid,venueid,utc_date,utc_hour,local_date,local_hour,gender,age,full_panel_reweighted_sag_score,dwell,home_zip
0,5903d26cdcecbd13590c8fe594de785f19b16004e19156...,40b52f80f964a52051001fe3,2020-07-18,22,2020-07-18,17,Female,Age_45_to_49,159.851478,60,77060.0
1,cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...,40b52f80f964a52051001fe3,2020-07-18,22,2020-07-18,17,Female,Age_25_to_29,223.827293,1011,75024.0
2,21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...,40b52f80f964a52051001fe3,2020-07-25,0,2020-07-24,19,Male,Age_35_to_39,110.797268,1058,
3,e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...,40b52f80f964a52051001fe3,2020-07-25,18,2020-07-25,13,,,0.0,30,
4,63164c43b459b4260a338979948ead113855ddaafb9d52...,40b52f80f964a52051001fe3,2020-07-29,18,2020-07-29,13,Male,Age_35_to_39,137.821169,763,78660.0


In [31]:
# Columns to keep from the filtered records; every other column is dropped.
potential_features = [
    'persistentid',
    'gender',
    'age',
    'full_panel_reweighted_sag_score',
]

In [32]:
# Keep only the columns named in potential_features; discard the rest.
unwanted_columns = filtered_df.columns.difference(potential_features, sort=False)
filtered_df = filtered_df.drop(columns=unwanted_columns)
filtered_df

Unnamed: 0,persistentid,gender,age,full_panel_reweighted_sag_score
24,86be5ba82f0648a3a930d8370ea61c36c43a0382aabf44...,Female,Age_25_to_29,253.239992
122,2adee8895bc53ddebe8fb9c8a532f8bb5105a0d5bdcfd7...,Male,Age_20_to_24,293.604051
135,2a0d818920e252f378d67ce757969d5e1cf669be8e98c0...,Male,Age_60_to_64,247.027412
136,60fb595174175d3c3f2171c9132c622ee08a51b07aa826...,Male,Age_30_to_34,199.120998
202,c8c1994834754ce21ec56e2361214689494b262e2ea6a8...,Male,Age_40_to_44,91.138517
...,...,...,...,...
608044,218bbd1e2fc412f64c8b6bdd425f83e6b6ec79d4f6f66e...,Female,Age_45_to_49,152.666801
608049,d74c211edc4e198abd8478b108560414bd2b553fd99953...,Male,Age_20_to_24,277.014988
608053,6a49f7570eb7d9e96be0409b7a580d65cfdb05f9f9fa79...,Female,Age_25_to_29,228.361197
608074,20f5a15a478d6c98c33036ba5d4715396a14a27bdf1da2...,Female,Age_45_to_49,153.098798


In [33]:
# Replace missing values column by column: gaps in the categorical columns
# become the literal 'Missing' category, a missing score becomes 0.
fill_values = {
    'gender': 'Missing',
    'age': 'Missing',
    'full_panel_reweighted_sag_score': 0,
}
for column_name, placeholder in fill_values.items():
    filtered_df[column_name] = filtered_df[column_name].fillna(placeholder)

In [34]:
# Per-device lookup tables keyed by persistentid; they start out empty.
id_to_gender, id_to_age, id_to_scores = {}, {}, {}
# Distinct persistentid values present in the filtered data.
devices_set = {device_id for device_id in filtered_df['persistentid']}

In [35]:
# Fill the per-device lookups in a single pass over the data rather than
# re-filtering the whole frame once per device (quadratic in the row count).

# The first row seen for each device supplies its age and gender.
first_rows = filtered_df.drop_duplicates(subset='persistentid', keep='first')
id_to_gender.update(zip(first_rows['persistentid'], first_rows['gender']))
id_to_age.update(zip(first_rows['persistentid'], first_rows['age']))

# The score is the mean over all of a device's rows.
mean_scores = (
    filtered_df
    .groupby('persistentid', sort=False)['full_panel_reweighted_sag_score']
    .mean()
)
id_to_scores.update(zip(mean_scores.index, mean_scores.to_numpy()))
    
    

In [36]:
# Inspect the persistentid -> gender mapping (echoes the entire dict).
id_to_gender

{'192c19bc54b72be37b29941441d85fca5746dd79666ce893fe6e3224722038fb': 'Male',
 'b70dee51a47d148719db0e4fd19c00b3d97c0bf429745c0803f8416acb3cb164': 'Male',
 '1067bdbcb37b024af421d62bae2e01693d9097c38a79f8df573fefc38b8643d1': 'Missing',
 '171243af111c3fc9d9f7d2d3611eab431ca63fda4b0b3d83d26b6c57954c6ddc': 'Male',
 'b52a3435790a2d2f35875b54919fa21d84f66f7a8896b641ff2a5ef6df6edd60': 'Male',
 '0689c098d155a69e1885c53b16b15fc34cb5cfd2502ab7ce4321bb65ad57d7d1': 'Male',
 'ef42e0ea98b6b78664f7975a2cecd94a33ccc87fc4f633bcc1037c1fa5ba6380': 'Missing',
 'f8617b7065ff0c8b233c9babd17fdf0f8551762b026fbd99236554ff314df3c4': 'Female',
 '895cb33990e4dce6d8eddb36670576d34314ce4459892d0466811b7fa8124254': 'Female',
 '6fc81c705cc211d70130525a25d9c72e0f776a02b32ca704c6af599b2a55c006': 'Male',
 '6395421fc5741ac19ab2c7d1cd2a32ececa020cf59f67fe5e1a2d61fe39c8030': 'Male',
 'a56cb5f0dd7c079aee281a53056de3e9494b26d2a1c688a025ad5fe254afbfd2': 'Missing',
 'cec7880a5dfcbd4dcda1d05bcaa93ce3b2d60d90416bdeeef299243fb058b

In [37]:
def create_one_hot_encoding_df(id_to_feature: dict, feature: str):
    """One-hot encode a single per-id feature.

    Args:
        id_to_feature: Mapping from an identifier to that identifier's
            feature value.
        feature: Name given to the intermediate column and used as the
            prefix of the generated indicator columns.

    Returns:
        A DataFrame indexed by the keys of ``id_to_feature`` whose columns
        are the indicator columns produced by ``pd.get_dummies``, named
        ``<feature>_<value>``.
    """
    # orient='index' turns each dict key into a row label.
    return pd.get_dummies(
        pd.DataFrame.from_dict(id_to_feature, orient='index', columns=[feature]),
        prefix=feature,
    )

In [38]:
# One-hot encode the two categorical lookups; rows are indexed by device id.
gender_df = create_one_hot_encoding_df(id_to_feature=id_to_gender, feature='gender')
age_df = create_one_hot_encoding_df(id_to_feature=id_to_age, feature='age')

In [39]:
# Assemble one feature record per device: one-hot age, one-hot gender, score.
feature_map = {}
for device in devices_set:
    # Look each device up by label in the frame's own index, so the age row
    # never depends on gender_df's row order, and no full-length boolean mask
    # is built per device.
    feature_map[device] = {
        'age': age_df.loc[device].values,
        'gender': gender_df.loc[device].values,
        'score': id_to_scores[device],
    }



In [41]:
# Spot-check the assembled record for one device id.
# NOTE(review): the id is hardcoded; the lookup raises KeyError if that device
# is not a key of feature_map (e.g. after changing the filters).
feature_map['9f7432ad73deb4ab9f30b04222922e25e928429576e33763eb2c4027ee14c0fa']

{'age': array([False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False, False]),
 'gender': array([ True, False, False]),
 'score': 108.42639941432357}

In [42]:
# Persist the per-device feature records to disk.
with open('node_information_data.pkl', 'wb') as fp:
    pickle.dump(feature_map, fp)
# Report success only once the with-block has closed (and flushed) the file.
print('dictionary saved successfully to file')

dictionary saved successfully to file
