In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec
from tqdm import tqdm

## Load in the data

In [23]:
# # Import the data
# df = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')
# # filter out certain columns

# dwell_time_threshold = 60 # in minutes
# start_date = pd.to_datetime('2020-07-01')
# end_date = pd.to_datetime('2020-07-06')

# filtered_df = df[(pd.to_datetime(df['utc_date']) >= start_date) & (pd.to_datetime(df['utc_date']) <= end_date) & (df['dwell'] >= dwell_time_threshold)]
# df.head()

# Import the data
df1 = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')
df2 = pd.read_csv('../../RVF_ATX_PID_HZ-2020-08.tsv', sep='\t')
merged_df = pd.concat([df1, df2])

# dwell_time_threshold = 60 # in minutes

# filtered_df = merged_df[(merged_df['dwell'] >= dwell_time_threshold)]
filtered_df = merged_df
print(filtered_df.head())
print(filtered_df.shape)

                                        persistentid  \
0  5903d26cdcecbd13590c8fe594de785f19b16004e19156...   
1  cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...   
2  21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...   
3  e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...   
4  63164c43b459b4260a338979948ead113855ddaafb9d52...   

                    venueid    utc_date  utc_hour  local_date  local_hour  \
0  40b52f80f964a52051001fe3  2020-07-18        22  2020-07-18          17   
1  40b52f80f964a52051001fe3  2020-07-18        22  2020-07-18          17   
2  40b52f80f964a52051001fe3  2020-07-25         0  2020-07-24          19   
3  40b52f80f964a52051001fe3  2020-07-25        18  2020-07-25          13   
4  40b52f80f964a52051001fe3  2020-07-29        18  2020-07-29          13   

   gender           age  full_panel_reweighted_sag_score  dwell  home_zip  
0  Female  Age_45_to_49                       159.851478     60   77060.0  
1  Female  Age_25_to_29                       22

In [24]:
potential_features = ['persistentid', 'gender', 'age', 'full_panel_reweighted_sag_score']

In [25]:
# drop all columns except those in potential_features
filtered_df = filtered_df.drop(columns=[col for col in filtered_df.columns if col not in potential_features])
filtered_df

Unnamed: 0,persistentid,gender,age,full_panel_reweighted_sag_score
0,5903d26cdcecbd13590c8fe594de785f19b16004e19156...,Female,Age_45_to_49,159.851478
1,cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...,Female,Age_25_to_29,223.827293
2,21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...,Male,Age_35_to_39,110.797268
3,e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...,,,0.000000
4,63164c43b459b4260a338979948ead113855ddaafb9d52...,Male,Age_35_to_39,137.821169
...,...,...,...,...
905420,6fedb72f9e8e492b27771206e8e491a196a63b98947c34...,Male,Age_20_to_24,212.188912
905421,cba1cbf3144811a523adee9dd865838fd6ce8c833d9816...,Female,Age_30_to_34,111.479760
905422,3d6d8566bdfe2a3b2f4a6d17cb6508049e7fddc1376db6...,Female,Age_30_to_34,103.120154
905423,da25791e5c2de9b67100c7ade48709d7f0e274d0f3df36...,Male,Age_35_to_39,63.827754


In [26]:
filtered_df['gender'] = filtered_df['gender'].fillna('Missing')
filtered_df['age'] = filtered_df['age'].fillna('Missing')
filtered_df['full_panel_reweighted_sag_score'] = filtered_df['full_panel_reweighted_sag_score'].fillna(0)

In [27]:
id_to_gender = {}
id_to_age = {}
id_to_scores = {}
devices_set = set(filtered_df['persistentid'])

In [28]:
'004f1ba8a43c7b1196efcaa6267be599cc2713a784e2335cb10a78ac4ab7b1c4' in devices_set

True

In [29]:
for device in tqdm(devices_set):
    device_df = filtered_df[filtered_df['persistentid'] == device]
    age = device_df['age'].iloc[0]
    gender = device_df['gender'].iloc[0]
    scores = device_df['full_panel_reweighted_sag_score'].mean()
    id_to_gender[device] = gender
    id_to_age[device] = age
    id_to_scores[device] = scores
    

100%|██████████| 62369/62369 [1:48:12<00:00,  9.61it/s]  


In [30]:
id_to_gender['004f1ba8a43c7b1196efcaa6267be599cc2713a784e2335cb10a78ac4ab7b1c4']

'Male'

In [31]:
def create_one_hot_encoding_df(id_to_feature: dict, feature: str):
    feature_df = pd.DataFrame.from_dict(id_to_feature, orient='index', columns=[feature])
    feature_df = pd.get_dummies(feature_df, prefix=feature)
    return feature_df

In [32]:
gender_df = create_one_hot_encoding_df(id_to_gender, 'gender')
age_df = create_one_hot_encoding_df(id_to_age, 'age')

In [33]:
feature_map = {}
for device in tqdm(devices_set):
    gender = gender_df[gender_df.index == device].iloc[0].values
    age = age_df[gender_df.index == device].iloc[0].values
    # print(gender)
    feature_map[device] = {}
    feature_map[device]['age'] = age
    feature_map[device]['gender'] = gender
    feature_map[device]['score'] = id_to_scores[device]



100%|██████████| 62369/62369 [18:49<00:00, 55.22it/s]


In [34]:
feature_map['9f7432ad73deb4ab9f30b04222922e25e928429576e33763eb2c4027ee14c0fa']

{'age': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),
 'gender': array([1, 0, 0], dtype=uint8),
 'score': 103.13220621427679}

In [35]:
import pickle
with open('node_information_data.pkl', 'wb') as fp:
    pickle.dump(feature_map, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file
