In [1]:
import os 
import pandas as pd 
import torch 
import torch.nn as nn 
import numpy as np 
import networkx as nx 
from torch_geometric.data import InMemoryDataset, Data
# from prep_mhealth import prep_mhealth
# from prep_wisdm import prep_wisdm
from torch.nn import Linear 
import torch.optim as optim 
from torch_geometric.nn import GCNConv
import time
import tqdm 
import random
import copy
from torch_geometric.data import DataLoader
# from model_utils import * 
import datetime as dttm 
import argparse
from mlflow import log_metric, log_param, log_artifacts
import mlflow 

In [2]:
import sys

# Resolve the project root in a way that survives re-running this cell: on a
# fresh kernel the notebook starts one level below the root (next to
# `scripts/`), while after a previous run the working directory already is
# the root.
_cwd = os.getcwd()
_project_root = _cwd if os.path.isdir(os.path.join(_cwd, 'scripts')) else os.path.dirname(_cwd)
_scripts_dir = os.path.join(_project_root, 'scripts')

# Make the helper modules importable without depending on the working directory.
if _scripts_dir not in sys.path:
    sys.path.insert(0, _scripts_dir)

from prep_mhealth import prep_mhealth
from prep_wisdm import prep_wisdm
from model_utils import *

# Later cells use paths relative to the project root (e.g. the WISDM data folder).
os.chdir(_project_root)

In [4]:
# Show the current working directory.
os.getcwd()

'c:\\Users\\abhi\\Documents\\GEEK\\GNN\\ours'

In [3]:
import pandas as pd 
import numpy as np 
import os 
import networkx as nx 
import matplotlib.colors as mcolors
import random 
import scipy.spatial as sp 
import tqdm 

# datadir = 'data/WISDM_ar_v1.1'

# Label encoding used for the activity column: activity name -> integer code.
activity_map = {
    'Walking': 1,
    'Jogging': 2,
    'Upstairs': 3,
    'Downstairs': 4,
    'Sitting': 5,
    'Standing': 6,
}
  

def add_encoded_activity(filename, datadir, sep = "\t"):
    """Load one raw user file and attach the integer activity code.

    The file ``datadir/filename`` is read with ``pd.read_csv`` and its six
    columns are renamed to ``user_id``, ``activity``, ``timestamp`` and
    ``feature_1`` .. ``feature_3``. The ``activity`` values are then mapped
    through the module-level ``activity_map``.

    Returns a frame with the columns ``user_id``, ``encoded_activity``,
    ``feature_1``, ``feature_2`` and ``feature_3``, in that order.
    """
    feature_names = ['feature_{}'.format(k) for k in range(1, 4)]

    raw = pd.read_csv(os.path.join(datadir, filename), sep = sep)
    raw.columns = ['user_id', 'activity', 'timestamp'] + feature_names

    # Activities missing from activity_map come out as NaN here.
    raw['encoded_activity'] = raw['activity'].map(activity_map)

    return raw[['user_id', 'encoded_activity'] + feature_names]

def average_slice(df_, NUM_SAMPLE = 128):
    """Collapse consecutive rows of ``df_`` into per-slice averages.

    The frame is cut into consecutive slices of ``NUM_SAMPLE`` rows (the last
    slice may be shorter). For every slice the last three columns are
    replaced by their mean, while the remaining leading columns take the
    values found in the slice's first row.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Rows to aggregate. Must carry an ``encoded_activity`` column among
        its leading (all but the last three) columns.
    NUM_SAMPLE : int, default 128
        Number of rows per slice; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per slice, indexed ``0..n-1``, with the three averaged
        columns first followed by the leading columns. ``encoded_activity``
        is cast to ``int``. An empty input yields an empty frame with the
        same column layout.

    Raises
    ------
    ValueError
        If ``NUM_SAMPLE`` is not positive.

    Notes
    -----
    Slices are cut purely by row position, so a slice that straddles an
    activity change is labelled with the activity of its first row.
    """
    if NUM_SAMPLE <= 0:
        raise ValueError('NUM_SAMPLE must be a positive integer')

    # pd.concat refuses an empty list, so handle empty input up front.
    if df_.shape[0] == 0:
        empty = pd.concat([df_.iloc[:0, -3:], df_.iloc[:0, :-3]], axis = 1)
        return empty.reset_index(drop = True)

    slices = []
    for start in range(0, df_.shape[0], NUM_SAMPLE):
        # iloc clamps the upper bound, so the final slice is simply shorter.
        window = df_.iloc[start:start + NUM_SAMPLE, :]
        averaged = pd.DataFrame(window.iloc[:, -3:].apply(np.mean)).T
        first_row = window.iloc[:1, :-3].reset_index(drop = True)
        slices.append(pd.concat([averaged, first_row], axis = 1))

    out = pd.concat(slices)
    out['encoded_activity'] = out['encoded_activity'].apply(int)
    out.index = range(out.shape[0])
    return out

def prepare_graph(user_data, THRESHOLD = 3):
    """Build a proximity graph over the rows of ``user_data``.

    Every row becomes a node keyed by its 0-based row position; the node's
    ``features`` attribute holds the row's first three values. Two nodes are
    joined by an edge, weighted with their Euclidean distance in those three
    columns, whenever that distance is at most ``THRESHOLD``.

    Parameters
    ----------
    user_data : pandas.DataFrame
        One row per node; the first three columns are used as coordinates.
    THRESHOLD : float, default 3
        Largest distance for which two rows are connected.

    Returns
    -------
    networkx.Graph
        Undirected graph whose node ids and edge endpoints share the same
        0-based numbering.
    """
    points = user_data.iloc[:, :3].values
    dist_mat = sp.distance_matrix(points, points)

    G = nx.Graph()
    # Node ids are row positions so they line up with the distance-matrix
    # indices used for the edges below.
    for node_id, (_, row) in enumerate(user_data.iterrows()):
        G.add_node(node_id, features = row.iloc[:3])

    # Only look at the strict upper triangle: this skips self-distances and
    # visits each unordered pair exactly once.
    for node_id in range(dist_mat.shape[0]):
        later = dist_mat[node_id, node_id + 1:]
        for offset in np.flatnonzero(later <= THRESHOLD):
            neighbor = node_id + 1 + int(offset)
            G.add_edge(node_id, neighbor, weight = float(later[offset]))

    return G

def write_node_attributes(G, dir): 
    """Write every node's feature vector to ``node_attributes.txt`` in ``dir``.

    One line is written per node, in the order the graph reports its nodes,
    holding that node's ``features`` values joined by commas. Nodes that
    carry no ``features`` attribute are skipped.

    Parameters
    ----------
    G : graph
        Object exposing ``G.nodes.data()`` as an iterable of
        ``(node, attribute_dict)`` pairs.
    dir : str
        Existing directory that receives the file.
    """
    target = os.path.join(dir, 'node_attributes.txt')
    # The context manager closes the file; no explicit close is needed.
    with open(target, 'w') as f:
        for _node, attrs in G.nodes.data():
            features = attrs.get('features')
            if features is None:
                continue
            # Values are numeric, so they must be stringified before writing.
            f.write(','.join(str(value) for value in features))
            f.write('\n')
     
def write_graph(G, dir): 
    """
    Dump the edges of graph G to ``edge_list.txt`` inside directory dir.

    Each edge is written twice, once per direction, as comma-separated
    endpoints on their own line; edge attributes are not written.
    """
    target = os.path.join(dir, 'edge_list.txt')
    with open(target, 'w') as handle:
        for edge in nx.generate_edgelist(G, delimiter = ',', data = False):
            mirrored = ','.join(reversed(edge.split(',')))
            handle.write(edge + '\n' + mirrored + '\n')

def prep_wisdm(num_sample, dist_thresh, train_prop, seed = None): 
    """Turn every raw WISDM user file into per-user graph files on disk.

    For each file in ``data/WISDM`` the rows are averaged into slices, a
    proximity graph is built over the slices, and four files are written to
    ``data/processed/wisdm/<user>/``: ``node_attributes.txt``,
    ``node_labels.txt``, ``train_mask.txt`` and ``edge_list.txt``.

    Parameters
    ----------
    num_sample : int
        Rows per averaged slice (forwarded to ``average_slice``).
    dist_thresh : float
        Distance threshold for connecting slices (forwarded to
        ``prepare_graph``).
    train_prop : float
        Probability with which each slice is marked as a training node.
    seed : int, optional
        Seed for the train-mask draw. When omitted the global NumPy random
        state is used, exactly as before.
    """
    print('Preparing Data. ')
    # Build paths with os.path.join: the previous backslash literals relied on
    # invalid escape sequences and only resolved on Windows.
    DATADIR = os.path.join('data', 'WISDM')
    OUTDIR = os.path.join('data', 'processed', 'wisdm')
    excluded = ['wisdm_subject' + str(i) for i in [4, 7, 16, 20, 33, 35]]
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Sorted so that a given seed always pairs the same draws with the same user.
    for each_file in tqdm.tqdm(sorted(os.listdir(DATADIR))):
        # NOTE(review): this compares whole file names against extension-less
        # names; if the data files carry an extension nothing is ever
        # excluded. Confirm the intended file naming before relying on it.
        if each_file in excluded:
            continue

        user = each_file.split('_')[1].split('.')[0][7:]
        user_raw = add_encoded_activity(each_file, DATADIR, sep = ',')
        user_slices = average_slice(user_raw, num_sample)
        graph = prepare_graph(user_slices, dist_thresh)

        # Creates missing parent directories too and tolerates re-runs.
        user_dir = os.path.join(OUTDIR, user)
        os.makedirs(user_dir, exist_ok = True)

        user_slices.iloc[:, :3].to_csv(os.path.join(user_dir, 'node_attributes.txt'),
                                       header = False, index = False)
        user_slices['encoded_activity'].to_csv(os.path.join(user_dir, 'node_labels.txt'),
                                               header = False, index = False)

        # A slice is a training node when its uniform draw lands in the top
        # `train_prop` share of [0, 1).
        train_mask = pd.DataFrame(rng.uniform(0, 1, user_slices.shape[0]) >= 1 - train_prop,
                                  columns = ['train_mask'])
        train_mask.to_csv(os.path.join(user_dir, 'train_mask.txt'),
                          header = False, index = False)

        write_graph(graph, user_dir)
    print('Data preparation finished. ')

# Rows per averaged time slice (passed to average_slice as NUM_SAMPLE).
num_sample = 128
# Presumably the distance threshold meant for prepare_graph — TODO confirm where it is consumed.
dist_thresh = 1

In [6]:
# Load every user file and reduce it to per-slice averages.
# os.path.join keeps the path portable; the backslash literal used before
# contained an invalid escape sequence and only resolved on Windows.
DATADIR = os.path.join('data', 'WISDM')
excluded_files = ['wisdm_subject' + str(i) for i in [4, 7, 16, 20, 33, 35]]

out = []
# Sorted so the row order of the combined frame does not depend on the OS.
for each_file in tqdm.tqdm(sorted(os.listdir(DATADIR))):
    # NOTE(review): this compares whole file names against extension-less
    # names; if the data files carry an extension nothing is ever excluded.
    if each_file in excluded_files:
        continue
    user_raw = add_encoded_activity(each_file, DATADIR, sep = ',')
    out.append(average_slice(user_raw, num_sample))

100%|██████████| 36/36 [00:30<00:00,  1.17it/s]


In [8]:
# Stack the per-user slice averages into one frame. Each user's own 0-based
# index is kept, so index labels repeat across users.
finaldf = pd.concat(out)

In [10]:
# Preview the first 100 rows of the combined frame.
finaldf.head(100)

Unnamed: 0,feature_1,feature_2,feature_3,user_id,encoded_activity
0,3.801953,9.931250,-0.510234,1,1
1,3.859609,9.889609,-0.556406,1,1
2,3.559687,9.974844,-0.704531,1,1
3,3.454062,10.361719,-0.654922,1,1
4,3.275781,10.053750,-0.509687,1,1
...,...,...,...,...,...
95,-7.918516,6.368281,-0.281797,1,1
96,-7.921172,6.595781,-0.326094,1,1
97,-8.019844,6.490156,-0.273047,1,1
98,-8.270703,6.349062,0.000391,1,1


In [11]:
# (rows, columns) of the combined frame.
finaldf.shape

(8596, 5)

In [13]:
# Keep the slices whose encoded activity is 1, the code activity_map assigns to 'Walking'.
walking = finaldf[finaldf['encoded_activity'] == 1]

In [14]:
# Narrow the walking slices down to the rows of user 1.
walking_user1 = walking[walking['user_id'] == 1]

In [15]:
# Display the walking slices of user 1.
walking_user1

Unnamed: 0,feature_1,feature_2,feature_3,user_id,encoded_activity
0,3.801953,9.931250,-0.510234,1,1
1,3.859609,9.889609,-0.556406,1,1
2,3.559687,9.974844,-0.704531,1,1
3,3.454062,10.361719,-0.654922,1,1
4,3.275781,10.053750,-0.509687,1,1
...,...,...,...,...,...
135,-0.337500,10.204375,-0.148750,1,1
136,0.996094,10.686172,-0.263984,1,1
137,1.433437,10.531562,-0.298906,1,1
138,1.907422,10.755781,-0.060781,1,1
