In [1]:
import pandas as pd
import os, re
import glob
import zipfile
import skmob
from skmob.measures.individual import jump_lengths
from skmob.measures.individual import radius_of_gyration
from tqdm.notebook import tqdm
import numpy as np



Change `sample_data_path` below to the directory that contains the sample `.zip` archives.

In [2]:
# Directory that is searched (non-recursively) for the sample *.zip archives.
sample_data_path = '/work/rwuirlab/Samples_MSAs'
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# processing order (and therefore the row order of the appended CSVs) reproducible.
zip_files = sorted(glob.glob(os.path.join(sample_data_path, '*.zip')))

You may not need to run the following cells if the processed data is already available.

In [3]:
def read_concat_csv_from_zip(zip_file_path,n_csvs=2000):
    """Load up to ``n_csvs`` CSV members of a ZIP archive into one DataFrame.

    Every CSV is read without a header row. The columns at positions
    1, 3, 4, 5 and 10 are kept and renamed to ``user_id``, ``lat``, ``lon``,
    ``acc`` and ``time``. ``time`` is parsed with ``pd.to_datetime`` and only
    rows whose time lies between 2020-01-01 00:00:00 and 2020-06-30 23:59:59
    (both inclusive) are retained.

    Parameters
    ----------
    zip_file_path : str or path-like
        Location of the ZIP archive.
    n_csvs : int, default 2000
        Maximum number of ``.csv`` members to read, taken in archive order.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of all CSVs stacked vertically with a fresh
        0..n-1 index. When the archive yields no CSV at all, an empty frame
        with the same five columns (``time`` typed as datetime) is returned
        instead of letting ``pd.concat`` raise on an empty list.
    """
    source_positions = [1,3,4,5,10]
    column_names = ['user_id','lat','lon','acc','time']
    # The study window is the same for every file, so build it once.
    start_date = pd.Timestamp('2020-01-01 00:00:00')
    end_date = pd.Timestamp('2020-06-30 23:59:59')

    dfs = []
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        csv_files = [f for f in z.namelist() if f.endswith('.csv')][:n_csvs]
        for csv_file in csv_files:
            with z.open(csv_file) as f:
                raw = pd.read_csv(f,header=None)
            # Explicit copy: the frame is modified below and must not be
            # treated as a view of ``raw``.
            df = raw[source_positions].copy()
            df.columns = column_names
            df['time'] = pd.to_datetime(df['time'])
            # ``between`` is inclusive on both ends by default.
            in_window = df['time'].between(start_date, end_date)
            dfs.append(df[in_window])

    if not dfs:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected schema so callers can test ``.empty``.
        return pd.DataFrame(columns=column_names).astype({'time': 'datetime64[ns]'})

    # Stack row-wise and renumber the index.
    return pd.concat(dfs, axis=0, ignore_index=True)

In [4]:
def get_daily_metrics(df):
    """Compute per-user, per-day record counts and half-hour coverage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column and a datetime-typed ``time``
        column. The frame is NOT modified (no helper columns are added to it).

    Returns
    -------
    pandas.DataFrame
        One row per (``user_id``, ``date``) with the columns
        ``user_id``, ``date`` (``datetime.date`` objects),
        ``num_of_records`` (number of rows that day) and
        ``intra_day_temporal_occupancy`` (number of distinct half-hour
        slots, out of 48, that contain at least one record).
    """
    timestamps = df['time']
    # Build the helper columns on a separate frame so the caller's DataFrame
    # is left untouched.
    keyed = df[['user_id']].assign(
        date=timestamps.dt.date,
        # Slot 0 is 00:00-00:29, slot 47 is 23:30-23:59.
        half_hour_index=timestamps.dt.hour * 2 + timestamps.dt.minute // 30,
    )
    # One pass over the groups: ``size`` counts the records, ``nunique``
    # counts the distinct half-hour slots that were hit.
    merge_df = (
        keyed.groupby(['user_id','date'])['half_hour_index']
        .agg(['size','nunique'])
        .reset_index()
    )
    merge_df.columns = ['user_id','date','num_of_records','intra_day_temporal_occupancy']
    return merge_df

In [5]:
def get_longterm_metrics(df):
    """Compute one row of whole-period mobility metrics per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id``, ``lat``, ``lon``, ``acc`` and ``time``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``acc_rate`` (share of the user's records with
        ``acc`` below 100), ``radius_of_gyration`` and
        ``euclidean_distance_mean`` (mean of the user's jump lengths, NaN
        when the user has none). Only users present in all three
        intermediate tables are kept (inner joins on ``user_id``).
    """
    def _share_below_100(acc_values):
        # Mean of a boolean mask is the fraction of True entries.
        return (acc_values < 100).mean()

    def _mean_or_nan(lengths):
        # A user with no jumps gets NaN.
        if len(lengths) > 0:
            return np.mean(lengths)
        return np.nan

    # Share of high-accuracy records per user.
    high_acc_df = df.groupby('user_id')['acc'].apply(_share_below_100).reset_index()
    high_acc_df.columns = ['user_id','acc_rate']

    # scikit-mobility works on its own trajectory frame type.
    tdf = skmob.TrajDataFrame(
        df,
        latitude='lat',
        longitude='lon',
        datetime='time',
        user_id='user_id',
    )

    # Radius of gyration per user.
    # NOTE(review): the positional False is presumably show_progress — confirm.
    radius_of_gyration_df = radius_of_gyration(tdf,False)
    radius_of_gyration_df.columns = ['user_id','radius_of_gyration']

    # Per-user jump lengths, collapsed to their mean.
    distance_mean_df = jump_lengths(tdf,False)
    distance_mean_df['jump_lengths'] = distance_mean_df['jump_lengths'].apply(_mean_or_nan)
    distance_mean_df.columns = ['user_id','euclidean_distance_mean']

    # Combine the three per-user tables.
    merge_df = high_acc_df.merge(radius_of_gyration_df, on='user_id')
    merge_df = merge_df.merge(distance_mean_df, on='user_id')
    return merge_df

In [6]:
def write_to_csv(dataframe, output_path, mode='a', header=False):
    """Write ``dataframe`` to ``output_path`` as CSV without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to write.
    output_path : str or path-like
        Destination file. Missing parent directories are created.
    mode : str, default 'a'
        File mode passed to ``DataFrame.to_csv`` ('a' appends, 'w' overwrites).
    header : bool, default False
        Whether to write the column names. Forced to True when the file does
        not exist yet, exists but is empty, or ``mode`` is 'w', so that the
        file always starts with exactly one header row.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        # to_csv does not create directories; do it here so the first write
        # into e.g. 'Metrics/...' does not fail.
        os.makedirs(parent_dir, exist_ok=True)

    # A zero-byte file has no header yet, so treat it like a missing file.
    file_has_no_content = (
        not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    )
    if file_has_no_content or mode == 'w':
        header = True
    dataframe.to_csv(output_path, mode=mode, header=header, index=False)

In [7]:
# Output files; the metrics of every processed archive are appended to them.
daily_metrics_path = 'Metrics/daily_metrics.csv'
longterm_metrics_path = 'Metrics/longterm_metrics.csv'

# Set to True to delete the output of a previous run before processing.
# With the default (False) new rows are appended to whatever already exists,
# so re-running this cell duplicates rows.
reset_outputs = False

for metrics_path in (daily_metrics_path, longterm_metrics_path):
    # Make sure the output directory exists before the first write.
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    if reset_outputs and os.path.exists(metrics_path):
        os.remove(metrics_path)

for zip_file in tqdm(zip_files):
    # Archives whose path contains this marker are not processed here.
    if 'Sample_MSAs_Ind' in zip_file:
        continue
    # The archive's file name without the '.zip' suffix labels its rows.
    # Using os.path keeps this independent of the path separator and cannot
    # fail the way a non-matching regex would.
    MSA = os.path.splitext(os.path.basename(zip_file))[0]

    df = read_concat_csv_from_zip(zip_file,2000)
    if df.empty:
        # No records in the study window: nothing to compute for this archive.
        continue

    daily_metrics_df = get_daily_metrics(df)
    daily_metrics_df['MSA'] = MSA
    write_to_csv(daily_metrics_df,daily_metrics_path)

    longterm_metrics_df = get_longterm_metrics(df)
    longterm_metrics_df['MSA'] = MSA
    write_to_csv(longterm_metrics_df,longterm_metrics_path)

  0%|          | 0/12 [00:00<?, ?it/s]