In [1]:
# Take a CSV file of signal measurements and sample data points from it however we need.
# Sampling could be done in increments of fixed time intervals,
# or over a window of past time (e.g. "the last N seconds").
# Likewise, take an activity dataset and sample from it however we need.

# Load Signal and Activity Dataset

In [2]:
import pandas as pd

# Suffix interpolated into the dataset file names below (and into the merged output file name).
BASE_NUM = "21"
# Paths of the signal and activity files read by the loaders in this cell.
SIGNAL_PATH = f"./datasets/processed_sig{BASE_NUM}.txt"
ACTIVITY_PATH = f"./datasets/processed_act{BASE_NUM}.txt"

def load_signal_csv(csv_path):
    """Read the signal CSV at ``csv_path`` and discard the columns with invalid data.

    Returns a DataFrame holding every column of the file except ``rssnr``,
    ``cqi`` and ``ta``. Columns from that list that are absent from the file
    are simply ignored.
    """
    invalid_columns = ['rssnr', 'cqi', 'ta']
    raw_df = pd.read_csv(csv_path)
    # Boolean mask over the columns: True for every column we want to keep.
    keep_mask = [name not in invalid_columns for name in raw_df.columns]
    return raw_df.loc[:, keep_mask]

# Load the signal measurements from the configured path.
signal_df = load_signal_csv(csv_path=SIGNAL_PATH)


def load_activity_csv(path):
    """Read the activity CSV at ``path`` and keep the rows whose status is not "EXIT".

    The file carries time, activity and status (enter/exit) columns. The
    surviving rows keep their original index labels.
    """
    events_df = pd.read_csv(path)
    # EXIT rows are dropped because only the current (entered) state matters.
    not_exit = events_df['status'].ne("EXIT")
    return events_df.loc[not_exit]

# Load the activity transitions from the configured path.
activity_df = load_activity_csv(path=ACTIVITY_PATH)

In [3]:
# Preview the first five signal rows.
signal_df.head(n=5)


Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq
0,12-03 19:48:28.423,YES,12319826322802,116,16185,675,310260,22,-98,-9
1,12-03 19:48:28.423,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18
2,12-03 19:48:28.650,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18
3,12-03 19:48:28.681,YES,12319826322802,116,16185,675,310260,22,-98,-9
4,12-03 19:48:28.681,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18


In [4]:
# Preview the first five activity rows.
activity_df.head(n=5)

Unnamed: 0,time,activity,status
0,12-03 19:48:33.262,STILL,ENTER
2,12-03 19:52:59.581,WALKING,ENTER
4,12-03 19:53:39.655,STILL,ENTER
6,12-03 19:54:05.397,WALKING,ENTER
8,12-03 20:01:22.471,STILL,ENTER


# Merge Datasets

In [5]:
def search_activity_status_at_time(query_time, activity_df):
    """Return the activity that was in effect at ``query_time``.

    The rows of ``activity_df`` are considered in their existing order
    (presumably sorted by ``time`` -- TODO confirm against the data producer).
    The result is the ``activity`` of the row immediately before the first row
    whose ``time`` is strictly greater than ``query_time``.

    Parameters
    ----------
    query_time
        Value compared against the ``time`` column with ``>``; it must be
        comparable with the column's values.
    activity_df : pandas.DataFrame
        Frame with ``time`` and ``activity`` columns.

    Returns
    -------
    The matching ``activity`` value, or the string ``"NOVALUE"`` when
    ``query_time`` lies before the first row, or when no row has a ``time``
    later than ``query_time`` (the query is past the end of the activity data,
    or the frame is empty).
    """
    no_value_string = "NOVALUE"

    # One vectorized comparison replaces the Python-level iterrows() scan;
    # this function runs once per unique signal timestamp, so the scan dominated.
    is_later = (activity_df['time'] > query_time).to_numpy(dtype=bool)
    if not is_later.any():
        # Query time is at or after the last activity record.
        return no_value_string

    # argmax on a boolean array gives the position of the first True.
    first_later_pos = int(is_later.argmax())
    if first_later_pos == 0:
        # Query time precedes every activity record.
        return no_value_string

    return activity_df['activity'].iloc[first_later_pos - 1]

# Example of searching for activity at specific time
# search_activity_status_at_time("11-06 16:50:26.900", activity_df)
# search_activity_status_at_time("11-07 16:50:26.900", activity_df)
def create_merged_dataset(signal_df, activity_df):
    """Return a copy of ``signal_df`` with an added ``activity`` column.

    For every row, ``activity`` is the value returned by
    ``search_activity_status_at_time`` for that row's ``time``. The input
    frames are not modified.

    Parameters
    ----------
    signal_df : pandas.DataFrame
        Frame with a ``time`` column.
    activity_df : pandas.DataFrame
        Activity frame passed through to ``search_activity_status_at_time``.

    Returns
    -------
    pandas.DataFrame
        ``signal_df`` plus the ``activity`` column, same index and row order.
    """
    merged_df = signal_df.copy()

    # Resolve each distinct timestamp once; many signal rows share a timestamp.
    activity_by_time = {
        time: search_activity_status_at_time(time, activity_df)
        for time in signal_df['time'].unique()
    }

    # A single vectorized assignment replaces the per-row `.loc[i, ...]` writes,
    # which were slow and mislabelled rows whenever index labels repeated.
    merged_df['activity'] = signal_df['time'].map(activity_by_time)
    return merged_df

In [None]:
# Attach an activity label to every signal row, then preview the result.
merged_df = create_merged_dataset(signal_df=signal_df, activity_df=activity_df)
merged_df.head(n=5)

# Drop Duplicates Using mTimeStamp and mPci

Now that the two datasets are merged, let's drop duplicate rows, identified by the combination of `mTimeStamp` and `mPci`.


In [None]:
# Keep the first row of every (mTimeStamp, mPci) pair and renumber the index.
length_before = merged_df.shape[0]
df_no_duplicates = (
    merged_df
    .drop_duplicates(subset=["mTimeStamp", "mPci"], keep="first")
    .reset_index(drop=True)
)
print(f"Trimmed from {length_before} to {len(df_no_duplicates)}")

# Mark Points with Handover 

In [None]:
def get_max_at_time(df, time):
    """Return the maximum ss/rsrp/rsrq among non-registered rows at ``time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``time``, ``mRegistered``, ``ss``, ``rsrp`` and ``rsrq`` columns.
    time
        Value matched exactly against the ``time`` column.

    Returns
    -------
    dict
        Keys ``max_ss``, ``max_rsrp`` and ``max_rsrq``. When no row at ``time``
        has ``mRegistered == "NO"``, the placeholders 0, -1000 and -1000 are
        returned instead.
    """
    # Single combined mask: rows at this timestamp that are not the registered cell.
    neighbor_rows = df[(df['time'] == time) & (df['mRegistered'] == "NO")]

    if neighbor_rows.empty:
        return {
            "max_ss": 0,
            "max_rsrp": -1000,
            "max_rsrq": -1000,
        }

    # Series.max() skips NaN. The builtin max() over the raw values returned
    # NaN or a number depending on where the NaN sat in the array.
    return {
        "max_ss": neighbor_rows['ss'].max(),
        "max_rsrp": neighbor_rows['rsrp'].max(),
        "max_rsrq": neighbor_rows['rsrq'].max(),
    }

# Keep only the rows flagged mRegistered == "YES" and renumber them from 0.
merged_registered_df = df_no_duplicates[df_no_duplicates['mRegistered'] == "YES"].reset_index(drop=True)
# nextAP holds the mPci of the following registered row (NaN for the last row).
merged_registered_df['nextAP'] = merged_registered_df['mPci'].shift(-1)
# Drop the last row, which has no successor. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following cells
# do not raise SettingWithCopyWarning or write into a slice of the old frame.
merged_registered_df = merged_registered_df.iloc[:-1].copy()
def label_handover_occured(row):
    """Return 0 when the row's ``mPci`` equals its ``nextAP``, otherwise 1."""
    return 0 if row['mPci'] == row['nextAP'] else 1
# Label each registered row: 1 when the next registered row reports a different mPci, else 0.
merged_registered_df['handoverOccured'] = merged_registered_df.apply(label_handover_occured, axis=1)

# Look up the non-registered maxima once per row and expand the resulting dicts
# into columns (previously get_max_at_time ran three times per row, once per column).
neighbor_max_columns = ['max_ss', 'max_rsrp', 'max_rsrq']
neighbor_max_df = pd.DataFrame(
    [get_max_at_time(df_no_duplicates, row_time) for row_time in merged_registered_df['time']],
    index=merged_registered_df.index,
    columns=neighbor_max_columns,
)
for column_name in neighbor_max_columns:
    merged_registered_df[column_name] = neighbor_max_df[column_name]

print(merged_registered_df['handoverOccured'].value_counts())
merged_registered_df.head()

In [None]:
# Secondary features: registered-row reading minus the maximum non-registered
# reading at the same time. Rows with no non-registered reading carry the
# placeholder maxima (0 / -1000), so their differences are offset accordingly.
merged_registered_df['diff_in_ss'] = merged_registered_df['ss'].sub(merged_registered_df['max_ss'])
merged_registered_df['diff_in_rsrp'] = merged_registered_df['rsrp'].sub(merged_registered_df['max_rsrp'])
merged_registered_df['diff_in_rsrq'] = merged_registered_df['rsrq'].sub(merged_registered_df['max_rsrq'])


In [None]:
# Show only the rows labelled as handovers.
merged_registered_df.loc[merged_registered_df['handoverOccured'].eq(1)]

In [None]:
# Inspect the rows labelled 65 through 75 (label-based slicing includes both ends).
merged_registered_df.loc[slice(65, 75)]

In [None]:
# Write the labelled dataset as CSV, omitting the index column.
merged_registered_df.to_csv(path_or_buf=f"./datasets/merged{BASE_NUM}.txt", index=False)