In [1]:
# Goal: take a CSV file of signal data and sample datapoints from it however we need,
# for example in increments of fixed time intervals
# or restricted to some window of past time.
# Likewise, take an activity dataset and sample from it however we need.

# Load Signal and Activity Dataset

In [2]:
import pandas as pd

# Suffix shared by the input file names below (presumably one recording session; confirm).
BASE_NUM = "21"
# Signal measurements; parsed as CSV by load_signal_csv despite the .txt extension.
SIGNAL_PATH = f"./datasets/processed_sig{BASE_NUM}.txt"
# Activity ENTER/EXIT events; parsed as CSV by load_activity_csv.
ACTIVITY_PATH = f"./datasets/processed_act{BASE_NUM}.txt"

def load_signal_csv(csv_path):
    """Read the signal CSV and return it without the columns holding invalid data.

    Args:
        csv_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with every parsed column except ``rssnr``, ``cqi`` and
        ``ta``. Columns that are absent from the file are simply ignored.
    """
    invalid_columns = ['rssnr', 'cqi', 'ta']
    raw_df = pd.read_csv(csv_path)
    # errors='ignore' keeps this tolerant of files that lack some of the columns.
    return raw_df.drop(columns=invalid_columns, errors='ignore')

# Signal measurements with the rssnr/cqi/ta columns already removed.
signal_df = load_signal_csv(SIGNAL_PATH)


def load_activity_csv(path):
    """Read the activity CSV and keep every row that is not an EXIT event.

    The file holds ``time``, ``activity`` and ``status`` (enter/exit) columns.
    Only the state being entered matters for knowing the current state, so
    rows whose ``status`` is "EXIT" are discarded.

    Args:
        path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The filtered DataFrame. The original row labels are kept, so the index
        has gaps where EXIT rows were removed.
    """
    events = pd.read_csv(path)
    is_exit = events['status'] == "EXIT"
    return events[~is_exit]

# Activity events with the EXIT rows filtered out.
activity_df = load_activity_csv(ACTIVITY_PATH)

In [3]:
# Preview the first rows of the cleaned signal table.
signal_df.head()


Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq
0,12-03 19:48:28.423,YES,12319826322802,116,16185,675,310260,22,-98,-9
1,12-03 19:48:28.423,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18
2,12-03 19:48:28.650,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18
3,12-03 19:48:28.681,YES,12319826322802,116,16185,675,310260,22,-98,-9
4,12-03 19:48:28.681,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18


In [4]:
# Preview the activity events; the index keeps the gaps left by the dropped EXIT rows.
activity_df.head()

Unnamed: 0,time,activity,status
0,12-03 19:48:33.262,STILL,ENTER
2,12-03 19:52:59.581,WALKING,ENTER
4,12-03 19:53:39.655,STILL,ENTER
6,12-03 19:54:05.397,WALKING,ENTER
8,12-03 20:01:22.471,STILL,ENTER


# Merge Datasets

In [5]:
def search_activity_status_at_time(query_time, activity_df):
    """Return the activity that was in effect at ``query_time``.

    Scans ``activity_df`` in row order and returns the ``activity`` of the
    last row whose ``time`` is not later than ``query_time``.

    Args:
        query_time: Time to look up. It is compared with ``<`` against the
            values of the ``time`` column, so it must be of a comparable type.
        activity_df: DataFrame with ``time`` and ``activity`` columns. The scan
            stops at the first row that is later than ``query_time``, so rows
            are expected to be in ascending time order.

    Returns:
        The matching activity value, or "NOVALUE" when ``query_time`` precedes
        the first row, or when no row is later than ``query_time`` (the query
        lies at or beyond the last activity event, or the frame is empty).

    Note:
        When the times are strings the comparison is lexicographic; that only
        matches chronological order as long as the timestamps are zero-padded
        and share one fixed format.
    """
    no_value_string = "NOVALUE"
    last_observed_state = no_value_string

    # Walk just the two columns that are needed. Unlike iterrows(), this does
    # not build a Series per row and does not coerce the values of a row to a
    # common dtype.
    for event_time, activity in zip(activity_df['time'], activity_df['activity']):
        if query_time < event_time:
            # Passed the state we wanted: the previous row is the answer.
            return last_observed_state
        last_observed_state = activity

    # No later event exists, so query_time is after the activity data.
    return no_value_string

# Example of searching for activity at specific time
# search_activity_status_at_time("11-06 16:50:26.900", activity_df)
# search_activity_status_at_time("11-07 16:50:26.900", activity_df)
def create_merged_dataset(signal_df, activity_df):
    """Return a copy of ``signal_df`` with an added ``activity`` column.

    For every signal row the activity is looked up with
    ``search_activity_status_at_time`` using the row's ``time`` value. Lookups
    are memoised per distinct time, since many signal rows share a timestamp.

    Args:
        signal_df: DataFrame with a ``time`` column; it is not modified.
        activity_df: Activity events passed through to
            ``search_activity_status_at_time``.

    Returns:
        A new DataFrame with the same rows and index as ``signal_df`` plus an
        ``activity`` column holding the looked-up value for each row.
    """
    cached_result = {}
    activities = []
    for signal_time in signal_df['time']:
        if signal_time not in cached_result:
            cached_result[signal_time] = search_activity_status_at_time(signal_time, activity_df)
        activities.append(cached_result[signal_time])

    merged_df = signal_df.copy()
    # Assign the whole column in one step. Writing cell by cell with
    # .loc[label, ...] is slow and, if the index has repeated labels, would
    # overwrite every row sharing that label instead of just the current one.
    merged_df['activity'] = pd.Series(activities, index=merged_df.index, dtype=object)
    return merged_df

In [6]:
# Attach to every signal row the activity that was in effect at its timestamp.
merged_df = create_merged_dataset(signal_df, activity_df)
# Rows timed before the first activity event carry the "NOVALUE" placeholder.
merged_df.head()

Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq,activity
0,12-03 19:48:28.423,YES,12319826322802,116,16185,675,310260,22,-98,-9,NOVALUE
1,12-03 19:48:28.423,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18,NOVALUE
2,12-03 19:48:28.650,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18,NOVALUE
3,12-03 19:48:28.681,YES,12319826322802,116,16185,675,310260,22,-98,-9,NOVALUE
4,12-03 19:48:28.681,NO,12319826322802,44,2147483647,675,nullnull,16,-110,-18,NOVALUE


# Drop Duplicates Using mTimeStamp and mPci

Now that we have merged the two datasets, let's drop duplicate rows, identified by the combination of `mTimeStamp` and `mPci`.


In [7]:
# Rows that agree on both key columns are treated as the same measurement;
# the first occurrence is kept and the index is renumbered afterwards.
dedup_keys = ["mTimeStamp", "mPci"]
length_before = len(merged_df)
unique_rows = merged_df.drop_duplicates(subset=dedup_keys)
df_no_duplicates = unique_rows.reset_index(drop=True)
print(f"Trimmed from {length_before} to {len(df_no_duplicates)}")

Trimmed from 62948 to 6553


# Mark Points with Handover 

In [8]:
def get_max_at_time(df, time):
    """Return the largest ss/rsrp/rsrq among the "NO"-registered rows at ``time``.

    Only rows whose ``time`` equals the given value and whose ``mRegistered``
    is "NO" are considered (presumably the neighbouring, non-serving cells;
    confirm against the data source).

    Args:
        df: DataFrame with ``time``, ``mRegistered``, ``ss``, ``rsrp`` and
            ``rsrq`` columns.
        time: Value matched exactly against the ``time`` column.

    Returns:
        A dict with keys ``max_ss``, ``max_rsrp`` and ``max_rsrq``. When no row
        qualifies, the placeholder values 0, -1000 and -1000 are returned.
        Missing (NaN) readings are skipped when taking each maximum.
    """
    is_candidate = (df['time'] == time) & (df['mRegistered'] == "NO")
    candidates = df.loc[is_candidate, ['ss', 'rsrp', 'rsrq']]

    if candidates.empty:
        # Placeholders marking "no such row was logged at this time".
        return {
            "max_ss": 0,
            "max_rsrp": -1000,
            "max_rsrq": -1000,
        }

    # Series.max() skips NaN; the builtin max() over an array gives a result
    # that depends on where a NaN happens to sit in the data.
    return {
        "max_ss": candidates['ss'].max(),
        "max_rsrp": candidates['rsrp'].max(),
        "max_rsrq": candidates['rsrq'].max(),
    }

# Keep only the rows of the registered cell, renumbered from 0.
merged_registered_df = df_no_duplicates[df_no_duplicates['mRegistered'] == "YES"].reset_index(drop=True)
# nextAP is the mPci of the following registered row (NaN for the last row,
# which is why the column is stored as float).
merged_registered_df['nextAP'] = merged_registered_df['mPci'].shift(-1)
# The last row has no successor, so it cannot be labelled. Take an explicit
# copy so the column assignments below write to an independent frame rather
# than to a slice (avoids SettingWithCopyWarning).
merged_registered_df = merged_registered_df.iloc[:-1].copy()


def label_handover_occured(row):
    """Return 1 when the row's ``mPci`` differs from its ``nextAP``, else 0."""
    if row['mPci'] == row['nextAP']:
        return 0
    return 1


merged_registered_df['handoverOccured'] = merged_registered_df.apply(label_handover_occured, axis=1)

# Look up the maxima once per row and fan the result out into three columns,
# instead of re-filtering df_no_duplicates three times for every row.
neighbour_max = pd.DataFrame(
    [get_max_at_time(df_no_duplicates, sample_time) for sample_time in merged_registered_df['time']],
    index=merged_registered_df.index,
    columns=['max_ss', 'max_rsrp', 'max_rsrq'],
)
for stat_column in neighbour_max.columns:
    merged_registered_df[stat_column] = neighbour_max[stat_column]

print(merged_registered_df['handoverOccured'].value_counts())
merged_registered_df.head()

0    1966
1      62
Name: handoverOccured, dtype: int64


Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq,activity,nextAP,handoverOccured,max_ss,max_rsrp,max_rsrq
0,12-03 19:48:28.423,YES,12319826322802,116,16185,675,310260,22,-98,-9,NOVALUE,116.0,0,16,-110,-18
1,12-03 19:48:29.065,YES,12322022004687,116,16185,675,310260,24,-98,-12,NOVALUE,116.0,0,17,-106,-17
2,12-03 19:48:31.514,YES,12324507411601,116,16185,675,310260,23,-98,-11,NOVALUE,116.0,0,17,-106,-17
3,12-03 19:48:34.015,YES,12326827844176,116,16185,675,310260,24,-96,-11,STILL,116.0,0,17,-107,-17
4,12-03 19:48:36.040,YES,12329017650748,116,16185,675,310260,24,-96,-12,STILL,116.0,0,19,-107,-20


In [9]:
# Secondary features: registered-cell reading minus the best reading among the
# other rows logged at the same time (the max_* columns).
# NOTE: when get_max_at_time found no such row, max_* holds its placeholder
# values (0 / -1000), so the corresponding differences are not real margins.
merged_registered_df['diff_in_ss'] = merged_registered_df['ss'].sub(merged_registered_df['max_ss'])
merged_registered_df['diff_in_rsrp'] = merged_registered_df['rsrp'].sub(merged_registered_df['max_rsrp'])
merged_registered_df['diff_in_rsrq'] = merged_registered_df['rsrq'].sub(merged_registered_df['max_rsrq'])


In [10]:
# Show every row whose next registered row reports a different mPci.
merged_registered_df[merged_registered_df['handoverOccured'] == 1]

Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq,activity,nextAP,handoverOccured,max_ss,max_rsrp,max_rsrq,diff_in_ss,diff_in_rsrp,diff_in_rsrq
73,12-03 19:54:17.524,YES,12487990136168,116,16185,2300,310260,21,-109,-17,WALKING,99.0,1,15,-101,-8,6,-8,-9
91,12-03 19:55:02.135,YES,12532568120245,99,16185,2300,310260,26,-106,-20,WALKING,44.0,1,18,-106,-14,8,0,-6
181,12-03 19:59:33.516,YES,12732379030306,44,16185,2300,310260,28,-94,-17,WALKING,434.0,1,22,-91,-11,6,-3,-6
193,12-03 20:00:34.034,YES,12748999912639,434,16185,2300,310260,31,-86,-13,WALKING,44.0,1,26,-88,-15,5,2,2
216,12-03 20:02:24.923,YES,12779714341792,44,16185,2300,310260,27,-93,-12,WALKING,434.0,1,17,-101,-13,10,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,12-04 01:23:16.636,YES,15961524233343,434,16185,675,310260,31,-82,-14,WALKING,44.0,1,28,-82,-15,3,0,1
1876,12-04 01:23:24.126,YES,15969031537267,44,16185,2300,310260,30,-87,-14,WALKING,434.0,1,28,-81,-14,2,-6,0
1888,12-04 01:24:07.662,YES,15994169130918,434,16185,2300,310260,31,-86,-20,WALKING,44.0,1,30,-74,-10,1,-12,-10
1976,12-04 01:30:40.074,YES,16133349413220,44,16185,2300,310260,23,-108,-19,WALKING,99.0,1,16,-107,-14,7,-1,-5


In [11]:
# Inspect the rows labelled 65 through 75, i.e. the neighbourhood of the first
# labelled handover (index 73 in the listing above).
merged_registered_df.loc[65:75]

Unnamed: 0,time,mRegistered,mTimeStamp,mPci,mTac,mEarfcn,mMcc+mMnc,ss,rsrp,rsrq,activity,nextAP,handoverOccured,max_ss,max_rsrp,max_rsrq,diff_in_ss,diff_in_rsrp,diff_in_rsrq
65,12-03 19:53:58.513,YES,12468938449166,116,16185,2300,310260,21,-101,-10,STILL,116.0,0,14,-109,-15,7,8,5
66,12-03 19:54:00.997,YES,12471444193531,116,16185,2300,310260,21,-104,-11,STILL,116.0,0,15,-109,-15,6,5,4
67,12-03 19:54:03.481,YES,12473942486436,116,16185,2300,310260,20,-108,-12,STILL,116.0,0,13,-113,-14,7,5,2
68,12-03 19:54:05.936,YES,12476064782585,116,16185,2300,310260,21,-104,-9,WALKING,116.0,0,15,-104,-11,6,0,2
69,12-03 19:54:07.918,YES,12478420867090,116,16185,2300,310260,21,-107,-11,WALKING,116.0,0,15,-107,-15,6,0,4
70,12-03 19:54:09.991,YES,12480453102345,116,16185,2300,310260,20,-108,-15,WALKING,116.0,0,13,-112,-14,7,4,-1
71,12-03 19:54:12.498,YES,12482959232959,116,16185,2300,310260,19,-108,-12,WALKING,116.0,0,13,-112,-15,6,4,3
72,12-03 19:54:14.983,YES,12485447307686,116,16185,2300,310260,20,-107,-12,WALKING,116.0,0,14,-104,-9,6,-3,-3
73,12-03 19:54:17.524,YES,12487990136168,116,16185,2300,310260,21,-109,-17,WALKING,99.0,1,15,-101,-8,6,-8,-9
74,12-03 19:54:20.010,YES,12490479250948,99,16185,2300,310260,25,-93,-8,WALKING,99.0,0,17,-107,-16,8,14,8


In [12]:
# Save the labelled table as CSV (the file keeps a .txt extension), without the index column.
merged_registered_df.to_csv(f"./datasets/merged{BASE_NUM}.txt", index=False)