# Notebook that preprocesses the data
## Import modules

In [3]:
import numpy as np
import pandas as pd
import sys

In [4]:
def read_csv(csv_file):
    """Load ``csv_file`` into a DataFrame using pandas' default CSV settings."""
    return pd.read_csv(csv_file)

def apply_method(method, df, means, min_rssi, list_of_bs):
    """Build the feature matrix for ``df`` with the requested method.

    Args:
        method: Name of the feature builder: ``'lat_lng_dummies'``,
            ``'rssi'`` or ``'rssis_filtered'``.
        df: Raw message DataFrame handed to the selected builder.
        means: Mean base-station coordinates (only used by
            ``'lat_lng_dummies'``).
        min_rssi: Minimum RSSI value (only used by ``'lat_lng_dummies'``).
        list_of_bs: Base-station ids that define the feature columns.

    Returns:
        The DataFrame produced by the selected builder. For an unrecognised
        ``method`` the input ``df`` is returned unchanged and a message is
        printed.
    """
    if method == 'lat_lng_dummies':
        return lat_lng_dummies(df, list_of_bs, means, min_rssi)
    if method == 'rssi':
        return rssis(df, list_of_bs)
    if method == 'rssis_filtered':
        return rssis_filtered(df, list_of_bs)

    # Unknown method: keep the historical pass-through behaviour, but say
    # clearly what happened instead of printing an uninformative marker.
    print("Unknown method {!r}: expected one of 'lat_lng_dummies', 'rssi', "
          "'rssis_filtered'; returning the input DataFrame unchanged.".format(method))
    return df


def get_means(df):
    """Return the column-wise mean of ``bs_lat`` and ``bs_lng`` as a Series."""
    coordinates = df[['bs_lat', 'bs_lng']]
    return coordinates.mean()


def compute_total_mean(train, val, test):
    """Combine the per-split coordinate means into one row-weighted mean.

    Each split's ``get_means`` result is weighted by that split's number of
    rows, and the weighted sum is divided by the total number of rows.
    """
    parts = (train, val, test)
    weighted = [get_means(part) * len(part) for part in parts]
    n_rows = sum(len(part) for part in parts)
    return (weighted[0] + weighted[1] + weighted[2]) / n_rows

def get_min(df):
    """Return the smallest value found in the ``rssi`` column of ``df``."""
    rssi_column = df['rssi']
    return rssi_column.min()


def compute_total_min(train, val, test):
    """Return the smallest ``rssi`` value across the three splits."""
    per_split_minimum = [get_min(part) for part in (train, val, test)]
    return min(per_split_minimum)



 # METHOD 3
def rssis_filtered_old(df, list_of_bs, filter_threshold):
    """Drop rows below ``filter_threshold`` via ``rssi_filter``, then build RSSI features."""
    return rssis(rssi_filter(df, filter_threshold), list_of_bs)


def rssi_filter(df, filter_threshold=-140):
    """Keep only the rows whose ``rssi`` is at least ``filter_threshold``.

    The head of the frame is printed before and after filtering.
    """
    print(df.head())
    strong_enough = df.rssi >= filter_threshold
    df_feat = df[strong_enough]
    print(df_feat.head())
    return df_feat


def save_csv(df, name):
    """Write ``df`` to ``name`` as a semicolon-separated CSV without the index."""
    df.to_csv(name, index=False, sep=';')


def ground_truth_const(df_mess, pos):
    """Compute the mean ``lat``/``lng`` position of every message (``objid``).

    Args:
        df_mess: Message DataFrame containing an ``objid`` column.
        pos: Two-column positions assigned to ``lat`` and ``lng``.
            NOTE(review): presumably row-aligned with ``df_mess`` -- confirm
            against the caller.

    Returns:
        DataFrame indexed by ``objid`` with columns ``lat`` and ``lng``
        holding the per-message mean position. ``df_mess`` is not modified.
    """
    df_mess_pos = df_mess.copy()
    df_mess_pos[['lat', 'lng']] = pos

    # Aggregate only the two position columns, once. Averaging every column
    # (and doing it twice) is wasteful and raises on non-numeric columns in
    # recent pandas releases.
    return df_mess_pos.groupby('objid')[['lat', 'lng']].mean()

# METHOD 1
def lat_lng_dummies(df, list_of_bs, means, min_rssi):
    """Build one feature row per message from base-station coordinates.

    Every ``<bs>_lat`` / ``<bs>_lng`` column starts at the mean latitude /
    longitude from ``means``. For each base station listed in a message's
    rows, its two columns are overwritten with that row's ``bs_lat`` /
    ``bs_lng`` multiplied by ``min_rssi``.

    Args:
        df: Rows with ``objid``, ``bsid``, ``did``, ``bs_lat`` and ``bs_lng``.
        list_of_bs: Base-station ids that define the coordinate columns.
        means: Mapping/Series with keys ``bs_lat`` and ``bs_lng``.
        min_rssi: Factor applied to the coordinates of receiving stations.

    Returns:
        DataFrame with one row per ``objid`` group (in groupby order), the
        ``<bs>_lat`` columns, then the ``<bs>_lng`` columns, then ``did``
        (taken from the first row of each message).
    """
    list_lats = [str(bs) + '_lat' for bs in list_of_bs]
    list_lngs = [str(bs) + '_lng' for bs in list_of_bs]
    list_columns = list_lats + list_lngs + ['did']

    grouped = df.groupby('objid')  # group data by message (objid)
    df_feat = pd.DataFrame(index=np.arange(grouped.ngroups), columns=list_columns)

    # Select the coordinate columns by label: integer slices passed to .loc
    # on string column labels are not valid label-based indexing.
    df_feat.loc[:, list_lats] = means['bs_lat']
    df_feat.loc[:, list_lngs] = means['bs_lng']

    # The iterator already yields each message's rows, so no get_group()
    # lookup is needed.
    for idx, (_, group) in enumerate(grouped):
        df_feat.loc[idx, 'did'] = group['did'].values[0]
        receptions = zip(group['bsid'], group['bs_lat'].values, group['bs_lng'].values)
        for bsid, bs_lat, bs_lng in receptions:
            df_feat.loc[idx, str(bsid) + '_lat'] = bs_lat * min_rssi
            df_feat.loc[idx, str(bsid) + '_lng'] = bs_lng * min_rssi
    return df_feat


# METHOD 2
def rssis(df, list_of_bs):
    """Build one feature row per message holding the RSSI seen by each station.

    Args:
        df: Rows with ``objid``, ``bsid``, ``did`` and ``rssi``.
        list_of_bs: Base-station ids that define the ``<bs>_rssi`` columns.

    Returns:
        DataFrame with one row per ``objid`` group (in groupby order). Each
        ``<bs>_rssi`` column is 0 unless that station appears in the message,
        in which case it holds the row's ``rssi``. ``did`` is taken from the
        first row of the message.
    """
    list_columns = [str(bs) + '_rssi' for bs in list_of_bs] + ['did']

    grouped = df.groupby('objid')  # group data by message (objid)
    df_feat = pd.DataFrame(np.zeros((grouped.ngroups, len(list_columns))), columns=list_columns)

    # Use the group yielded by the iterator directly instead of looking it up
    # again with get_group() for every field.
    for idx, (_, group) in enumerate(grouped):
        values = group['rssi'].values
        # Debug trace: every reception of this message is below -140.
        if all(v < -140 for v in values):
            print(len(values), len(values))
        df_feat.loc[idx, 'did'] = group['did'].values[0]
        for bsid, value in zip(group['bsid'], values):
            df_feat.loc[idx, str(bsid) + '_rssi'] = value
    return df_feat

In [5]:
def rssis_filtered(df, list_of_bs):
    """Verbose variant of the RSSI feature builder (prints debug traces).

    Builds one row per message with a ``<bs>_rssi`` column per base station
    (0 when the station is absent from the message) plus ``did``. No rows are
    filtered out here. A later cell of this notebook defines
    ``rssis_filtered`` again, which replaces this version once executed.

    Args:
        df: Rows with ``objid``, ``bsid``, ``did`` and ``rssi``.
        list_of_bs: Base-station ids that define the ``<bs>_rssi`` columns.

    Returns:
        DataFrame with one row per message, in groupby order.
    """
    list_columns = [str(bs) + '_rssi' for bs in list_of_bs] + ['did']

    print('list_columnms', list_of_bs[:2], '...')
    df_mess_bs_group = df.groupby(['objid'], as_index=False)  # group data by message (objid)
    print("df_mess_bs_group", df_mess_bs_group)
    messages = np.unique(df['objid'])
    print("messages", messages[:2], '...')
    nb_mess = len(messages)
    print("nb_mess", nb_mess)

    df_feat = pd.DataFrame(np.zeros((nb_mess, len(list_columns))), columns=list_columns)

    # The iterator already yields each message's rows; reading them directly
    # avoids repeated get_group() lookups keyed by the iteration key.
    for idx, (_, group) in enumerate(df_mess_bs_group):
        values = group['rssi'].values
        # Debug trace: every reception of this message is below -140.
        if all(v < -140 for v in values):
            print(len(values), len(values))
        df_feat.loc[idx, 'did'] = group['did'].values[0]
        for bsid, value in zip(group['bsid'], values):
            df_feat.loc[idx, str(bsid) + '_rssi'] = value
    return df_feat

In [42]:
def rssis_filtered(df, list_of_bs):
    """Build one RSSI feature row per message (``objid``).

    Each ``<bs>_rssi`` column holds the RSSI reported by that base station
    for the message, or 0 when the station is absent; ``did`` comes from the
    first row of the message. Messages whose receptions are all below -140
    are only reported on stdout; nothing is filtered out yet.
    """
    list_columns = [str(bs) + '_rssi' for bs in list_of_bs] + ['did']
    per_message = df.groupby(['objid'], as_index=False)  # group data by message (objid)

    n_messages = len(set(df['objid']))  # number of messages
    df_feat = pd.DataFrame(np.zeros((n_messages, len(list_columns))), columns=list_columns)

    for idx, (_, message_rows) in enumerate(per_message):
        message_rssis = message_rows['rssi'].values

        if all(value < -140 for value in message_rssis):
            print(len(message_rssis), len(message_rssis))
            # TODO: handle messages whose receptions are all below -140.

        df_feat.loc[idx, 'did'] = message_rows['did'].values[0]

        for bsid, value in zip(message_rows['bsid'], message_rssis):
            df_feat.loc[idx, str(bsid) + '_rssi'] = value
    return df_feat

## Main

In [6]:
path_data = "/home/bud/Documents/MyPrivateJoke/P2/INF730/data/"
# The five CSV file names come first; the last entry is the method name.
files = ["train_X.csv", "val_X.csv", "test_X.csv", "train_y.csv", "val_y.csv", "rssis_filtered"]
# Method names recognised by apply_method: lat_lng_dummies, rssi, rssis_filtered
args = [path_data + csv_name for csv_name in files[:-1]] + [files[-1]]

In [7]:
# Load the three feature files, then the two label files, as DataFrames
train, val, test = (read_csv(csv_path) for csv_path in args[:3])

y_train, y_val = (read_csv(csv_path) for csv_path in args[3:5])

In [8]:
# Collect every base station id that appears in at least one of the splits
bs_train, bs_val, bs_test = (np.unique(part['bsid']) for part in (train, val, test))
list_of_bs = np.union1d(np.union1d(bs_train, bs_val), bs_test)

# Statistics shared by all splits
means = compute_total_mean(train, val, test)
min_rssi = compute_total_min(train, val, test)

print("Original lengths:\n\ttrain:\t", len(train), "\n\tval:\t", len(val), "\n\ttest:\t", len(test))

# Per-message ground-truth positions for the labelled splits
gt_train, gt_val = (ground_truth_const(x, y) for x, y in ((train, y_train), (val, y_val)))

# save_csv(gt_train, 'ground_truth_train.csv')
# save_csv(gt_val, 'ground_truth_val.csv')
# print("Length ground truths : ", len(gt_train), len(gt_val))

# The method name is the last configured argument
method = args[5]
print("\nMethod applied:", method)
df_train, df_val, df_test = (
    apply_method(method, part, means, min_rssi, list_of_bs)
    for part in (train, val, test)
)

Original lengths:
	train:	 29805 
	val:	 9445 
	test:	 29286

Method applied: rssis_filtered
list_columnms [879 911] ...
df_mess_bs_group <pandas.core.groupby.DataFrameGroupBy object at 0x7f5c62321940>
messages ['573bf1d9864fce1a9af8c5c9' '573bf3533e952e19126b256a'] ...
nb_mess 5046
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
list_columnms [879 911] ...
df_mess_bs_group <pandas.core.groupby.DataFrameGroupBy object at 0x7f5c622af400>
messages ['582ae03712f1434b9cc93a71' '582ae1d712f1434b9ccad421'] ...
nb_mess 1022
1 1
list_columnms [879 911] ...
df_mess_bs_group <pandas.core.groupby.DataFrameGroupBy object at 0x7f5c62321ac8>
messages ['573be2503e952e191262c351' '573c05f83e952e1912758013'] ...
nb_mess 5294
1 1
1 1
1 1
1 1


In [9]:
# Feature column names: one '<bs>_rssi' per base station, then 'did'
list_columns = [str(bs) + '_rssi' for bs in list_of_bs]
list_columns.append('did')


In [10]:
# Report the number of rows and columns of each formatted split
print("Formatting done.\nNew lengths : ", df_train.shape[0], df_val.shape[0], df_test.shape[0])

print("\t| Columns length")
print("\t| ", df_train.shape[1], df_val.shape[1], df_test.shape[1])

# Write each formatted split next to the input data
for prefix, frame in (('train_formatted_', df_train),
                      ('val_formatted_', df_val),
                      ('test_formatted_', df_test)):
    save_csv(frame, path_data + prefix + method + '.csv')

Formatting done.
New lengths :  5046 1022 5294
	| Columns length
	|  260 260 260
