# Load the data

In [1]:
%load_ext autotime

import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
# pd.options.display.max_columns
pd.set_option("display.max_colwidth",200)
pd.set_option("display.max_columns",20)
pd.set_option('float_format', '{:.3f}'.format)

# Load Usage data
with open('App_usage_trace.txt') as f:
    lines = f.readlines()#[:100000]
    df = []
    for line in lines:
        data = line.split()
        df.append(data)
usage = pd.DataFrame(df, columns=['uid','timestamp', 'loc', 'app_id','traffic'])
# output_user(usage)
usage['traffic'] = usage['traffic'].astype('float64') / 1e6 # Convert traffic to MB
usage['timestamp'] = usage['timestamp'].apply(lambda x: datetime.strptime(x, "%Y%m%d%H%M%S")) # Convert to datetime object
usage['loc'] = usage['loc'].astype('int64')

# Load App2Category data
with open('App2Category.txt') as f:
    lines = f.readlines()
    df = []
    for line in lines:
        data = line.split()
        df.append(data)
app2cat = pd.DataFrame(df, columns=['app_id','cat_id'])

# Load base station POI data
base_poi = pd.read_csv("base_poi.txt", delimiter='\t')

# Load Category dictionary
cat = pd.read_csv("Categorys.txt", delimiter='\t', header=None)
cat.columns = ['cat_id','category']
cat.set_index('cat_id', inplace=True)


print("< ---- Basic information ---- >")
print("- {} base stations. {} entries".format(base_poi.shape[0], usage.shape[0]))
print()
print("< ---- Unique users ---- >")
print("- {} unique users".format(len(usage['uid'].unique())))



< ---- Basic information ---- >
- 9851 base stations. 4171950 entries

< ---- Unique users ---- >
- 871 unique users


In [2]:
def output_user(data):
    # group data into users
    grouped_users = data.groupby('uid')
    for user, group in grouped_users:
        # iterare each user and output to seperate file
        group.to_csv('user_{}.txt'.format(user), header=False, index=False, sep=' ')

time: 601 µs


In [3]:
# Load base station POI data
base_poi = pd.read_csv("base_poi.txt", delimiter='\t')

base_poi.set_index('BaseID', inplace=True)
base_poi['total'] = base_poi.sum(axis=1)

time: 46.9 ms


In [4]:
# Group user count to find heavy users uid later
user_count = usage.groupby(["uid"]).size().reset_index()
user_count.columns = ['uid','count']

time: 762 ms


## Just to use some simple rules to remove users / base stations

In [5]:
def remove_records(n_user=2000, n_base=5):
    # Remove user with less than n records
    user_count_clean = user_count[user_count['count'] > n_user]
    print('='*50)
    print("We remove users with less than {} records".format(n_user))
    print("-"*50)
    print("# of base station before cleaning:", user_count.shape[0])
    print("# of users after cleaning:", user_count_clean.shape[0])
    print()
    # Store the uid that we removed
    removed_user = user_count[user_count['count'] < n_user].index
    removed_user = list(removed_user)
    print("Removed {} users and returned a list of removed uid".format(len(removed_user)))
    print()
    print('='*50)
    # Remove Base station with less than n POI
    print("We remove base station with less than {} POI".format(n_base))
    print("-"*50)

    print("# of base station before cleaning:", base_poi.shape[0])
    base_poi_clean = base_poi[base_poi['total'] > n_base]
    print("# of base station after cleaning:", base_poi_clean.shape[0])
    print()
    # Store the baseID we are removing
    removed_base = base_poi[base_poi['total'] < n_base].index
    removed_base = list(removed_base)
    print("Removed {} base and returned a list of removed baseID".format(len(removed_base)))

    # Remove the records that match either the removed user or removed baseID
    # Return the dataset

    return removed_user, removed_base


time: 2.5 ms


In [6]:
removed_user, removed_base = remove_records(1000, 20)

We remove users with less than 1000 records
--------------------------------------------------
# of base station before cleaning: 871
# of users after cleaning: 560

Removed 311 users and returned a list of removed uid

We remove base station with less than 20 POI
--------------------------------------------------
# of base station before cleaning: 9851
# of base station after cleaning: 5768

Removed 3947 base and returned a list of removed baseID
time: 17.7 ms


In [7]:
def output_user(data):
    grouped_users = data.groupby('uid')
    for user, group in grouped_users:
        group.to_csv('user_{}.txt'.format(user), header=False, index=False, sep=' ')

time: 633 µs


# Nathans Approach

In [8]:
# # Function to align the Origin and Destination (O/D)
# def sort_movement_tuples(jump):
#     return sorted(list(jump))

# # Define a function that can output the frequency pair of a user's mobility pattern
# def show_frequent_pair(uid=888, threshold=5, jump_occurence=10):
#     # Nathan: replaced usage with usage_cleaned
#     user_uid = usage[usage['uid'] == str(uid)]

#     # Set the thresold of time gap allowed between movement
#     delta_threshold = timedelta(seconds=threshold)

#     # Create the columns of the next time and lcoation
#     user_uid['next_loc'] = user_uid['loc'].shift(-1).fillna(0).astype('int')
#     user_uid['next_timestamp'] = user_uid['timestamp'].shift(-1)

#     # Keep only movements
#     user_uid = user_uid[user_uid['loc'] != user_uid['next_loc']]

#     # Get the time gaps between movements
#     user_uid['time_delta'] = user_uid['next_timestamp'] - user_uid['timestamp']

#     # Get teleports and count the teleports
#     # Should be able to identify the return trips
#     teleports = user_uid[user_uid['time_delta'] < delta_threshold]
#     teleports['tele'] = list(zip(teleports['loc'], teleports['next_loc']))
#     # print(teleports['tele'].value_counts())

#     # # Group and count the same jump
#     tele_counts = teleports['tele'].value_counts().reset_index()
#     tele_counts.columns = ['jump', 'count']

#     # Align the O/D of the jump to identify the pairs
#     tele_counts['jump'] = tele_counts['jump'].apply(lambda i: sort_movement_tuples(i))

#     # Get the jumps that has a reasonable number of occurence
#     tele_counts = tele_counts[tele_counts['count'] > jump_occurence]
#     tele_counts['jump'] = tuple(tele_counts['jump'])

#     # print the record for reference
#     # Nathan: added reset_index()
#     frequent_pairs = tele_counts.groupby(['jump']).sum().reset_index()

#     return frequent_pairs

time: 3.64 ms


In [9]:
# Function to align the Origin and Destination (O/D)
def sort_movement_tuples(jump):
    return sorted(list(jump))

# show the period where the oscillation happens
def present_oscillation(uid=888, threshold=5, jump_occurence=10):
    # Nathan: replaced usage with usage_cleaned
    user_uid = usage[usage['uid'] == str(uid)]

    # Set the thresold of time gap allowed between movement
    delta_threshold = timedelta(seconds=threshold)

    # Create the columns of the next time and lcoation
    user_uid['next_loc'] = user_uid['loc'].shift(-1).fillna(0).astype('int')
    user_uid['next_timestamp'] = user_uid['timestamp'].shift(-1)

    # Keep only movements
    user_uid = user_uid[user_uid['loc'] != user_uid['next_loc']]

    # Get the time gaps between movements
    user_uid['time_delta'] = user_uid['next_timestamp'] - user_uid['timestamp']

    # Get teleports and count the teleports
    # Should be able to identify the return trips
    teleports = user_uid[user_uid['time_delta'] < delta_threshold]
    teleports['tele'] = list(zip(teleports['loc'], teleports['next_loc']))
    # teleports['tele'] = teleports['tele'].apply(lambda i: sort_movement_tuples(i))
    # print(teleports['tele'].value_counts())

    # # Group and count the same jump
    tele_counts = teleports['tele'].value_counts().reset_index()
    tele_counts.columns = ['jump', 'count']
    tele_counts = tele_counts[tele_counts['count'] > jump_occurence]


    # # Align the O/D of the jump to identify the pairs
    # tele_counts['jump'] = tele_counts['jump'].apply(lambda i: sort_movement_tuples(i))

    # Get the jumps that has a reasonable number of occurence
    tele_counts = tele_counts[tele_counts['count'] > jump_occurence]
    # tele_counts['jump'] = tuple(tele_counts['jump'])

    #return the rows that has tele_count in the "tele" column, so it returns the rows where the oscillation happens
    return teleports[teleports['tele'].isin(tele_counts["jump"])]


time: 1.44 ms


In [10]:
# # show the oscillation period
# user_oscillations = present_oscillation(979,10,5)
# # user_oscillations.iloc[:50,:]
# # user_oscillations.drop(columns=['next_timestamp','time_delta'])
# # Create the columns of the next time and lcoation
# user_oscillations['next_timestamp'] = user_oscillations['timestamp'].shift(-1)
# user_oscillations['time_delta'] = user_oscillations['next_timestamp'] - user_oscillations['timestamp']
# user_oscillations.iloc[:50,:]

        

time: 341 µs


In [11]:
# delta_threshold = timedelta(seconds=300)
# count = 0
# oscillation_list = []
# start = True
# for index, request in user_oscillations.iterrows():
#     if start:
#         start_index = index
#         start = False
#     # print(request['time_delta'])
#     if request['time_delta'] > delta_threshold:
#         end_index = index
#         # print(index)
#         # oscillation_list.append([(start_index, end_index), request['tele']])
#         oscillation_list.append((start_index, end_index))
#         start = True

# oscillation_list[0:21]        

time: 909 µs


In [12]:
# def replace_base(data, loc_list):
#     # set count to only run a small sample
#     count = 0
#     for oscillation_period, bases in loc_list:
#         start = oscillation_period[0]
#         end = oscillation_period[1]
#         x = data.loc[start:end + 6,'loc']
#         # if count == 5:
#             # break
#         # if count % 5 == 0:
#         #     print("iteration:", count)
#         #     print("Counts:\n", x.value_counts())
#         #     # print(bases)
#         #     print("Before:\n", x)
#         #https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
#         base_counts = x.value_counts().rename_axis('loc').reset_index(name='counts')
#         # https://stackoverflow.com/questions/37841525/correct-way-to-set-value-on-a-slice-in-pandas
#         base_1 = base_counts.loc[base_counts['loc'] == bases[0], 'counts'].iloc[0]
#         base_2 = base_counts.loc[base_counts['loc'] == bases[1], 'counts'].iloc[0]
#         # further work is to get list of base_poi in value_count. find greatest amongst list and change.
#         if base_2 > base_1:
#             # x.loc[x['loc'] == bases[0], 'loc'] = bases[1]
#             data.loc[start:end,'loc'] = bases[1]
#             # replace base 1
#         else:
#             # x.loc[x['loc'] == bases[1],'loc'] = bases[0]
#             data.loc[start:end,'loc'] = bases[0]
#             # replace base 2
#             # also repalce base 2 if equal since base 1 came first
#         # if count % 5 == 0:
#         #     print("After:\n", x)
#         count += 1


#         # # print(base_counts.iloc[[bases[0]]])
#         # print(base_counts.index.values)
#         # # print(bases[0], bases[1])
#         # # https://stackoverflow.com/questions/36684013/extract-column-value-based-on-another-column-pandas-dataframe
#         # print(base_counts.loc[base_counts['loc'] == bases[0], 'counts'].iloc[0])
#         # print(list(bases))
#         # # print(base_counts.iloc[list(bases)])
#         # print(x['loc'].value_counts())


#         # # https://moonbooks.org/Articles/How-to-extract-the-value-names-and-counts-from-valuecounts-in-pandas-/
#         # # for idx,location in enumerate(base_counts.index.tolist()):
#         #     if bases[0]
#             # print('Name :', location)
#             # print('Counts :', base_counts.iloc[idx])

time: 959 µs


In [13]:
# working_usage = usage.copy()
# replace_base(working_usage, oscillation_list)

time: 2.41 ms


In [14]:
from heapq import nlargest

def replace_base1(data, loc_list):
    for oscillation_period in loc_list:
        # get starting index
        start = oscillation_period[0]
        # get ending index of oscillation period
        end = oscillation_period[1]
        # slice dataset to get requests within oscilation period
        x = data.loc[start:end+1,'loc']
        # get the count of each base station. 
        #   rename axis and reset index to get row index as column
        base_counts = x.value_counts().rename_axis('loc').reset_index(name='counts')
        # get list of unique base stations in oscilation period
        base_stations = base_counts['loc'].tolist()
        # get inital count of each base station within the oscillation period
        base_station_counts = base_counts['counts'].tolist()
        # get difference between the top 2 base stations
        cur_diff = nlargest(2, base_station_counts)[0] - nlargest(2, base_station_counts)[1]
        # we only declare one base station as majority if it has a count greater than 5 to the next greatest
        i =1
        while cur_diff < 5:
            # we take into account of the next surrounding requests outside of the oscillation period
            # this will continue to increase by 1 request at each side
            x = data.loc[start - i :end + i,'loc']

            # get new count of base stations in oscillation period
            base_counts = x.value_counts().rename_axis('loc').reset_index(name='counts')
            # output count as list
            base_station_counts = base_counts['counts'].tolist()
            # get the difference
            cur_diff = nlargest(2, base_station_counts)[0] - nlargest(2, base_station_counts)[1]
            i += 1

        # assign variable with the majority base station in oscillation period
        largest_station = base_stations[base_station_counts.index(max(base_station_counts))]

        # manipulate original dataframe to make changes perm 
        data.loc[start:end+1,'loc'] = largest_station


time: 1.08 ms


In [15]:
# working_usage = usage.copy()
# replace_base1(working_usage, oscillation_list)

time: 823 µs


In [16]:
import warnings
warnings.filterwarnings('ignore')
# get list of all unqiue users in dataset
user_list = usage['uid'].unique()
# copy orignal dataset
# this is for comparison purposes
working_usage = usage.copy()
for uid in user_list: 
    # get all oscillations from current user
    # set threshold as 10 seconds and atleast 5 oscillation pairs
    user_oscillations = present_oscillation(uid,10,5)

    # Create the columns of the next time and lcoation between frequent pairs
    user_oscillations['next_timestamp'] = user_oscillations['timestamp'].shift(-1)
    user_oscillations['time_delta'] = user_oscillations['next_timestamp'] - user_oscillations['timestamp']

    # set delta threshold between oscillation periods
    delta_threshold = timedelta(seconds=300)
    # declare list to hold indicies of oscillation periods
    oscillation_list = []

    start = True
    for index, request in user_oscillations.iterrows():
        # set the first index
        if start:
            start_index = index
            start = False

        # if delta to next oscillation request is greater than threshold then indicate this is the last index of current period
        if request['time_delta'] > delta_threshold:
            # set end index for current period
            end_index = index
            # append tuple (starting index, ending index) of the oscillation period
            oscillation_list.append((start_index, end_index))
            # declare the next request will be the start of the next oscillation period
            start = True     
    # replace all osillations for current user to dataframe        
    replace_base1(working_usage, oscillation_list)

time: 8min 57s


In [17]:
# count = 0
# for indexes, bases in oscillation_list:
#     if count % 50 == 0:
#         print(count)
#         print(working_usage.iloc[indexes[0]:indexes[1]+6,:]['loc'].value_counts())
#         # print(bases)
#         print("Before:", working_usage.iloc[indexes[0]:indexes[1]+6,:])
#     #https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
#     base_counts = working_usage.loc[indexes[0]:indexes[1]+6,:]['loc'].value_counts().rename_axis('loc').reset_index(name='counts')
#     # https://stackoverflow.com/questions/37841525/correct-way-to-set-value-on-a-slice-in-pandas
#     base_1 = base_counts.loc[base_counts['loc'] == bases[0], 'counts'].iloc[0]
#     base_2 = base_counts.loc[base_counts['loc'] == bases[1], 'counts'].iloc[0]
#     if base_2 > base_1:
#         working_usage.loc[indexes[0]:indexes[1]+6,:].loc[working_usage.loc[indexes[0]:indexes[1]+6,:]['loc'] == bases[0], 'loc'] = bases[1]
#         # replace base 1
#     else:
#         working_usage.loc[indexes[0]:indexes[1]+6,:].loc[working_usage.loc[indexes[0]:indexes[1]+6,:]['loc'] == bases[1],'loc'] = bases[0]
#         # replace base 2
#         # also repalce base 2 if equal since base 1 came first
#     if count % 50 == 0:
#         print("After:", working_usage.iloc[indexes[0]:indexes[1]+6,:])
#     count += 1

time: 263 µs


In [18]:
usage.iloc[4050241:4050259]

Unnamed: 0,uid,timestamp,loc,app_id,traffic
4050241,979,2016-04-21 07:44:42,3306,763,0.014
4050242,979,2016-04-21 07:44:42,9251,2,0.001
4050243,979,2016-04-21 07:44:42,9251,763,0.002
4050244,979,2016-04-21 07:44:43,3306,2,0.016
4050245,979,2016-04-21 07:44:43,9251,2,0.033
4050246,979,2016-04-21 07:44:44,9251,2,0.003
4050247,979,2016-04-21 07:44:47,9251,2,0.002
4050248,979,2016-04-21 07:44:48,3306,2,0.054
4050249,979,2016-04-21 07:44:48,9251,2,0.086
4050250,979,2016-04-21 07:44:49,3306,2,0.003


time: 48.7 ms


In [19]:
working_usage.loc[4050241:4050259]

Unnamed: 0,uid,timestamp,loc,app_id,traffic
4050241,979,2016-04-21 07:44:42,9251,763,0.014
4050242,979,2016-04-21 07:44:42,9251,2,0.001
4050243,979,2016-04-21 07:44:42,9251,763,0.002
4050244,979,2016-04-21 07:44:43,9251,2,0.016
4050245,979,2016-04-21 07:44:43,9251,2,0.033
4050246,979,2016-04-21 07:44:44,9251,2,0.003
4050247,979,2016-04-21 07:44:47,9251,2,0.002
4050248,979,2016-04-21 07:44:48,9251,2,0.054
4050249,979,2016-04-21 07:44:48,9251,2,0.086
4050250,979,2016-04-21 07:44:49,9251,2,0.003


time: 14.9 ms


In [32]:
working_usage.to_csv('rm_oscillated_data.txt', header=False, index=False, sep=',')

time: 21.1 s


# Handle the oscillatoin problem

## Hybrid Approach

In [20]:
# import warnings
# warnings.filterwarnings('ignore')

time: 351 µs


In [21]:
# # Function to align the Origin and Destination (O/D)
# def sort_movement_tuples(jump):
#     return sorted(list(jump))

# # Define a function that can output the frequency pair of a user's mobility pattern
# def show_frequent_pair(uid=888, threshold=5, jump_occurence=10):

#     user_uid = usage[usage['uid'] == str(uid)]

#     # Set the thresold of time gap allowed between movement
#     delta_threshold = timedelta(seconds=threshold)

#     # Create the columns of the next time and lcoation
#     user_uid['next_loc'] = user_uid['loc'].shift(-1).fillna(0).astype('int')
#     user_uid['next_timestamp'] = user_uid['timestamp'].shift(-1)

#     # Keep only movements
#     user_uid = user_uid[user_uid['loc'] != user_uid['next_loc']]

#     # Get the time gaps between movements
#     user_uid['time_delta'] = user_uid['next_timestamp'] - user_uid['timestamp']

#     # Get teleports and count the teleports
#     # Should be able to identify the return trips
#     teleports = user_uid[user_uid['time_delta'] < delta_threshold]
#     teleports['tele'] = list(zip(teleports['loc'], teleports['next_loc']))
#     # print(teleports['tele'].value_counts())

#     # Group and count the same jump
#     tele_counts = teleports['tele'].value_counts().reset_index()
#     tele_counts.columns = ['jump', 'count']

#     # Align the O/D of the jump to identify the pairs
#     tele_counts['jump'] = tele_counts['jump'].apply(lambda i: sort_movement_tuples(i))

#     # Get the jumps that has a reasonable number of occurence
#     tele_counts = tele_counts[tele_counts['count'] > jump_occurence]
#     tele_counts['jump'] = tuple(tele_counts['jump'])

#     # print the record for reference
#     frequent_pairs = tele_counts.groupby(['jump']).sum()
#     # print(frequent_pairs)

#     return frequent_pairs


time: 280 µs


In [22]:
# # Show the frequent pairs of user 13, gap less than 3 seconds, and occurence higher than 5 times (both directions count)
# freq_pair = show_frequent_pair(13,3,5)
# # freq_pair.head(20)

time: 548 µs


### Algorithm to approximate the baseID clusters

In [23]:
# # experiment = ['888','772','837']
# experiment = [772]

# base_cluster = []
# skipped_jump = []

# for user in experiment:
#     freq_pair = show_frequent_pair(user,1,5)

#     # Loop over the unique jump pattern
#     for i in freq_pair.index:

#         # Get the first and second baseID
#         loc1 = i[0]
#         loc2 = i[1]
        
#         # Start storing clusters
#         if len(base_cluster) == 0:
#             base_cluster.append(list(i)) # Add the first jump to the cluster

#         # Start expanding or creating new clusters
#         # Loop through each cluster
#         for pos in range(len(base_cluster)):

#             # if loc1 is in the cluster, we assume loc2 is a base station near the cluster. Vice versa
#             # Then we add both baseID to the cluster. Either one of them is already in there anway
#             if loc1 == base_cluster[pos][0]:
                
#                 # print(list(i))
#                 # We add the new baseID to the existing cluster
#                 base_cluster[pos] += list(i)
#                 # base_cluster[pos]  = set(a)
#                 new_cluster = 0 # Saying we do not need to create a new cluster
#                 break

#             # If one of the baseID is linked to the cluster, we flag up to create a new cluster
#             else:
#                 new_cluster = 1
                
#         # Save the jump later for the reason below
#         if new_cluster == 1:
#             base_cluster.append(list(i))
#             new_cluster = 0

# # Previous method
# # Issue happens a the end of the loop. A jump with two new baseID created a new cluster. Then the next jump has one baseID connected to previous cluster. In that case, the new cluster created in the previous jump should be included in the bigger cluster. Therefore, we skipped the jumps and process it later in the cell below


# # a = [sorted(set(list(x))) for x in base_cluster]
# print("We approximated {} clusters.".format(len(base_cluster)))

time: 339 µs


In [24]:
# base_mapper = {}
# for i in base_cluster:
#     base_mapper[i[0]] = sorted(list(set(i[1:])))

# # pprint(base_mapper)

time: 290 µs


In [25]:
# a = [set(list(x)) for x in base_cluster]

time: 722 µs


In [26]:
# # We count the occurence of baseID from all cluster
# # I assume the baseID will not repeat but it is not
# count_occurence = {}
# for combo in a:
#     for place in combo:
#         if place in count_occurence.keys():
#             count_occurence[place] += 1
#         else:
#             count_occurence[place] = 1

# # Show results in a dictionary
# from collections import Counter
# results = dict(Counter(list(count_occurence.values())))
# print(results)
# print()
# print("Some baseID appear in more than one clusters")
# print("Have to investigate why")

time: 462 µs


In [27]:
# [[k, v] for k, v in sorted(count_occurence.items(), key=lambda item: item[1], reverse=True)][:10]

time: 410 µs


## Speed Approach

In [28]:
# def reduce_noise(uid=888, threshold=300, show_graph=False):

#     user_uid = usage[usage['uid'] == str(uid)]
#     user_uid.reset_index(inplace=True, drop=True)
#     user_uid.drop(['app_id', 'traffic'], axis=1, inplace=True)

#     # Set the thresold of time gap allowed between movement
#     delta_threshold = timedelta(seconds=threshold)

#     # Create the columns of the previous time and location
#     user_uid['prev_loc'] = user_uid['loc'].shift(1).fillna(9999).astype('int')
#     user_uid['prev_timestamp'] = user_uid['timestamp'].shift(1)
#     user_uid['original'] = user_uid['loc']
#     # Get the time gaps between movements
#     user_uid['time_delta'] =  user_uid['timestamp'] - user_uid['prev_timestamp']


#     origin_user_uid = user_uid.copy()

#     before = user_uid['loc'].value_counts().shape[0]

#     for i in user_uid[user_uid['time_delta'] < timedelta(seconds=threshold)].index:
#         user_uid.iloc[i,2] = user_uid.iloc[i-1,2]

#     after = user_uid['loc'].value_counts().shape[0]


#     if show_graph == True:

#         print("="*50)
#         print("From {} to {} unique_base...".format(before, after))
#         print("="*50)
#         print()
        
#         clean_user_uid = user_uid.copy()

#         fig, axs = plt.subplots(2, figsize=(15,3))
#         fig.suptitle('Uid: {}'.format(uid))

#         # plt.figure(figsize=(15,3))
#         axs[0].scatter(origin_user_uid['timestamp'],origin_user_uid['loc'], alpha=0.5)
#         axs[1].scatter(clean_user_uid['timestamp'],clean_user_uid['loc'], alpha=0.5)
#         plt.show()
    
#     return int(before - after)

time: 1.68 ms


In [29]:
# sample_group = [10, 17, 888]
# threshold = 1200 # seconds

# reduce_result = []
# for u in sample_group:
#     reduce_result.append(reduce_noise(u, threshold, True)) # 
# print("Done")

time: 1.3 ms


In [30]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

time: 298 µs


In [31]:
# usage.head(10)

time: 278 µs
