# Base model for baseline comparison

In [2]:
#%load_ext autotime

import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from heapq import nlargest
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from scipy import stats
from tqdm.notebook import tqdm

pd.set_option("display.max_colwidth",200)
pd.set_option("display.max_columns",20)
pd.set_option('float_format', '{:.3f}'.format)

In [10]:
#import dataset
usage = pd.read_csv(r"C:\Users\Brian Santoso\CS48 - Predicting User Patterns From IoT Data\dataset\rm_oscillated_cleaned_data.txt", delimiter=',', \
    nrows=3911,\
    names=['uid','datetime','loc','app_id','traffic'], \
    dtype={'uid': int, 'datetime': str, 'loc': int, 'app_id': int, 'trafific': float})
usage['datetime'] = usage['datetime'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")) # Convert to datetime object
usage['day'] = usage['datetime'].apply(lambda x: x.day)
n_users = len(usage['uid'].unique())
print("Successfully loaded data")
print("-"*50)
print('Number of users', n_users)
print('-'*50)
print(usage.head())
print('-'*50)
print(usage['day'].unique())

Successfully loaded data
--------------------------------------------------
Number of users 1
--------------------------------------------------
   uid            datetime   loc  app_id  traffic  day
0    0 2016-04-21 08:42:21  8194     342    0.030   21
1    0 2016-04-21 08:44:12  8194       1    0.008   21
2    0 2016-04-21 08:44:47  8194     857    0.027   21
3    0 2016-04-21 08:44:48  8194     857    0.002   21
4    0 2016-04-21 08:44:49  8194      31    0.009   21
--------------------------------------------------
[21 22 23 24 25 26]


# Data preprocessing

# Data Preprocessing

From the DeepApp paper, they preprocess the data according to their definition. <br>
They use 30-min interval as a session. A window is 24 hours which consists of 48 intervals. <br>
They did not implement a dataloader. Rather they treat each window as a batch. (**Can be fine-tuned**)<br>
<br>
The preprocessed data should look like this: <br>

<br>data: {
    <br>user: 
        <br>'20-Apr': {
            <br>'tim' : [list of time in the window Shape(48,1)],
            <br>'loc' : [lsit of loc in the window Shape(48,1)],
            <br>'app': [list of multi-hot-code vector in the window. Shape(48, 2000)]    
        <br>},
        <br>'21-Apr': {
            <br>'tim' : [list of time in the window Shape(48,1)],
            <br>'loc' : [lsit of loc in the window Shape(48,1)],
            <br>'app': [list of multi-hot-code vector in the window. Shape(48, 2000)]
        <br>}, ........
    <br>},
    <br>user2: {
        <br>Same pattern........
    <br>}
<br>}
<br>
<br>
**Brief summary of the helper functions**
- generate_input returns a train set and a test set, adds more field like ptim, app_target, loc_target, uid, tim_o, loc_o, topk, etc.
- generate_queue returns a queue so it pops a window of a user everytime and feed to the model

<br>
<br>
In other words, if we use a dataloader instead. For each user, trainloader has 3 batches, Vall has 1. Each batch has 48 samples. If we want to use dataloader instead, we have to make sure each iteration of it contains the information of one user?? Or order does not matter at all.

In [11]:
print(np.zeros(48,1))

TypeError: Cannot interpret '1' as a data type

In [12]:
# Set the variable for interval span
span = 30
interval_span = str(span) + 'T'
n_intervals = int(60*24/span)
print('# of intervals', n_intervals)

# Floor the datetime then convert it to h:m:s
#  according to the interval span such that we can aggregate the requests in a session
usage['floored_time'] = usage['datetime'].apply(lambda x: pd.Timestamp.floor(x, freq=interval_span).time())

# Map the h:m:s to interval id e.g. 1, 2, 3, 4, 5
# Generate the intervals that will be matched to the timestamp of the data
intervals = pd.date_range('2020/1/1', freq=interval_span, periods=48)
intervals = [i.time() for i in intervals]

mapper = {}
for i, time in enumerate(intervals):
    mapper[time] = int(i)

# Map the id to the dataframe
usage['interval_id'] = usage['floored_time'].map(mapper)

# Drop the floored_time columns
usage = usage.drop(['floored_time', 'traffic', 'datetime'], axis=1)

# of intervals 48


In [13]:
# Multihot code the app_id
app_np = usage['app_id'].to_numpy()

# Create the empty array to hold the one-hot-code app
app_multihot = np.zeros([len(usage), 2000])

# Set the corresponding app index to 1
for i, app in enumerate(app_np):
    app_multihot[i, app-1] = 1 # INDEXING STARTS FROM ZERO

# Drop the app_id before doing numpy
usage = usage.drop('app_id', axis=1)

# Convert the dataframe into numpy array
usage_np = usage.to_numpy()
usage_np = np.concatenate([usage_np, app_multihot], 1) # add the uid, loc, day, session_id and app tgt

In [14]:
data = {}
users = []
visited_session = []
session = np.zeros([2,2002])

loop = tqdm(enumerate(usage_np), total = usage_np.shape[0])
for n, i in loop:

    u, loc, day, session_id, app = i[0], i[1], i[2], int(i[3]), i[4:]

    if n == 0:
        prev_session_id = session_id
        prev_u = u

    # Create a holding array for each user, padded with number of sessions per day
    if u not in users:
        users.append(u)
        data[u] = {20:np.zeros([48,2002]), 21:np.zeros([48,2002]), 22:np.zeros([48,2002]), \
             23:np.zeros([48,2002]), 24:np.zeros([48,2002]), 25:np.zeros([48,2002]), 26:np.zeros([48,2002])}

        # For tracking the pre-processing
        # if len(users) % 10 == 0:
        #     print('Processed {}/{} users'.format(len(users), n_users))
        
        loop.set_description(f"Processed {len(users)}/{n_users} users")
    
    # If the next request is still in the same interval
    if session_id == prev_session_id:
        request = np.expand_dims(np.concatenate([[session_id], [loc], app]), axis=0)
        session = np.concatenate([session,request] )
        prev_session_id = session_id

    elif session_id != prev_session_id:

        # print('User {} made {} requests in session {}'.format(int(u), len(session), session_id))
        visited_session.append(session_id)

        # Create a session
        s_loc = np.expand_dims(stats.mode(session[:, 1]).mode, axis=0)
        s_app = np.expand_dims(np.sum(session[:, 2:], axis=0), axis=0)
        s_sid = np.expand_dims([prev_session_id], axis=0)

        s = np.concatenate([s_sid, s_loc, s_app], axis=1)

        data[u][day][prev_session_id] = s # Set the interval of the day

        # Reset the session initiated by the new request in the sessino
        session = request = np.expand_dims(np.concatenate([[session_id], [loc], app]), axis=0)
        prev_session_id = session_id
        # break
print('Done with all users!')    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3911.0), HTML(value='')))


Done with all users!


In [15]:
import collections
unique, counts = np.unique(data[0][21][17], return_counts = True)
dict(zip(unique, counts))

{0.0: 1990,
 1.0: 2,
 2.0: 1,
 3.0: 1,
 4.0: 1,
 5.0: 2,
 8.0: 1,
 12.0: 1,
 16.0: 1,
 17.0: 1,
 8194.0: 1}

In [16]:
data[0][21]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Hands on example (1 User)

### for user 0, from cleaned data, only 21,22,23,24,25,26th data is avaliable
### 21-25 will be train set, 26 will be test set

## From previous documentation: First 2 columns is time(session number in a day, e.g. session 0 is from 12:00 - 12:30) and location (base_id, mode:base_id with max count)
## All else columns are just representing 2000 different apps, 1/0 representing used or not

## Split Train and Test Dataset

In [17]:
##concat dataset, 1 day has 48 rows, therefore concat 5 days for training
user0_data_concat = np.concatenate((data[0][21], data[0][22], data[0][23], data[0][24], data[0][25]), axis = 0)
# put it in df for ez slicing
df_train = pd.DataFrame(data = user0_data_concat)
#remember, first 2 rows is feature, others are labels, we are doing a multi-label, mulitioutput classification
X_train = np.array(df_train.iloc[:,0:2])
y_train = np.array(df_train.iloc[:,2:])
print(X_train.shape)
print(y_train.shape)

(240, 2)
(240, 2000)


In [18]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001
0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
3,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
236,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
237,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
238,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [19]:
#same for test set
df_test = pd.DataFrame(data = data[0][26])
X_test = np.array(df_test.iloc[:,0:2])
y_test = np.array(df_test.iloc[:,2:])
print(X_test.shape)
print(y_test.shape)

(48, 2)
(48, 2000)


## Model Fitting: user 0 (train: first 5 days, test: 6th day)

In [44]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


#### MLP Classifier

In [41]:
#just random picked one model for testing, should have a lot of models
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier ##/
# from sklearn.linear_model import LogisticRegression ##X
from sklearn.neural_network import MLPClassifier

clf_MLP = MultiOutputClassifier(MLPClassifier(random_state=1, max_iter=300)).fit(X_train, y_train)
pred = clf_MLP.predict(X_test)

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import zero_one_loss
from sklearn.metrics import hamming_loss
acc_list = []
zero_one_list = []
hamming_list = []

y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
#compare and calculate matrics row by row, finally take the loss of the matrics list
for i in range(y_test.shape[0]):
    acc = accuracy_score(y_test[i],pred[i])
    zero_one = zero_one_loss(y_test[i],pred[i])
    hamming = hamming_loss(y_test[i],pred[i])

    acc_list.append(acc)
    zero_one_list.append(zero_one)
    hamming_list.append(hamming)

print("Accuracy: {:.5f}, Zero/One Loss: {:.5f}, Hamming Loss: {:.5f}".format(np.mean(acc_list), np.mean(zero_one_list), np.mean(hamming_list)))

Accuracy: 0.99843, Zero/One Loss: 0.00157, Hamming Loss: 0.00157


#### MLR Classifier

In [34]:
# try adding MLR 
from sklearn.linear_model import LogisticRegression

# define the model
#clf_MLR = MultiOutputClassifier(LogisticRegression(random_state=0).fit(X_train, y_train))
#pred = clf_MLR.predict(X_test)


#### Naive Bayes Classifier

In [37]:
# try adding NB Classifier
from sklearn.naive_bayes import GaussianNB

# define the model
clf_NB = MultiOutputClassifier(GaussianNB()).fit(X_train, y_train)
pred = clf_NB.predict(X_test)

In [38]:
acc_list = []
zero_one_list = []
hamming_list = []

y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
#compare and calculate matrics row by row, finally take the loss of the matrics list
for i in range(y_test.shape[0]):
    acc = accuracy_score(y_test[i],pred[i])
    zero_one = zero_one_loss(y_test[i],pred[i])
    hamming = hamming_loss(y_test[i],pred[i])

    acc_list.append(acc)
    zero_one_list.append(zero_one)
    hamming_list.append(hamming)

print("Accuracy: {:.5f}, Zero/One Loss: {:.5f}, Hamming Loss: {:.5f}".format(np.mean(acc_list), np.mean(zero_one_list), np.mean(hamming_list)))

Accuracy: 0.99757, Zero/One Loss: 0.00243, Hamming Loss: 0.00243


#### KNN Classifier

In [39]:
from sklearn.neighbors import KNeighborsClassifier
clf_KNN = MultiOutputClassifier(KNeighborsClassifier()).fit(X_train, y_train)
pred = clf_KNN.predict(X_test)

In [40]:
acc_list = []
zero_one_list = []
hamming_list = []

y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
#compare and calculate matrics row by row, finally take the loss of the matrics list
for i in range(y_test.shape[0]):
    acc = accuracy_score(y_test[i],pred[i])
    zero_one = zero_one_loss(y_test[i],pred[i])
    hamming = hamming_loss(y_test[i],pred[i])

    acc_list.append(acc)
    zero_one_list.append(zero_one)
    hamming_list.append(hamming)

print("Accuracy: {:.5f}, Zero/One Loss: {:.5f}, Hamming Loss: {:.5f}".format(np.mean(acc_list), np.mean(zero_one_list), np.mean(hamming_list)))

Accuracy: 0.99871, Zero/One Loss: 0.00129, Hamming Loss: 0.00129
