In [2]:
import numpy as np
import onnxruntime
import pandas as pd

In [13]:
# download and uncompress
# reference: https://stackoverflow.com/questions/15352668/download-and-decompress-gzipped-file-in-memory

import gzip
import urllib.request

def download_file(file_url: str, out_file_name: str = None):
    """
    Download a file from ``file_url`` and write it to disk.

    If the downloaded payload is gzip-compressed it is decompressed before
    being written, and a trailing ``.gz`` is dropped from the output name.
    Payloads that are not gzip data are written unchanged.

    Parameters
    ----------
    file_url : str
        URL to fetch; passed as-is to ``urllib.request.urlopen``.
    out_file_name : str, optional
        Destination path. Defaults to the text after the last ``/`` of
        ``file_url``.

    Returns
    -------
    str
        The name of the file that was written.

    Raises
    ------
    ValueError
        If no output name was given and none can be derived from the URL
        (for example when the URL ends with ``/``).
    """
    if out_file_name is None:
        out_file_name = file_url[file_url.rfind("/") + 1:]
    if not out_file_name:
        raise ValueError(f"Cannot derive an output file name from URL: {file_url!r}")

    # Read the whole body inside a context manager so the connection is
    # closed deterministically instead of being left to the garbage collector.
    with urllib.request.urlopen(file_url) as response:
        payload = response.read()

    # Detect gzip by its two-byte magic number instead of assuming every
    # download is compressed; plain files used to crash in gzip.decompress.
    if payload[:2] == b"\x1f\x8b":
        payload = gzip.decompress(payload)
        if out_file_name.endswith(".gz"):
            out_file_name = out_file_name[:-3]

    with open(out_file_name, 'wb') as outfile:
        outfile.write(payload)

    return out_file_name


In [15]:
# Fetch the gzipped ONNX model and write the decompressed file into the
# current working directory; the call returns the written file name.
# NOTE(review): the host name looks like an internal machine - confirm it is
# reachable from wherever this notebook is run.
download_file("http://jerrylia-lx-1.guest.corp.microsoft.com:8081/model.onnx.gz")

'model.onnx'

In [16]:
# Name of the model file on disk (the download cell above returned 'model.onnx').
onnx_model_file = 'model.onnx'
# Open an ONNX Runtime inference session on that file. None is passed as the
# second argument - presumably the session options, i.e. defaults; confirm
# against the onnxruntime API.
session = onnxruntime.InferenceSession(onnx_model_file, None)

In [50]:
# Verify the input/output of the model
# Print name, shape and element type of every tensor the session expects.
print("Input(s):")
for (i, inp) in enumerate(session.get_inputs()):
    print(f"Input #{i} name  :{inp.name}")
    print(f"Input #{i} shape :{inp.shape}")
    print(f"Input #{i} type  :{inp.type}")

# Same report for the tensors the session produces. These lines were
# previously labelled "Input" by mistake, which made the dump misleading.
print("Output:")
for (i, outp) in enumerate(session.get_outputs()):
    print(f"Output #{i} name  :{outp.name}")
    print(f"Output #{i} shape :{outp.shape}")
    print(f"Output #{i} type  :{outp.type}")

Input(s):
Input #0 name  :app
Input #0 shape :['N', 141]
Input #0 type  :tensor(float)
Input #1 name  :url
Input #1 shape :['N', 5]
Input #1 type  :tensor(float)
Output:
Input #0 name  :dense_5/Sigmoid:0
Input #0 shape :['N', 1]
Input #0 type  :tensor(float)


In [107]:
# Signal Processor code

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer

class UrlProcessor(object):
    """
    Convert a raw URL string into a short list of integer token ids.

    On construction the URL dictionary file (``dict_file``) is read, every URL
    in it is tokenized, and a Keras ``Tokenizer`` vocabulary is fitted on the
    result. ``process_url`` then maps a new URL onto that vocabulary.
    """

    # Not used when building the vocabulary (it is fitted on every row of
    # dict_file); kept so existing references to the attribute keep working.
    num_train: int = 1000000
    # CSV without a header row; read as three columns: index, url, category.
    dict_file: str = "URL.csv"
    # Passed to the Keras Tokenizer as num_words. An int, since it is a word count.
    MAX_NB_WORDS: int = 1000000
    # Maximum number of token ids returned by process_url.
    max_token_len: int = 5

    @staticmethod
    def get_tokenizer():
        """
        Build the word tokenizer and the stop-word set used for URLs.

        Returns
        -------
        tuple
            ``(tokenizer, stop_words)`` where ``tokenizer`` is an NLTK
            ``RegexpTokenizer`` that extracts runs of ASCII letters, and
            ``stop_words`` is the NLTK English stop-word list extended with
            common URL fragments (scheme, ``www``, a few TLDs).
        """
        # NOTE(review): stopwords.words() needs the NLTK "stopwords" corpus to
        # be installed locally - confirm it is available in the target env.
        tokenizer_nltk = RegexpTokenizer(r'[a-zA-Z]+')
        stop_words = set(stopwords.words('english'))
        stop_words.update(['http', 'https', 'www', 'com', 'html', 'org', 'ru', 'jp', 'uk', 'ca', '//'])
        return tokenizer_nltk, stop_words

    def __init__(self):
        self.tokenizer_nltk, self.stop_words = UrlProcessor.get_tokenizer()
        self.tokenizer_keras = self.cache_url_dict()
        # Original (misspelled) attribute name, kept as an alias so existing
        # code that reads it keeps working.
        self.tokenzier_keras = self.tokenizer_keras

    def tokenize(self, raw_url: str):
        """
        Split ``raw_url`` into alphabetic tokens and drop stop words.

        The stop-word comparison is case-sensitive (tokens are not
        lower-cased here).
        """
        url_tokens = self.tokenizer_nltk.tokenize(raw_url)
        return [word for word in url_tokens if word not in self.stop_words]

    def cache_url_dict(self):
        """
        Read ``dict_file`` and fit a Keras ``Tokenizer`` on all of its URLs.

        Rows with missing values are dropped first. Prints the vocabulary
        size and returns the fitted tokenizer.
        """
        url_df = pd.read_csv(self.dict_file, header=None)
        url_df.columns = ['index', 'url', 'category']
        url_df = url_df.dropna()

        # The vocabulary is fitted on every URL in the file, exactly as
        # before. The previous code also drew an unused num_train-row sample
        # (which raised on files with fewer rows) and re-tokenized the already
        # clean tokens a second time; both were no-ops for the result.
        processed_docs = [" ".join(self.tokenize(raw_url)) for raw_url in url_df['url']]

        tokenizer_keras = Tokenizer(num_words=self.MAX_NB_WORDS, lower=True, char_level=False)
        tokenizer_keras.fit_on_texts(processed_docs)
        print("dictionary size: ", len(tokenizer_keras.word_index))
        return tokenizer_keras

    def process_url(self, raw_url: str):
        """
        Map ``raw_url`` to at most ``max_token_len`` vocabulary ids.

        Returns a list of ints taken from the front of the token sequence.
        The list is not padded, so it can be shorter than ``max_token_len``.
        """
        # Feed the tokenizer the same kind of input it was fitted on (one
        # space-joined string) so it applies the same lower-casing as at fit
        # time; a raw token list may bypass it, depending on the Keras version.
        url_text = " ".join(self.tokenize(raw_url))
        return self.tokenizer_keras.texts_to_sequences([url_text])[0][0:self.max_token_len]



In [109]:
%%time
# Build the URL processor. The constructor reads the URL dictionary file and
# fits the vocabulary on it, which is why this cell is timed.
urlProc = UrlProcessor()

dictionary size:  1068970
Wall time: 34.2 s


In [110]:
# Smoke test: encode one sample URL into its (at most five) vocabulary ids.
urlProc.process_url("https://microsoft.visualstudio.com/OSGData/_git/enterprise.mhhd?path=%2Fthebox%2Fprototypes%2Fsessionattrpred%2Fmultimodal_dnn_baseline.ipynb&version=GBdev")

[498, 543298, 17602, 3123, 277]

In [95]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

class AppProcessor(object):
    """
    One-hot encode application (process) names.

    On construction the app dictionary file (``dict_file``) is read and a
    scikit-learn ``OneHotEncoder`` is fitted on its ``Process`` column.
    ``process_app`` then encodes a single app name with that encoder.
    """

    # Number of rows sampled (without replacement) to fit the encoder.
    num_train: int = 1000000
    # CSV with a header row; must contain a 'Process' column.
    dict_file: str = "AppResult.csv"

    def __init__(self):
        self.encoder = self.cache_app_transform()

    def cache_app_transform(self):
        """
        Read ``dict_file`` and fit the one-hot encoder on sampled app names.

        Rows with missing values are dropped, the table is repeated 1024
        times, and ``num_train`` rows are sampled with a fixed seed before
        fitting. Prints the number of categories and returns the encoder.
        """
        app_session_df = pd.read_csv(self.dict_file).dropna()

        # 1024 back-to-back copies: the same frame the ten successive
        # doublings used to build, but with one concat instead of ten
        # intermediate copies. The row order is unchanged, so the seeded
        # sample below picks the same rows as before.
        replicated_df = pd.concat([app_session_df] * 1024, axis=0)

        # Fixed random_state keeps the sampled rows reproducible run to run.
        app_train_df = replicated_df.sample(n=self.num_train, random_state=1)

        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
        # removed the old name; try the new spelling first, fall back for
        # older installations.
        try:
            enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
        enc.fit(app_train_df['Process'].values.reshape(-1, 1))

        print(f"Encoder created with category count: {enc.categories_[0].shape}")
        return enc

    def process_app(self, app_name: str):
        """
        Encode one app name; returns whatever the encoder's ``transform``
        yields for a single-row, single-column input.
        """
        return self.encoder.transform(np.array([app_name]).reshape(-1, 1))

In [96]:
%%time
# Build the app processor. The constructor reads the app dictionary file and
# fits the one-hot encoder on it.
appProc = AppProcessor()

Encoder created with category count: (141,)
Wall time: 506 ms


In [140]:
# Smoke test: one-hot encode a single app name with the fitted encoder.
appProc.process_app("chrome.exe")

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [54]:
# Tensor names used to address the model, taken by position: the first
# input is treated as the app tensor, the second as the url tensor.
_model_inputs = session.get_inputs()
input_app_name = _model_inputs[0].name
input_url_name = _model_inputs[1].name
# Only the first output of the model is used.
output_name = session.get_outputs()[0].name

In [141]:
# Sample app / URL pair used to exercise the model end to end.
test_app = "chrome.exe"
test_url = "https://microsoft.visualstudio.com/OSGData/_git/enterprise.mhhd?path=%2Fthebox%2Fprototypes%2Fsessionattrpred%2Fmultimodal_dnn_baseline.ipynb&version=GBdev"
# Both features become float32 arrays with a single row (batch of one).
test_app_in = np.asarray(appProc.process_app(test_app), dtype=np.float32).reshape(1, -1)
test_url_in = np.asarray([urlProc.process_url(test_url)], dtype=np.float32).reshape(1, -1)

In [132]:
# Sanity check: these shapes should line up with the input shapes the session
# reported earlier (['N', 141] for app and ['N', 5] for url).
print(f"test_app shape: {test_app_in.shape}")
print(f"test_url shape: {test_url_in.shape}")

test_app shape: (1, 141)
test_url shape: (1, 5)


In [133]:
# Confirm the element type of the app tensor after the float32 cast.
test_app_in.dtype

dtype('float32')

In [134]:
%%time
result = session.run(
    [output_name], 
    {input_app_name: test_app_in, 
     input_url_name: test_url_in}
)

Wall time: 192 ms


In [142]:
# Human-readable names for the two classes, keyed by the scorer's result.
labels = { 1: "work", 0: "play"}
# Scores strictly above this value count as class 1.
score_threshold = 0.4


def scorer(x):
    """Return 1 when ``x`` is greater than ``score_threshold``, otherwise 0."""
    if x > score_threshold:
        return 1
    return 0


# Label for the first (and only requested) output of the inference cell.
labels[scorer(result[0])]

'play'