# TensorFlow and other basic stuff

Example item in the data frame:
```json
{ 
    "system" : {
        "class" : { "a" : 8.0, "mu" : 1.0, "arrival_stream_type" : "Uniform", "arrival_e2d2" : 3.0, "service_stream_type" : "Poisson", "service_e2d2" : 1.0 },
        "v" : 1 
    },
    "stat" : { 
        "states" : [ 
            { "p" : 0.07650509235411221, "out_new" : 12.249978258197231, "out_end" : 0.0 },
            { "p" : 0.9234949076458878, "out_new" : 7.8230198889517375, "out_end" : 1.0148249981889534 } ],
        "v" : 1,
        "no_of_events" : 18458,
        "metadata" : { 
            "min_no_of_events_per_state" : 103, 
            "uuid" : { "$binary" : "KAEP7WQWRQq6ixFm2z+Zzg==", "$type" : "04" },
            "version" : "0.3.0" } 
    } 
}
```
where:
- `system` is the system description: the traffic class parameters and the system capacity.
- `stat` holds the statistics of a single simulation run. There are many series of such simulations, and they should be averaged. If one series is significantly different from the others, it should be dropped.
  - each stat has V+1 states.
  - each state has its probability and passage intensities (out_new and out_end).
  - each stat has metadata:
    - min_no_of_events_per_state defines how long the simulation is. Increasing this value increases the simulation quality. When processing the stats, statistics with a value below a given threshold should be ignored and dropped.
    - a unique UUID that can be used to identify wrong statistics.
    - version is used in case of bug detection. After a bug is fixed, the version number is increased.

The main goal is to obtain out_new and out_end using machine learning. Based on the distribution for a system with capacity V, we want to obtain the parameters for systems with capacities 1, 2, ..., V-1.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient
import math

# Print NumPy arrays with 3 digits of precision and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to database ``db``.

    When both ``username`` and ``password`` are truthy, the connection is made
    through a ``mongodb://`` URI that carries the credentials; otherwise an
    unauthenticated client is created from ``host`` and ``port``.

    The credentials are percent-escaped before being embedded in the URI, so
    reserved characters such as ``@``, ``:``, ``/`` or ``%`` in the user name
    or password no longer corrupt the URI.

    Returns:
        The database object obtained by indexing the client with ``db``.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import quote_plus

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo(db="anystream", collection="statistics", query=None, host='mongo.adamkaliszan.pl', port=27017, username='anonymus', password='password', no_id=True):
    """Run ``query`` against a MongoDB collection and return the result as a DataFrame.

    Args:
        db: name of the database to open.
        collection: name of the collection to query.
        query: MongoDB filter document; ``None`` (the default) selects every
            document, exactly as the previous ``{}`` default did.
        host, port, username, password: connection settings forwarded to
            ``_connect_mongo``.
        no_id: when true, the ``_id`` column is removed from the result.

    Returns:
        A DataFrame with one row per matching document. It is empty (and has
        no columns) when nothing matches.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if query is None:
        query = {}

    # Keep the database handle under its own name instead of rebinding the
    # ``db`` argument (a name) to a different kind of object.
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Materialize the cursor so the DataFrame is built from plain dicts.
    cursor = database[collection].find(query)
    df = pd.DataFrame(list(cursor))

    # An empty result has no ``_id`` column, so guard the delete to avoid
    # a KeyError in that case.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

def getResults(df, v=0, **kwargs):
    """Select the rows of ``df`` that describe a system of capacity ``v``.

    Each keyword argument further restricts the selection to rows whose
    ``system['class'][<keyword>]`` equals the given value.

    Returns:
        The filtered frame; the original index labels are kept.
    """
    capacities = df.system.map(lambda system: system['v'])
    selected = df.loc[capacities == v]

    for param, wanted in kwargs.items():
        param_values = selected.system.map(lambda system: system['class'][param])
        selected = selected.loc[param_values == wanted]

    return selected

def getClasses(df, v = 1, **kwargs):
    """Return the distinct traffic classes present in ``df``.

    Args:
        df: frame with a ``system`` column of dicts holding ``'v'`` and
            ``'class'`` entries.
        v: system capacity to restrict to; a value ``<= 0`` disables the
            capacity filter and considers every row.
        **kwargs: ``name=value`` pairs; only rows whose
            ``system['class'][name] == value`` are considered.

    Returns:
        A one-column frame (column ``system``) holding each distinct class
        dict once, in order of first appearance, with a fresh 0..n-1 index.
    """
    # Start from the whole frame so that ``v <= 0`` means "no capacity
    # filter" instead of leaving ``dfF`` unbound.
    dfF = df
    if v > 0:
        dfF = dfF.loc[dfF.system.map(lambda x: x['v']) == v]

    for key, value in kwargs.items():
        dfF = dfF.loc[dfF.system.map(lambda x: x['class'][key]) == value]

    classes = dfF.system.map(lambda x: x['class'])

    # dicts are unhashable, so they cannot be de-duplicated by hashing;
    # compare their textual form instead (keeps the first occurrence).
    isRepeated = classes.map(repr).duplicated()
    return classes[~isRepeated].reset_index(drop=True).to_frame()

def getStatistics(df, tr_class, v, params):
    """Collect per-state statistics of one traffic class in a system of capacity ``v``.

    Args:
        df: frame with ``system`` and ``stat`` columns of dicts.
        tr_class: class dict; rows are selected when ``system['class']``
            equals it and ``system['v'] == v``.
        v: system capacity; states ``0..v`` are read from every statistic.
        params: mapping ``state key -> column prefix``, e.g.
            ``{"out_new": "a"}``.

    Returns:
        A frame with one row per distinct statistic and the columns
        ``"<prefix>(<state>)_<v>"``, ordered by parameter and then by state.
        The frame is empty (but has the columns) when nothing matches.
    """
    matching = df.loc[df.system.map(lambda x: (x['v'] == v and x['class'] == tr_class))]

    # The statistics are dicts (unhashable), so identical entries are
    # detected through their textual form; the first occurrence is kept.
    stats = matching.stat
    stats = stats[~stats.map(repr).duplicated()]

    columns = [f"{name}({i})_{v}" for name in params.values() for i in range(0, v + 1)]

    values = [
        [states[i][par] for par in params for i in range(0, v + 1)]
        for states in stats.map(lambda x: x['states'])
    ]

    return pd.DataFrame(data=values, columns=columns)


def calculateAvarages(series):
    """Average every column of ``series`` and return the means as a one-row frame.

    The returned frame keeps the column labels of ``series``.
    """
    column_means = series.mean(axis=0)
    return column_means.to_frame().transpose()

def dropClassesExact(dfCl, **kwargs):
    """Drop the classes whose parameter equals the given value.

    Args:
        dfCl: frame with a ``system`` column of class dicts.
        **kwargs: ``name=value`` pairs; a class is removed when
            ``class[name] == value`` for any of the pairs.

    Returns:
        The remaining classes with a fresh 0..n-1 index.
    """
    dfRes = dfCl
    for key, value in kwargs.items():
        # Filter the already reduced frame. Indexing the original ``dfCl``
        # here would discard the earlier filters and use a mask whose index
        # no longer matches the frame being indexed.
        dfRes = dfRes.loc[dfRes.system.map(lambda x: x[key] != value)]
    return dfRes.reset_index(drop=True)

def dropClassesLess(dfCl, **kwargs):
    """Drop the classes whose parameter is less than the given value.

    Args:
        dfCl: frame with a ``system`` column of class dicts.
        **kwargs: ``name=value`` pairs; a class is removed when
            ``class[name] < value`` for any of the pairs.

    Returns:
        The remaining classes with a fresh 0..n-1 index.
    """
    dfRes = dfCl
    for key, value in kwargs.items():
        # Keep the rows that are NOT less than the threshold, mirroring how
        # dropClassesExact removes (rather than keeps) the matching rows.
        dfRes = dfRes.loc[dfRes.system.map(lambda x: not (x[key] < value))]
    return dfRes.reset_index(drop=True)

def dropClassesGreater(dfCl, **kwargs):
    """Drop the classes whose parameter is greater than the given value.

    Args:
        dfCl: frame with a ``system`` column of class dicts.
        **kwargs: ``name=value`` pairs; a class is removed when
            ``class[name] > value`` for any of the pairs.

    Returns:
        The remaining classes with a fresh 0..n-1 index.
    """
    dfRes = dfCl
    for key, value in kwargs.items():
        # Keep the rows that are NOT greater than the threshold, mirroring
        # how dropClassesExact removes (rather than keeps) the matching rows.
        dfRes = dfRes.loc[dfRes.system.map(lambda x: not (x[key] > value))]
    return dfRes.reset_index(drop=True)

def _averageClassStatistics(V, df, trClass, params, minSeries):
    """Return the averaged statistic frames of one class, or None when any capacity has too few series."""
    averaged = []
    for par, name in params.items():
        for v in range(1, V + 1):
            series = getStatistics(df, trClass, v, {par: name})
            if len(series) < minSeries:
                print(f"skipping class {trClass}, not enough series (only {len(series)}) for system with capacity {v}")
                return None
            averaged.append(calculateAvarages(series))
    return averaged


def prepareResults(V, dfClasses, df, minSeries=5):
    """Build one row of averaged statistics per traffic class.

    For every class the statistics ``p``, ``out_new`` and ``out_end`` (column
    prefixes ``p``, ``a`` and ``s``) are averaged over all simulation series
    for each capacity ``1..V``. A class is skipped entirely when any capacity
    has fewer than ``minSeries`` series.

    Args:
        V: largest system capacity to include.
        dfClasses: one-column frame (or any iterable) of class dicts.
        df: raw statistics frame passed on to ``getStatistics``.
        minSeries: minimum number of series required per capacity.

    Returns:
        A frame whose first columns describe the class, followed by the
        averaged statistics ordered by parameter, capacity and state. When
        no class qualifies, an empty frame with only the class columns is
        returned.
    """
    params = {"p": "p", "out_new": "a", "out_end": "s"}
    ClColumns = ["a", "arrival_stream_type", "arrival_e2d2", "service_stream_type", "service_e2d2"]

    # Take the first column explicitly: ``squeeze()`` would collapse a
    # single-row frame into the bare dict, and the loop would then iterate
    # over its keys instead of over the classes.
    if isinstance(dfClasses, pd.DataFrame):
        classes = dfClasses.iloc[:, 0] if dfClasses.shape[1] else []
    else:
        classes = dfClasses

    dfFinalItems = []
    for trClass in classes:
        averaged = _averageClassStatistics(V, df, trClass, params, minSeries)
        if averaged is None:
            continue
        dfClass = pd.DataFrame(columns=ClColumns, data=[[trClass[column] for column in ClColumns]])
        dfFinalItems.append(pd.concat([dfClass] + averaged, axis=1).reset_index(drop=True))

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not dfFinalItems:
        return pd.DataFrame(columns=ClColumns)

    return pd.concat(dfFinalItems).reset_index(drop=True)

%matplotlib
def plotDf(V, df, NoOfFig = 15, PlotsPerRow = 6):
    """Plot the ``a(x)_v`` columns of the first rows of ``df``, one subplot per row.

    Every subplot holds one curve per capacity ``v`` in ``1..V``, drawn over
    the states ``0..v``. At most ``NoOfFig`` rows are plotted (fewer when
    ``df`` is shorter), arranged ``PlotsPerRow`` subplots per grid row.
    """
    columnNames = [f"a({x})_{v}" for v in range(1, V+1) for x in range(0, v+1)]
    intensities = df[columnNames].copy().astype(np.float32).to_numpy()

    figure = plt.figure()

    plotCount = min(NoOfFig, len(df))
    gridRows = math.ceil(plotCount/PlotsPerRow)

    for rowNo in range(plotCount):
        axes = figure.add_subplot(gridRows, PlotsPerRow, rowNo + 1)
        # The columns of one capacity are contiguous: v + 1 values each.
        start = 0
        for v in range(1, V + 1):
            stop = start + v + 1
            axes.plot(list(range(0, v + 1)), intensities[rowNo, start:stop])
            start = stop


### Loading raw data and training data preparation


In [None]:
# Load only the statistics whose metadata reports at least 100 events per state.
df = read_mongo(query = {"stat.metadata.min_no_of_events_per_state" : { "$gte": 100 }})
print(f"df len = {len(df)}")

# Largest system capacity considered in the rest of the notebook.
V = 20
print(f"Loading available traffic classes available in system V={V}")
colClasses = getClasses(df, V)
print(f"{len(colClasses)} traffic classes is available")
print("Dropping classes with arrival E²ð² = 1 (gamma distribution, where E²= ð² -> lambda distribution)")
# Remove the classes whose arrival_e2d2 parameter equals 1.
colClasses = dropClassesExact(colClasses, arrival_e2d2 = 1)
print(f"{len(colClasses)} traffic classes is available")

print(f"Preparation of final statistics (for system with capacity V=1, ..., {V})")
# One row per traffic class with statistics averaged over the simulation series.
dfFinal = prepareResults(V, colClasses, df)
print(f"{len(dfFinal)} systems is available")

In [None]:
# Plot the averaged "a" columns of the first prepared systems.
plotDf(V, dfFinal)

### TensorFlow part

In [None]:
%config IPCompleter.greedy=True
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import sklearn
from sklearn import preprocessing
from tensorflow.keras.layers.experimental import preprocessing


# Report the TensorFlow version in use.
print(f"Tensor Flow version: {tf.__version__}")


## Input/Output and Training/Verification data selection

In [None]:
def getInputColNames(V, SR = 0, InputPar = ['a', 's']):
    """List the input column names ``"<par>(<state>)_<v>"``.

    Only capacities ``v`` in ``1..V`` that are multiples of ``SR`` are used;
    ``SR == 0`` stands for ``SR = V``, i.e. only the capacity ``V`` itself.
    Names are ordered by parameter, then capacity, then state ``0..v``.
    """
    step = V if SR == 0 else SR
    capacities = [v for v in range(1, V + 1) if v % step == 0]

    names = []
    for par in InputPar:
        for v in capacities:
            for state in range(v + 1):
                names.append(f"{par}({state})_{v}")
    return names

def getLabelColNames(V, SR = 0, LabelPar = ['a']):
    """List the label column names ``"<par>(<state>)_<v>"``.

    Only capacities ``v`` in ``1..V-1`` that are NOT multiples of ``SR`` are
    used; ``SR == 0`` stands for ``SR = V``, which selects every capacity
    below ``V``. Names are ordered by parameter, then capacity, then state
    ``0..v``.
    """
    step = V if SR == 0 else SR
    capacities = [v for v in range(1, V) if v % step != 0]

    names = []
    for par in LabelPar:
        for v in capacities:
            for state in range(v + 1):
                names.append(f"{par}({state})_{v}")
    return names

def getSubLabelColNames(V, LabelPar = ['a']):
    """List the label column names ``"<par>(<state>)_<V>"`` of the single capacity ``V``.

    Names are ordered by parameter, then state ``0..V``.
    """
    names = []
    for par in LabelPar:
        for state in range(V + 1):
            names.append(f"{par}({state})_{V}")
    return names


In [None]:
# Column names of the model inputs and of the values to be predicted.
inputColNames = getInputColNames(V)
labelColNames = getLabelColNames(V)
#print("inputColNames:", inputColNames)
#print("labelColNames:", labelColNames)

# Extract the selected columns as float32 copies.
# NOTE: the name `input` shadows the Python built-in of the same name.
input = dfFinal[inputColNames].copy().astype(np.float32)
label = dfFinal[labelColNames].copy().astype(np.float32)

print(f"Input shape {input.shape}")
print(f"Label shape {label.shape}")

input.head()
#plt.plot(input.to_numpy()[0:17,0:6])

# Seeded split: 67% of the rows for training, 33% for testing.
from sklearn.model_selection import train_test_split
input_train, input_test, label_train, label_test = train_test_split(input, label, test_size=0.33, random_state=42)

## Tensorflow model

In [None]:
def createSubsetModel(dfTraining, V, functionsCalculateLen, SR = 0):
    """Build a multi-branch Keras model with one pre-trained branch per capacity 1..V-1.

    All branches share a single input layer. For every capacity ``vo`` a chain
    of Dense layers is created and a temporary model made of that chain is
    trained on the label columns of ``vo``. The branch outputs are finally
    concatenated into one model, which is compiled and returned without any
    further training.

    Args:
        dfTraining: frame holding the input and label columns.
        V: capacity of the system that provides the inputs.
        functionsCalculateLen: callables mapping ``vo`` to the width of each
            intermediate Dense layer, in order.
        SR: forwarded to ``getInputColNames``; 0 stands for ``V``.

    Returns:
        The compiled combined ``tf.keras.Model``.
    """
    if SR == 0: SR = V
    inputColNames  = getInputColNames(V, SR)
    InpLayer = tf.keras.layers.Input(shape=(len(inputColNames),),name="input")
    input_train = dfTraining[inputColNames].copy().astype(np.float32)

    # LvlLayers[k] collects the k-th layer of every branch: index 0 is the
    # first Dense layer, 1..len(functionsCalculateLen) are the intermediate
    # layers and the last list holds the branch outputs.
    LvlLayers = []
    LvlLayers.append([])
    for ln in range(0, len(functionsCalculateLen)):
        LvlLayers.append([])
    LvlLayers.append([])
    
    for vo in range(1,V):
        #labelColNames  = [f"{l}({x})_{vo}" for l in ['a'] for x in range(0, vo+1)]
        labelColNames = getSubLabelColNames(vo)
        print(f"v = {vo}: {labelColNames}")
        label_train = dfTraining[labelColNames].copy().astype(np.float32)



        # First layer of the branch: as wide as the input.
        LvlLayers[0].append(tf.keras.layers.Dense(len(inputColNames), name=f"V_{vo}.layer_1")(InpLayer))
        
        # Intermediate layers; the branch of capacity vo sits at index vo-1
        # of every level list.
        for idx, function in enumerate(functionsCalculateLen):
            LvlLayers[idx+1].append(tf.keras.layers.Dense(function(vo), name=f"V_{vo}.layer_{idx+2}")(LvlLayers[idx][vo-1]))

        # Output layer of the branch: one unit per state 0..vo.
        LvlLayers[-1].append(tf.keras.layers.Dense(vo+1, name=f"V_{vo}.layer_{len(functionsCalculateLen) + 2}")(LvlLayers[-2][vo-1]))

        # Train this branch on its own through a temporary model that shares
        # the layers with the final combined model.
        tmpModel = tf.keras.Model(inputs=InpLayer,outputs=LvlLayers[-1][vo-1])
        loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")
        optimizer = tf.keras.optimizers.Adam()#learning_rate=0.0002)
        tmpModel.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
        tmpModel.fit(input_train.to_numpy(), label_train.to_numpy(), epochs=20, batch_size=1, verbose=1)

    # Join all branch outputs into one output vector.
    # NOTE(review): when no branch was built (V <= 1) the empty list itself is
    # passed as the model output - confirm that this case is never needed.
    OutLayer = tf.keras.layers.Concatenate(name="Output")(LvlLayers[-1]) if len(LvlLayers[-1]) > 0 else LvlLayers[-1]
    model = tf.keras.Model(inputs=InpLayer,outputs=OutLayer)
    loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss=loss_fn, metrics=['accuracy'])

    return model

def createModel(dfTraining, V, SR = 0):
    """Build, compile and train a single dense network for all label columns.

    Args:
        dfTraining: frame holding the input and label columns.
        V: capacity of the system that provides the inputs.
        SR: forwarded to ``getInputColNames`` / ``getLabelColNames``;
            0 stands for ``V``.

    Returns:
        The ``tf.keras.Sequential`` model after 30 epochs of training.
    """
    inputColNames = getInputColNames(V, SR)
    labelColNames = getLabelColNames(V, SR)
    input_train = dfTraining[inputColNames].copy().astype(np.float32)
    label_train = dfTraining[labelColNames].copy().astype(np.float32)

    # Size the first and last layers from the frames selected above, not
    # from notebook-level globals, so the model matches the data for any
    # combination of V and SR.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(input_train.shape[1]),
        tf.keras.layers.Dense(540, activation='relu'),
        tf.keras.layers.Dense(2160, activation='relu'),
        tf.keras.layers.Dense(540, activation='relu'),
        tf.keras.layers.Dense(label_train.shape[1]),
        ])

    loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")
    model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])
    model.fit(input_train.to_numpy(), label_train.to_numpy(), epochs=30)
    return model

def verifyModel(df, model, V):
    """Evaluate ``model`` on ``df`` and plot its predictions.

    The input and label columns for capacity ``V`` are taken from ``df``, the
    model is evaluated on them, and the predicted labels are joined with the
    inputs and handed to ``plotDf`` (10 plots, 5 per row).
    """
    featureCols = getInputColNames(V)
    targetCols = getLabelColNames(V)

    features = df[featureCols].copy().astype(np.float32)
    targets = df[targetCols].copy().astype(np.float32)

    model.evaluate(features, targets, verbose=2)

    # Wrap the raw predictions so they line up with the input rows.
    predicted = pd.DataFrame(
        columns=targetCols, data=model.predict(features), index=features.index)
    print(f"input type {type(features)}, pr_label type {type(predicted)}")

    plotDf(V, pd.concat([features, predicted], axis=1), 10, 5)
    

#def SaveAllModels(V, distrName, startRow):
#    occDistr = LoadData(distrName, V, startRow)
#    for v in range(2,V+1):
#        result = CreateModel(v, v, occDistr)
#        result.save(f"./trained_models/{distrName}/gamma/model_all_points")

## Verification

Preparation of training and verification data.

In [None]:
# Use 75% of the prepared systems for training and the rest for verification.
# The fixed seed makes the split reproducible between notebook runs.
dfTraining = dfFinal.sample(frac=0.75, random_state=42)
dfVerification = dfFinal.drop(dfTraining.index)


Variant with single model

In [None]:
# Train one network on the training part and check it on the verification part.
model = createModel(dfTraining, V)
verifyModel(dfVerification, model, V)

Variant with many models

In [None]:
# Widths of the intermediate Dense layers of each branch, as functions of the
# branch capacity x.
functionsCalculateLen = [
    lambda x: x*2,
    lambda x: x*15,
    lambda x: x*3,
]
# Build the combined model; its branches are trained inside createSubsetModel.
model = createSubsetModel(dfTraining, V, functionsCalculateLen)

verifyModel(dfVerification, model, V)

Display Model graph

In [None]:
# Render the layer graph of the most recently built model.
tf.keras.utils.plot_model(model)