# Tensor flow and other basic stuff

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient

np.set_printoptions(precision=3, suppress=True)

def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]


def read_mongo(db="anystream", collection="statistics", query={}, host='mongo.adamkaliszan.pl', port=27017, username='anonymus', password='password', no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

def getResults(df, v=0, **kwargs):
    dfF = df.loc[df.system.map(lambda x: x['v']) == v]

    for key in kwargs:
        dfF = dfF.loc[dfF.system.map(lambda x: x['class'][key]) == kwargs[key]]

    return dfF

def getClasses(df, v = 1, **kwargs):            
    if v > 0:
        dfF = df.loc[df.system.map(lambda x: x['v']) == v]

    for key in kwargs:
        dfF = dfF.loc[dfF.system.map(lambda x: x['class'][key]) == kwargs[key]]

    return dfF.system.drop_duplicates().map(lambda x: x['class']).reset_index(drop=True)

def getStatistics(df, tr_class, v, params):
    dfF = df.loc[df.system.map(lambda x: (x['v'] == v and x['class'] == tr_class))]
    dfF = dfF.stat.drop_duplicates().map(lambda x: x['states']).reset_index()
    dfF = dfF.drop(columns=['index'])

    columns = []

    for name in params.values():
        for i in range(0, v+1):
            columns.append(f"{name}({i})_{v}")

    values = []

    for idx, item in dfF.iterrows():
        val_item = []
        for par in params:
            for i in range(0, v+1):
                val_item.append(item.stat[i][par])
        values.append(val_item)

    dfS = pd.DataFrame(data = values, columns = columns)
    return dfS


def calculateAvarages(series):
    result = series.mean(axis=0)
    return result.to_frame().T

In [3]:
df = read_mongo(query = {"stat.metadata.min_no_of_events_per_state" : { "$gte": 100 }})
print(f"df len = {len(df)}")
pd.set_option('display.max_colwidth', None)

colClasses = getClasses(df, 20)
print(f"Number of traffic classes: {len(colClasses)}")

df len = 6916
Number of traffic classes: 33


In [5]:
V = 20
colClasses = getClasses(df, V)
print(f"No of loaded classes: {len(colClasses)}")

No of loaded classes: 33


In [18]:
params = {"p": "p", "out_new" : "a", "out_end" : "s"}
dfFinalItems = []
for idx, trClass in enumerate(colClasses):
    print(f"{idx}/{len(colClasses)}: {trClass}")

    for par, name in params.items():
        avgStats = []
        for v in range (1, V+1):
            series = getStatistics(df, trClass, v, {par:name})
            avgStats.append(calculateAvarages(series))

    dfM = pd.concat(avgStats, axis=1).reset_index(drop=True)
    dfFinalItems.append(dfM)

dfFinal = pd.concat(dfFinalItems).reset_index()
print(dfFinal)

0/33: {'a': 10.0, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
1/33: {'a': 10.5, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
2/33: {'a': 11.0, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
3/33: {'a': 11.5, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
4/33: {'a': 12.0, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
5/33: {'a': 12.5, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
6/33: {'a': 13.0, 'mu': 1.0, 'arrival_stream_type': 'Gamma', 'arrival_e2d2': 1.0, 'service_stream_type': 'Poisson', 'service_e2d2': 1.0}
7/33: {'a': 13.5, 'mu': 1.0, 'arrival_str

  dfFinal = pd.concat(dfFinalItems).reset_index()


In [1]:
%config IPCompleter.greedy=True
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import sklearn
from sklearn import preprocessing
from tensorflow.keras.layers.experimental import preprocessing


print(f"Tensor Flow version: {tf.__version__}")


Exemplary item in data frame:
```json
{ 
    "system" : {
        "class" : { "a" : 8.0, "mu" : 1.0, "arrival_stream_type" : "Uniform", "arrival_e2d2" : 3.0, "service_stream_type" : "Poisson", "service_e2d2" : 1.0 },
        "v" : 1 
    },
    "stat" : { 
        "states" : [ 
            { "p" : 0.07650509235411221, "out_new" : 12.249978258197231, "out_end" : 0.0 },
            { "p" : 0.9234949076458878, "out_new" : 7.8230198889517375, "out_end" : 1.0148249981889534 } ],
        "v" : 1,
        "no_of_events" : 18458,
        "metadata" : { 
            "min_no_of_events_per_state" : 103, 
            "uuid" : { "$binary" : "KAEP7WQWRQq6ixFm2z+Zzg==", "$type" : "04" },
            "version" : "0.3.0" } 
    } 
}
```
where:
- system is the system description. Traffic class parameters and systems capacity
- stat are single simulation statistics. There are many series of such simulation that should be averaged. Is one series is significantly different, it should be dropped. 
  - each stat has V+1 states.
  - each state has its probability and passage intensities (out_new and out_end).
  - each stat has metadata:
    - min_no_of_events defines how log is the simulation. Increasing this value increases the simulation quality. During processing the stats, statistics with value below given threshold should be ignored and dropped
    - unique UUID tat can be used for discrimination of wrong statistics
    - version is used in case of bug detection. After fixing the bug, the version number is increased.

The main goal is to obtain out_new and out_end using machine learning. Base on distribution for system with V capacity we want to obtain parameters for systems with capacities 1, 2, ..., V-1


## Załadowanie danych z 60 symulacji

In [4]:
#occupancyDistribution = pd.read_csv("./results.csv", "\t")
occupancyDistribution = pd.read_csv("./resultsGamma_V10.csv", "\t")

print(occupancyDistribution.columns)
print(occupancyDistribution)
print(occupancyDistribution.describe())


TypeError: read_csv() takes 1 positional argument but 2 were given

In [None]:
V=10
SR=10
inputColNames  = [f"{l}({x})_{v}" for l in ['a', 's'] for v in range(1, V+1) if v%SR ==0 for x in range(0, v+1)]
labelColNames = [f"{l}({x})_{v}" for l in ['a'] for v in range(1, V+1) if v%SR !=0 for x in range(0, v+1)]

print("inputColNames:", inputColNames)
print("labelColNames:", labelColNames)

input = occupancyDistribution[inputColNames].copy().astype(np.float32)
label = occupancyDistribution[labelColNames].copy().astype(np.float32)

print(f"Input shape {input.shape}")
print(f"Label shape {label.shape}")

input.head()
#plt.plot(input.to_numpy()[0:17,0:6])

inputColNames: ['a(0)_10', 'a(1)_10', 'a(2)_10', 'a(3)_10', 'a(4)_10', 'a(5)_10', 'a(6)_10', 'a(7)_10', 'a(8)_10', 'a(9)_10', 'a(10)_10', 's(0)_10', 's(1)_10', 's(2)_10', 's(3)_10', 's(4)_10', 's(5)_10', 's(6)_10', 's(7)_10', 's(8)_10', 's(9)_10', 's(10)_10']
labelColNames: ['a(0)_1', 'a(1)_1', 'a(0)_2', 'a(1)_2', 'a(2)_2', 'a(0)_3', 'a(1)_3', 'a(2)_3', 'a(3)_3', 'a(0)_4', 'a(1)_4', 'a(2)_4', 'a(3)_4', 'a(4)_4', 'a(0)_5', 'a(1)_5', 'a(2)_5', 'a(3)_5', 'a(4)_5', 'a(5)_5', 'a(0)_6', 'a(1)_6', 'a(2)_6', 'a(3)_6', 'a(4)_6', 'a(5)_6', 'a(6)_6', 'a(0)_7', 'a(1)_7', 'a(2)_7', 'a(3)_7', 'a(4)_7', 'a(5)_7', 'a(6)_7', 'a(7)_7', 'a(0)_8', 'a(1)_8', 'a(2)_8', 'a(3)_8', 'a(4)_8', 'a(5)_8', 'a(6)_8', 'a(7)_8', 'a(8)_8', 'a(0)_9', 'a(1)_9', 'a(2)_9', 'a(3)_9', 'a(4)_9', 'a(5)_9', 'a(6)_9', 'a(7)_9', 'a(8)_9', 'a(9)_9']
Input shape (630, 22)
Label shape (630, 54)


Unnamed: 0,a(0)_10,a(1)_10,a(2)_10,a(3)_10,a(4)_10,a(5)_10,a(6)_10,a(7)_10,a(8)_10,a(9)_10,...,s(1)_10,s(2)_10,s(3)_10,s(4)_10,s(5)_10,s(6)_10,s(7)_10,s(8)_10,s(9)_10,s(10)_10
0,4.997979,4.998827,4.99775,4.999627,5.000669,4.996797,4.995011,4.999054,4.999118,4.997772,...,0.999995,1.99766,3.001118,3.999315,4.998825,6.001583,7.001068,8.006833,8.993415,10.009018
1,5.490233,5.499823,5.506721,5.497864,5.500509,5.49847,5.498195,5.500711,5.500023,5.498466,...,1.000545,1.998775,2.998182,3.999285,4.998265,6.001042,7.002124,7.994497,9.008581,10.000951
2,5.985474,5.993181,5.997361,5.997668,5.998716,6.001694,5.998439,6.001486,6.00176,5.994999,...,0.999076,1.996307,3.000478,4.000061,5.001675,6.001321,7.000861,7.997693,9.000064,9.999695
3,6.486227,6.502338,6.502121,6.500687,6.496743,6.498294,6.50126,6.501489,6.495543,6.500004,...,0.994825,1.997501,3.000973,3.999468,4.996759,6.003747,7.005655,7.998842,9.003159,10.000935
4,6.981313,7.012771,7.006184,7.007293,6.993352,7.00376,6.99985,7.002643,7.001285,7.003886,...,1.004527,2.004453,3.003385,3.999322,5.001534,5.996248,6.99858,8.001925,9.000112,10.000605


## Wykresy ##
Intensywność napływu zgłoszeń (Oś Y) w zależności od stanu (oś X).
Widać złą jakość symulacji

In [None]:
%matplotlib
LambdaColNames = [f"a({x})_{v}" for v in range(1, V+1) for x in range(0, v+1)]
lambdas = occupancyDistribution[LambdaColNames].copy().astype(np.float32)
#plt.plot(lambdas.to_numpy()[0, 0:100], [x for x in range(0, 100)])

fig = plt.figure()
rowOffset = 0
rowSkip = 3
for plotNo in range (1, 11):
    ax = fig.add_subplot(2, 5, plotNo)
    offset = 0
    for v in range (1, 11):
        ax.plot([x for x in range(0, v+1)], lambdas.to_numpy()[(plotNo-1) * rowSkip + rowOffset, offset:offset+v+1])
        offset = offset + v + 1


Using matplotlib backend: TkAgg


In [None]:
from sklearn.model_selection import train_test_split
input_train, input_test, label_train, label_test = train_test_split(input, label, test_size=0.33, random_state=42)

## Tworzenie modelu

In [None]:
model = tf.keras.Sequential([  
  tf.keras.layers.Dense(22),
  tf.keras.layers.Dense(540, activation='relu'),
  tf.keras.layers.Dense(2160, activation='relu'),
  tf.keras.layers.Dense(540, activation='relu'),
  tf.keras.layers.Dense(54),
])

loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])
model.fit(input_train.to_numpy(), label_train.to_numpy(), epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f586142fee0>

## Weryfikacja

In [None]:
model.evaluate(input_test, label_test, verbose=2)

3/3 - 0s - loss: 0.0855 - accuracy: 0.8023


[0.08552196621894836, 0.8023256063461304]

In [None]:
input_test

In [None]:
model = tf.keras.Sequential([  
  tf.keras.layers.Reshape([11,2]),
  tf.keras.layers.Conv1D(filters=16, kernel_size=3, padding='same', activation='relu', input_shape=(11,2)),
  tf.keras.layers.Dense(188),
  tf.keras.layers.Flatten(),

#  tf.keras.layers.Dropout(0.02),
#  tf.keras.layers.Dense(140),
  tf.keras.layers.Dense(662),
#  tf.keras.layers.Dropout(0.01),
  tf.keras.layers.Dense(54),
])

loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])
model.fit(input_train.to_numpy(), label_train.to_numpy(), epochs=30)