In [1]:
# Librerias

import logging
import pandas as pd
from tabulate import tabulate 

from trumania.core import circus, operations
from trumania.core.random_generators import ConstantDependentGenerator, SequencialGenerator, NumpyRandomGenerator
import trumania.core.util_functions as util_functions

import pandas as pd

In [7]:
# Create the circus: the world in which every element of the scenario will exist.
# The simulated clock starts on 25 Oct 2020 at 00:00 and advances in 2-hour steps;
# per the original note, each story gets a chance to execute at every step.
rm_circus = circus.Circus(
    name="rm",
    master_seed=12345,
    start=pd.Timestamp("25 Oct 2020 00:00"),
    step_duration=pd.Timedelta("2h"),
)

In [8]:
# Load the raw IMEI and IMSI catalogues from their JSON files.
# dtype=False turns off pandas' dtype inference on the data, so identifier
# columns are not converted to numbers.
df_imei = pd.read_json(
    '../rm/datasets/imei_dataset/imei_dataset.json',
    dtype=False,
)
df_imsi = pd.read_json(
    '../rm/datasets/imsi_dataset/imsi_dataset.json',
    dtype=False,
)

# Generación de dataset Normal 

In [9]:
# Keep only the expected country: rows whose MCC is '214' (Spain, per the original note).
# The comparison is against a string because the file was loaded without dtype inference.
df_imsi = df_imsi[df_imsi['mcc'].eq('214')]
df_imsi

Unnamed: 0,mcc,mnc,msin,imsi
121200,214,23,4209664000,214234209664000
121201,214,23,9509610424,214239509610424
121202,214,23,6741145085,214236741145085
121203,214,23,3368808866,214233368808866
121204,214,23,1060750036,214231060750036
...,...,...,...,...
123395,214,04,9849761172,214049849761172
123396,214,04,9922596057,214049922596057
123397,214,04,3280220616,214043280220616
123398,214,04,8022966266,214048022966266


In [10]:
# Prepare the IMEI and IMSI catalogues for the simulation.

# Seed used for the shuffles below so that the selected devices are identical on
# every run (same value as the circus master_seed).
SHUFFLE_SEED = 12345

# Drop the columns that are not needed (assignment instead of inplace mutation).
df_imei = df_imei.drop(columns=['brand', 'model'])

# Remove duplicated IMEIs. drop_duplicates returns a new frame, so its result must
# be assigned; keep='first' removes exactly the rows that duplicated() reports,
# without relying on hard-coded index labels.
duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores duplicados encontrados en df_imei:\n {duplicateRowsDF} \n')
df_imei = df_imei.drop_duplicates(subset=['imei'], keep='first')
duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores borrados. Comprobacion:\n {duplicateRowsDF} \n')

# Shuffle the entries reproducibly, then keep the first 2200 of them.
# 2200 has to match the size of the "rm" population created later in the notebook.
df_imei = df_imei.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_imei = df_imei.iloc[:2200, :]

# Disabled: create an entry for unassigned values
#na_imei = {'imei':['000000000000000']}
#df_na = pd.DataFrame(data=na_imei)

# Disabled: concatenate the IMEI dataframes
#df_imei = df_imei.append(df_na).reset_index(drop=True)

print(f'{"-"*50}\n')

# Report duplicated MSINs in the IMSI catalogue (report only, nothing is removed).
duplicateRowsDF = df_imsi[df_imsi.duplicated(['msin'])]
print(f'Valores duplicados encontrados en df_imsi:\n {duplicateRowsDF} \n')

# Drop the columns that are not needed and shuffle the entries reproducibly.
df_imsi = df_imsi.drop(columns=['mcc', 'mnc', 'msin'])
df_imsi = df_imsi.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

display(df_imei)
display(df_imsi)

Valores duplicados encontrados en df_imei:
                    imei
54653   356526077030868
155460  356808074750163 

Valores borrados. Comprobacion:
 Empty DataFrame
Columns: [imei]
Index: [] 

--------------------------------------------------

Valores duplicados encontrados en df_imsi:
 Empty DataFrame
Columns: [mcc, mnc, msin, imsi]
Index: [] 



Unnamed: 0,imei
0,861510034501359
1,356347069397961
2,354131073129004
3,357214098334168
4,356864095059134
...,...
2195,864730048567546
2196,866199038579903
2197,863176034843049
2198,351971078082317


Unnamed: 0,imsi
0,214082434643450
1,214032235235647
2,214160699314114
3,214195649939081
4,214111150288222
...,...
2195,214031167339802
2196,214265620999420
2197,214155032970636
2198,214228947885234


In [11]:
# Build the population (the actors of the circus), here "rm", and attach its attributes.
# Member ids are produced sequentially with the "rm_" prefix.
id_gen = SequencialGenerator(prefix="rm_")
rm = rm_circus.create_population(
    name="rm",
    size=2200,
    ids_gen=id_gen,
)

# One IMEI and one IMSI per member, taken in order from the prepared catalogues.
# explode() flattens any list-like cells and leaves scalar cells unchanged.
rm.create_attribute(
    "imei",
    init_values=df_imei['imei'].explode().tolist(),
)
rm.create_attribute(
    "imsi",
    init_values=df_imsi['imsi'].explode().tolist(),
)

<trumania.core.attribute.Attribute at 0x7f582e85acc0>

In [12]:
# Show the population as a DataFrame: member ids as the index, one column per attribute.
rm.to_dataframe()

Unnamed: 0,imei,imsi
rm_0000000000,861510034501359,214082434643450
rm_0000000001,356347069397961,214032235235647
rm_0000000002,354131073129004,214160699314114
rm_0000000003,357214098334168,214195649939081
rm_0000000004,356864095059134,214111150288222
...,...,...
rm_0000002195,864730048567546,214031167339802
rm_0000002196,866199038579903,214265620999420
rm_0000002197,863176034843049,214155032970636
rm_0000002198,351971078082317,214228947885234


In [13]:
# Number of entries as a function of time: timer generator driven by the circus
# clock and seeded from the circus seeder.
from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator

story_timer_gen = DefaultDailyTimerGenerator(
    clock=rm_circus.clock,
    seed=next(rm_circus.seeder),
)

In [14]:
# Three traffic profiles: 4, 6 and 8 story executions per day (low, medium, high).
low_activity, med_activity, high_activity = (
    story_timer_gen.activity(n=events_per_day, per=pd.Timedelta("1 day"))
    for events_per_day in (4, 6, 8)
)

In [15]:
# Give each traffic profile a probability: 20% low, 70% medium, 10% high.
# Per the original note, every dataset entry ends up with a single traffic profile.
activity_gen = NumpyRandomGenerator(
    method="choice",
    a=[low_activity, med_activity, high_activity],
    p=[.2, .7, .1],
    seed=next(rm_circus.seeder),
)

In [16]:
# Create the story: it is initiated by the "rm" population, exposes the member id
# in the "rm_id" field, and is paced by the timer and activity generators.
rm_story = rm_circus.create_story(
    name="rm_story",
    initiating_population=rm_circus.populations["rm"],
    member_id_field="rm_id",
    timer_gen=story_timer_gen,
    activity_gen=activity_gen,
)



In [None]:
# Generator for the RAT value, with probabilities 2G 10%, 3G 30%, 4G 50%, 5G 10%.
rat_gen = NumpyRandomGenerator(
    method="choice",
    a=['2G','3G','4G','5G'],
    p=[0.1, 0.3, 0.5, 0.1],
    seed=next(rm_circus.seeder),
)

In [None]:
# Create the rm -> rat relationship: one generated RAT value per population member.
rat_rel = rm_circus.populations["rm"].create_relationship("rat")
rat_rel.add_relations(
    from_ids=rm.ids,
    to_ids=rat_gen.generate(size=rm.size),
)

In [None]:
# Define the columns of the dataset to generate: timestamp, RAT, IMEI and IMSI.
rm_population = rm_circus.populations["rm"]

rm_story.set_operations(
    # Current clock time, stored as "time".
    rm_circus.clock.ops.timestamp(named_as="time"),

    # RAT picked from the "rat" relationship of the member in "rm_id".
    rm_population.get_relationship("rat").ops.select_one(
        from_field="rm_id",
        named_as="rat",
    ),

    # Attribute lookups for the member in "rm_id".
    rm_population.ops.lookup(id_field="rm_id", select={"imei": "imei"}),
    rm_population.ops.lookup(id_field="rm_id", select={"imsi": "imsi"}),

    # Write the resulting fields to the log identified as "rm".
    operations.FieldLogger(log_id="rm"),
)

In [None]:
# Generate the dataset: simulate 15 days, replacing any logs already in the output folder.
rm_circus.run(
    duration=pd.Timedelta("15 days"),
    log_output_folder="../rm/datasets/trumania_dataset_v3/normal",
    delete_existing_logs=True,
)

# Reload the generated log with every column as a string, order it by time,
# and store the sorted version next to the raw log.
df = (
    pd.read_csv(
        "../rm/datasets/trumania_dataset_v3/normal/rm.csv",
        dtype={'rm_id': str, 'time': str, 'rat': str, 'imei': str, 'imsi': str},
    )
    .sort_values(by=['time'])
    .reset_index(drop=True)
)
df.to_csv('../rm/datasets/trumania_dataset_v3/normal/rm_normal.csv', index=False)

display(df)

In [None]:
# Cast the "time" column from string to Timestamp, then check the resulting dtypes.
df['time'] = df['time'].pipe(pd.to_datetime)
df.dtypes

In [None]:
# Check the dataset: share of each RAT value, as a percentage of all rows.
print("Porcentaje de RAT en dataset:")
rat_share = df.groupby('rat').size() / len(df) * 100
display(pd.DataFrame({'Porcentaje': rat_share}))

# Check that the assignment stays the same when a member appears again,
# using one sample member ordered by time.
print('Visualzando si una entrada no varía al repetirse:')
sample_member_rows = df.loc[df['rm_id'] == 'rm_0000000306']
display(sample_member_rows.sort_values('time'))

In [None]:
# Count rows per (imei, imsi) pair for the sample member; a single group means
# the pair never changes across its entries.
df.loc[df['rm_id'].eq('rm_0000000306')].groupby(["imei", "imsi"]).count()

In [None]:
# Number of log entries per member, shown as a histogram.
usage_per_user = df.groupby("rm_id")["imsi"].count()
usage_per_user.plot.hist()

In [None]:
# Number of log entries per hour of the day (requires "time" to be a datetime column).
hour_of_day = df["time"].dt.hour
time_profile = df["imei"].groupby(hour_of_day).count()
time_profile.plot()