In [1]:
# Librerias

import logging
import pandas as pd
from tabulate import tabulate 

from trumania.core import circus, operations
from trumania.core.random_generators import ConstantDependentGenerator, SequencialGenerator, NumpyRandomGenerator
import trumania.core.util_functions as util_functions

import pandas as pd

In [2]:
# Creamos el circo (mundo en el que existirán todos los elementos del escenario)

rm_circus = circus.Circus(name="rm", 
                          master_seed=12345,
                          start=pd.Timestamp("25 Oct 2020 00:00"), # Fecha de inicio.
                          step_duration=pd.Timedelta("2h"))        # Cada hora, cada historia tiene posibilidad de ejecutarse

In [3]:
# Cargando imei_dataset.json y imsi_dataset.json

df_imei = pd.read_json('../rm/datasets/imei_dataset/imei_dataset.json', dtype = False)
df_imsi = pd.read_json('../rm/datasets/imsi_dataset/imsi_dataset.json', dtype = False)

# Generación de dataset Normal 

In [4]:
# Seleccionamos países previstos (España) 

df_imsi = df_imsi.loc[(df_imsi['mcc'] == '214')]
df_imsi

Unnamed: 0,mcc,mnc,msin,imsi
121200,214,23,4209664000,214234209664000
121201,214,23,9509610424,214239509610424
121202,214,23,6741145085,214236741145085
121203,214,23,3368808866,214233368808866
121204,214,23,1060750036,214231060750036
...,...,...,...,...
123395,214,04,9849761172,214049849761172
123396,214,04,9922596057,214049922596057
123397,214,04,3280220616,214043280220616
123398,214,04,8022966266,214048022966266


In [5]:
# Acomodamos dataset

df_imei.drop(['brand','model'], axis=1, inplace=True)    # Descartamos columnas no necesarias

# Borramos duplicados (drop duplicates no funciona)

duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores duplicados encontrados en df_imei:\n {duplicateRowsDF} \n')
df_imei.drop([54653, 155460], inplace=True)
duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores borrados. Comprobacion:\n {duplicateRowsDF} \n')

df_imei = df_imei.sample(frac=1).reset_index(drop=True)  # Aleatorizamos entradas
df_imei = df_imei.iloc[:2199,:]                        # Seleccionamos 4899 entradas

# Creamos entrada para valores no asignados
na_imei = {'imei':['000000000000000']}
df_na = pd.DataFrame(data=na_imei)

# Concatenamos dataframes de imei
df_imei = df_imei.append(df_na).reset_index(drop=True)

print(f'{"-"*50}\n')

# Borramos duplicados (drop duplicates no funciona)

duplicateRowsDF = df_imsi[df_imsi.duplicated(['msin'])]
print(f'Valores duplicados encontrados en df_imsi:\n {duplicateRowsDF} \n')

df_imsi.drop(['mcc','mnc','msin'], axis=1, inplace=True) # Descartamos columnas no necesarias
df_imsi = df_imsi.sample(frac=1).reset_index(drop=True)  # Aleatorizamos entradas

display(df_imei)
display(df_imsi)

Valores duplicados encontrados en df_imei:
                    imei
54653   356526077030868
155460  356808074750163 

Valores borrados. Comprobacion:
 Empty DataFrame
Columns: [imei]
Index: [] 

--------------------------------------------------

Valores duplicados encontrados en df_imsi:
 Empty DataFrame
Columns: [mcc, mnc, msin, imsi]
Index: [] 



Unnamed: 0,imei
0,356980064737540
1,864323049291306
2,358335076905866
3,358144098634292
4,864592041505760
...,...
2195,355307082843012
2196,353304079536024
2197,358561085276995
2198,353293087689141


Unnamed: 0,imsi
0,214046646231352
1,214098146792037
2,214083951739681
3,214053348078565
4,214045631428401
...,...
2195,214185093314768
2196,214270580418886
2197,214231999350202
2198,214150605465484


In [6]:
# Generamos la población (actores del circo) y añadimos sus atributos. En este caso RM

id_gen = SequencialGenerator(prefix="rm_")

rm = rm_circus.create_population(name="rm", size=2200, ids_gen=id_gen)
rm.create_attribute("imei", init_values=df_imei['imei'].explode().tolist())
rm.create_attribute("imsi", init_values=df_imsi['imsi'].explode().tolist())

<trumania.core.attribute.Attribute at 0x7f5850a27d30>

In [7]:
rm.to_dataframe()

Unnamed: 0,imei,imsi
rm_0000000000,356980064737540,214046646231352
rm_0000000001,864323049291306,214098146792037
rm_0000000002,358335076905866,214083951739681
rm_0000000003,358144098634292,214053348078565
rm_0000000004,864592041505760,214045631428401
...,...,...
rm_0000002195,355307082843012,214185093314768
rm_0000002196,353304079536024,214270580418886
rm_0000002197,358561085276995,214231999350202
rm_0000002198,353293087689141,214150605465484


In [8]:
# Número de entradas en función del tiempo

from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator

story_timer_gen = DefaultDailyTimerGenerator(
    clock=rm_circus.clock, 
    seed=next(rm_circus.seeder))

In [9]:
# Creamos tres permiles de tráfico

low_activity = story_timer_gen.activity(n=4, per=pd.Timedelta("1 day"))
med_activity = story_timer_gen.activity(n=6, per=pd.Timedelta("1 day"))
high_activity = story_timer_gen.activity(n=8, per=pd.Timedelta("1 day"))

In [10]:
# Asignamos a cada permil de tráfico una probabilidad. Cada entrada del dataset tendrá un permil de tráfico único.

activity_gen = NumpyRandomGenerator(
    method="choice", 
    a=[low_activity, med_activity, high_activity],
    p=[.2, .7, .1],
    seed=next(rm_circus.seeder))

In [11]:
# Creación de la historia. Se define población y su actividad en base la tiempo

rm_story = rm_circus.create_story(
    name="rm_story",
    initiating_population=rm_circus.populations["rm"],
    member_id_field="rm_id",
    
    timer_gen=story_timer_gen,
    activity_gen=activity_gen
)



In [12]:
# Generamos entrada rat y asignamos probabilidad.\

rat_gen = NumpyRandomGenerator(method="choice", 
                                   a=['2G','3G','4G','5G'], 
                                   p=[0.1, 0.3, 0.5, 0.1], 
                                   seed=next(rm_circus.seeder))

In [13]:
# Creamos relación rm -> rat

rat_rel = rm_circus.populations["rm"].create_relationship("rat")
rat_rel.add_relations(from_ids=rm.ids, to_ids=rat_gen.generate(size=rm.size))

In [14]:
# Definimos columnas del dataset que queremos generar.

rm_story.set_operations(
    rm_circus.clock.ops.timestamp(named_as="time"),
    rm_circus.populations["rm"].get_relationship("rat")
        .ops.select_one(from_field="rm_id", named_as="rat"),
    rm_circus.populations["rm"].ops.lookup(id_field="rm_id", select={"imei": "imei"}),
    rm_circus.populations["rm"].ops.lookup(id_field="rm_id", select={"imsi": "imsi"}),

    operations.FieldLogger(log_id="rm")
)

In [16]:
# Generamos dataset

rm_circus.run(
    duration=pd.Timedelta("15 days"),
    log_output_folder="../rm/datasets/trumania_dataset_v3/rm_normal",
    delete_existing_logs=True
)

df = pd.read_csv("../rm/datasets/trumania_dataset_v3/normal/rm.csv", 
                 dtype = {'rm_id':str,'time':str,'rat':str,'imei':str,'imsi':str})

df = df.sort_values(by=['time']).reset_index(drop=True)
df.to_csv('../rm/datasets/trumania_dataset_v3/normal/rm_normal.csv', index=False)

display(df)

FileNotFoundError: [Errno 2] No such file or directory: '../rm/datasets/trumania_dataset_v3/normal/rm.csv'

In [None]:
# Convertir columna en valores adecuados (cast). Comprobar tipos.

df['TIME'] = pd.to_datetime(df['TIME']) # Cast string a Timestamp
df.dtypes

In [None]:
# Comprobando dataset

print("Porcentaje de RAT en dataset:")
display(pd.DataFrame({'Porcentaje': df.groupby(('RAT')).size() / len(df) * 100}))

# Comprobando si asignación no cambia al repetirse la entrada

print('Visualzando si una entrada no varía al repetirse:')
display(df[df['rm_ID'] == 'rm_0000000306'].sort_values('TIME'))

In [None]:
df[df['rm_ID'] == 'rm_0000000306'].groupby(by=["IMEI",'IMSI']).count()

In [None]:
usage_per_user = df[["IMSI", "rm_ID"]].groupby("rm_ID")["IMSI"].count()
usage_per_user.plot(kind="hist")

In [None]:
time_profile = (
    df[["IMEI", "TIME"]]
    .groupby(by=df.TIME.dt.hour)["IMEI"]
    .count()
)
time_profile.plot()

# Generación de dataset Anómalo 

In [None]:
# Creamos el circo (mundo en el que existirán todos los elementos del escenario)

rm_circus = circus.Circus(name="rm_anom", 
                          master_seed=12345,
                          start=pd.Timestamp("25 Oct 2020 00:00"), # Fecha de inicio.
                          step_duration=pd.Timedelta("2h"))        # Cada hora, cada historia tiene posibilidad de ejecutarse

In [None]:
# Cargando imei_dataset.json y imsi_dataset.json

df_imei = pd.read_json('datasets/imei_dataset/imei_dataset.json', dtype = False)
df_imsi = pd.read_json('datasets/imsi_dataset/imsi_dataset.json', dtype = False)

In [None]:
# Seleccionamos países no previstos (Irak, Mongolia, Chad, Somalia, Nigeria, Afganistan, Corea del Norte, Mozambique, Yemen) 

df_imsi = df_imsi.loc[(df_imsi['mcc'] == '418') | (df_imsi['mcc'] == '428') | (df_imsi['mcc'] == '622')
           |(df_imsi['mcc'] == '637') | (df_imsi['mcc'] == '621') | (df_imsi['mcc'] == '412')
           |(df_imsi['mcc'] == '467') | (df_imsi['mcc'] == '643') | (df_imsi['mcc'] == '421')]
df_imsi

In [None]:
# Acomodamos dataset

df_imei.drop(['brand','model'], axis=1, inplace=True)    # Descartamos columnas no necesarias

# Borramos duplicados (drop duplicates no funciona)

duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores duplicados encontrados en df_imei:\n {duplicateRowsDF} \n')
df_imei.drop([54653, 155460], inplace=True)
duplicateRowsDF = df_imei[df_imei.duplicated(['imei'])]
print(f'Valores borrados. Comprobacion:\n {duplicateRowsDF} \n')

df_imei = df_imei.sample(frac=1).reset_index(drop=True)  # Aleatorizamos entradas
df_imei = df_imei.iloc[:4900,:]                        # Seleccionamos 4900 entradas

print(f'{"-"*50}\n')

# Borramos duplicados (drop duplicates no funciona)

duplicateRowsDF = df_imsi[df_imsi.duplicated(['msin'])]
print(f'Valores duplicados encontrados en df_imsi:\n {duplicateRowsDF} \n')
#df_imsi.drop([138052,140548], inplace=True)
#duplicateRowsDF = df_imsi[df_imsi.duplicated(['msin'])]
#print(f'Valores borrados. Comprobacion:\n {duplicateRowsDF}')

df_imsi.drop(['mcc','mnc','msin'], axis=1, inplace=True) # Descartamos columnas no necesarias
df_imsi = df_imsi.sample(frac=1).reset_index(drop=True)  # Aleatorizamos entradas

display(df_imei)
display(df_imsi)

In [None]:
# Generamos la población (actores del circo) y añadimos sus atributos. En este caso rm

id_gen = SequencialGenerator(prefix="rm_")

rm_anom = rm_circus.create_population(name="rm_anom", size=4900, ids_gen=id_gen)
rm_anom.create_attribute("IMEI", init_values=df_imei['imei'].explode().tolist())

In [None]:
rm_anom.to_dataframe()

In [None]:
# Número de entradas en función del tiempo

from trumania.core.clock import CyclicTimerGenerator, CyclicTimerProfile

story_timer_anom_gen = CyclicTimerGenerator(clock=rm_circus.clock,
                                            seed=next(rm_circus.seeder),
                                            config=CyclicTimerProfile(
                                                profile=[1, 10.3, 10.3, .15, .2, .4, 3.8,
                                                         7.2, 8.4, 9.1, 9.0, 8.3, 8.1,
                                                         7.7, 7.4, 7.8, 8.0, 7.9, 9.7,
                                                         10.4, 10.5, 8.8, 5.7, 2.8],
                                                profile_time_steps="1h",
                                                start_date= pd.Timestamp("11 Nov 2020 00:00:00"),
                                            )
                                           )

In [None]:
# Creamos tres permiles de tráfico

low_activity = story_timer_anom_gen.activity(n=4, per=pd.Timedelta("1 day"))
med_activity = story_timer_anom_gen.activity(n=6, per=pd.Timedelta("1 day"))
high_activity = story_timer_anom_gen.activity(n=8, per=pd.Timedelta("1 day"))

In [None]:
# Asignamos a cada permil de tráfico una probabilidad. Cada entrada del dataset tendrá un permil de tráfico único.

activity_gen = NumpyRandomGenerator(
    method="choice", 
    a=[low_activity, med_activity, high_activity],
    p=[.2, .7, .1],
    seed=next(rm_circus.seeder))

In [None]:
# Creación de la historia. Se define población y su actividad en base la tiempo

rm_anom_story = rm_circus.create_story(
    name="rm_anom_story",
    initiating_population=rm_circus.populations["rm_anom"],
    member_id_field="rm_ID",
    
    timer_gen=story_timer_anom_gen,
    activity_gen=activity_gen
)

In [None]:
imsi_rel = rm_circus.populations["rm_anom"].create_relationship("imsi")

In [None]:
# Generamos entrada imsi y asignamos probabilidad.

imsi_gen = NumpyRandomGenerator(method="choice", 
                                   a=df_imsi['imsi'], 
                                   seed=next(rm_circus.seeder))

In [None]:
for w in range(1,3):
    imsi_rel.add_relations(
        from_ids=rm_anom.ids,
        to_ids=imsi_gen.generate(size=rm_anom.size),
        weights=w
    )

In [None]:
# Generamos entrada rat y asignamos probabilidad.

rat_gen = NumpyRandomGenerator(method="choice", 
                                   a=['2G','3G','4G','5G'], 
                                   p=[0.1, 0.3, 0.5, 0.1], 
                                   seed=next(rm_circus.seeder))

In [None]:
# Creamos relación rm -> rat

rat_rel = rm_circus.populations["rm_anom"].create_relationship("rat")
rat_rel.add_relations(from_ids=rm_anom.ids, to_ids=rat_gen.generate(size=rm_anom.size))

In [None]:
# Definimos columnas del dataset que queremos generar.

rm_anom_story.set_operations(
    rm_circus.clock.ops.timestamp(named_as="TIME"),
    rm_circus.populations["rm_anom"].get_relationship("rat")
        .ops.select_one(from_field="rm_ID", named_as="RAT"),
    rm_circus.populations["rm_anom"].get_relationship("imsi")
        .ops.select_one(from_field="rm_ID", named_as="IMSI"),
    rm_circus.populations["rm_anom"].ops.lookup(id_field="rm_ID", select={"IMEI": "IMEI"}),

    operations.FieldLogger(log_id="rm_anom")
)

In [None]:
# Generamos dataset

rm_circus.run(
    duration=pd.Timedelta("15 days"),
    log_output_folder="datasets/trumania_dataset/anomaly",
    delete_existing_logs=True
)

df = pd.read_csv("datasets/trumania_dataset/anomaly/rm_anom.csv", 
                 dtype = {'rm_ID':str,'TIME':str,'RAT':str,'IMEI':str,'IMSI':str})

df = df.sort_values(by=['TIME']).reset_index(drop=True)
df.to_csv('datasets/trumania_dataset/anomaly/rm_anom.csv', index=False)

display(df)

In [None]:
# Convertir columna en valores adecuados (cast). Comprobar tipos.

df['TIME'] = pd.to_datetime(df['TIME']) # Cast string a Timestamp
df.dtypes

In [None]:
# Comprobando dataset

print("Porcentaje de RAT en dataset:")
display(pd.DataFrame({'Porcentaje': df.groupby(('RAT')).size() / len(df) * 100}))

# Comprobando si asignación no cambia al repetirse la entrada

print('Visualzando una entrada:')
display(df[df['rm_ID'] == 'rm_0000000306'].sort_values('TIME'))

In [None]:
df[df['rm_ID'] == 'rm_0000000306'].groupby(by=["IMEI",'IMSI']).count()

In [None]:
usage_per_user = df[["IMSI", "rm_ID"]].groupby("rm_ID")["IMSI"].count()
usage_per_user.plot(kind="hist")

In [None]:
time_profile = (
    df[["IMEI", "TIME"]]
    .groupby(by=df.TIME.dt.hour)["IMEI"]
    .count()
)
time_profile.plot()