In [1]:
import pandas as pd
import numpy as np
import json 
from pandas import json_normalize 
import random 
import matplotlib.pyplot as plt
from datetime import datetime

import findspark
findspark.init()

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [2]:
# Cargamos el dataset con formato .json y eliminamos columnas innecesarias

df = pd.read_json('datasets/dataset_conjunto_rm.json')
df.drop(["event","id","type","version","time"], axis=1, inplace=True)

In [3]:
# Expandimos columna data

df = pd.DataFrame(json_normalize(df['data'].explode()))

In [4]:
df

Unnamed: 0,time,imei,imsi,rat
0,1596800108,35937204822971,901700000015702,2G
1,1596796932,35937204822971,901700000015700,2G
2,1596294959,35937204822971,901700000015700,2G
3,1596296538,35283606665993,901700000015702,2G
4,1596297428,35818705617048,901700000015704,2G
5,1596297758,35941008080436,901700000013634,2G
6,1596297972,35655408691613,901700000015777,2G
7,1596039825,35583902478151,901700000015766,2G
8,1596039963,35583902478151,901700000015766,2G
9,1596040938,35583902478151,901700000015766,2G


In [5]:
# Cambio de formato de time

df['time'] = pd.to_datetime(df['time'], unit='s')

In [6]:
# Separamos año, mes, dia, hora, minuto y segundo de time_data

df['year']   = df['time'].dt.year
df['month']  = df['time'].dt.month
df['day']    = df['time'].dt.day
df['hour']   = df['time'].dt.hour
df['minute'] = df['time'].dt.minute
df['second'] = df['time'].dt.second

#df.drop('time', axis = 1 , inplace = True)

In [7]:
# Separamos mcc, mnc y msin del IMSI

df['imsi_mcc']   = df['imsi'].str.slice(0,3)
df['imsi_mnc']  = df['imsi'].str.slice(3,5)
df['imsi_msin']    = df['imsi'].str.slice(5)

#df.drop('imsi' ,axis = 1 , inplace = True)

In [8]:
# Separamos tac, fac y snr del IMEI/IMEISV

df['imei_tac+fac']   = df['imei'].str.slice(0,8)
df['imei_snr']  = df['imei'].str.slice(8)

#df.drop('imei' ,axis = 1 , inplace = True)

In [11]:
# Agregamos manualmente la clasificación de cada una de los datos (0 = Normal , 1 = Anomalía)

clasf = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,1]
df['clasification']=clasf

In [12]:
df = df.sort_values(by=['time'])
df

Unnamed: 0,time,imei,imsi,rat,year,month,day,hour,minute,second,imsi_mcc,imsi_mnc,imsi_msin,imei_tac+fac,imei_snr,clasification
7,2020-07-29 16:23:45,35583902478151,901700000015766,2G,2020,7,29,16,23,45,901,70,15766,35583902,478151,0
8,2020-07-29 16:26:03,35583902478151,901700000015766,2G,2020,7,29,16,26,3,901,70,15766,35583902,478151,0
9,2020-07-29 16:42:18,35583902478151,901700000015766,2G,2020,7,29,16,42,18,901,70,15766,35583902,478151,0
2,2020-08-01 15:15:59,35937204822971,901700000015700,2G,2020,8,1,15,15,59,901,70,15700,35937204,822971,0
10,2020-08-01 15:16:35,35362707574785,901700000013633,2G,2020,8,1,15,16,35,901,70,13633,35362707,574785,0
11,2020-08-01 15:18:17,35362707574785,901700000013633,2G,2020,8,1,15,18,17,901,70,13633,35362707,574785,0
12,2020-08-01 15:26:35,35362707574785,262421122334455,2G,2020,8,1,15,26,35,262,42,1122334455,35362707,574785,0
13,2020-08-01 15:41:48,1366500442359,901700000015708,2G,2020,8,1,15,41,48,901,70,15708,1366500,442359,0
14,2020-08-01 15:41:48,35922407923033,262421122334455,2G,2020,8,1,15,41,48,262,42,1122334455,35922407,923033,0
3,2020-08-01 15:42:18,35283606665993,901700000015702,2G,2020,8,1,15,42,18,901,70,15702,35283606,665993,0


In [16]:
clasf = df[df['clasification'] == 1]
plt.figure(figsize=(18,3))
plt.plot(clasf['imei'], linestyle='none', marker='X', color='red', markersize=12)
plt.plot(df['time'], color='blue')
plt.show()

TypeError: tzinfo argument must be None or of a tzinfo subclass, not type 'UnitData'

In [None]:
df['time'].values

In [None]:
# StringIndexer para rat, imsi e imei

df['rat'] = pd.factorize(df['rat'].values)[0]
df['imsi_mcc'] = pd.factorize(df['imsi_mcc'].values)[0]
df['imsi_mnc'] = pd.factorize(df['imsi_mnc'].values)[0]
df['imsi_msin'] = pd.factorize(df['imsi_msin'].values)[0]
df['imei_tac+fac'] = pd.factorize(df['imei_tac+fac'].values)[0]
df['imei_snr'] = pd.factorize(df['imei_snr'].values)[0]

In [None]:
# MinMaxScaler para el tiempo
from sklearn.preprocessing import minmax_scale

year_scaled = minmax_scale(df['year'])
df['year'] = pd.DataFrame(year_scaled)

df['month'] = df['month'].apply(lambda x: (x - 1) / (12 - 1))
df['day'] = df['day'].apply(lambda x: (x - 1) / (31 - 1))
df['hour'] = df['hour'].apply(lambda x: (x - 0) / (23 - 0))
df['minute'] = df['minute'].apply(lambda x: (x - 0) / (59 - 0))
df['second'] = df['second'].apply(lambda x: (x - 0) / (59 - 0))


In [None]:
df

In [None]:
df

In [None]:
# Guardamos clasificación en y
y = df['clasification'].values
df.drop('clasification', axis = 1, inplace = True)

In [None]:
df

In [None]:
# Visualizacion 2D y 3D mediante PCA de los resultados arrojados por KMeans

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
df_pca = pca.fit_transform(df)

fig, ax = plt.subplots(figsize=(10, 6))

for index, n_color in enumerate(y):
    ax.scatter(df_pca[index,0],df_pca[index,1], s=100, c="blue" if n_color == 0 else "red")
    
ax.set_title('Dataset original clasificado')
ax.set_xlabel('PCA_1')
ax.set_ylabel('PCA_2')

plt.style.use('seaborn')
plt.show()