Here I'm going to explore the notification moduel and how they work. First, I need to understand their fields

In [1]:
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
df = pd.read_json("../data/base_22012019.json")

In [3]:
notifications_df = df[df["model"] == "notifications.notification"]

In [4]:
notifications_df.shape

(13219, 3)

In [5]:
notifications_df["fields"].head(10)[34636]

{'meta': None,
 'task': 1,
 'user': 8,
 'level': 3,
 'viewed': False,
 'creation_date': '2018-10-28'}

Uma notificação possui os seguintes campos no log:
1. Meta 
2. User que coleta o id do user para qual a notificação foi enviada
3. Level 
4. Viewed que possui valor de verdadeiro ou falso indicando se a notifcação foi vista ou não pelo usuário
5. A data de criação daquela notificação

# Limpeza dos dados

é preciso extrair as colunas existentes dentro da coluna "fields", para isso, eu vou utilizar a função json_normalize, nativa do pandas.

In [6]:
def extract_flatten_dataframe(df, column, meta_list):
    df_fields = json_normalize(data=df[column], meta=meta_list)
    df_fields.index = df.index
    return df_fields.join(df, how="outer")

In [7]:
notification_fields = json_normalize(data=notifications_df["fields"], meta=["meta", "task", "user", "level", "viewed", "creation_date"] )
notification_fields.index = notifications_df.index
notification_df_flatted = notification_fields.join(notifications_df, how="outer")

In [8]:
notification_df_flatted.sample(10)

Unnamed: 0,creation_date,level,meta,task,user,viewed,fields,model,pk
42232,2018-12-08,3,,14,40,False,"{'meta': None, 'task': 14, 'user': 40, 'level'...",notifications.notification,7809
35919,2018-11-12,1,,12,71,False,"{'meta': None, 'task': 12, 'user': 71, 'level'...",notifications.notification,1286
39197,2018-11-25,3,,16,41,False,"{'meta': None, 'task': 16, 'user': 41, 'level'...",notifications.notification,4644
44305,2019-01-14,1,,18,33,True,"{'meta': None, 'task': 18, 'user': 33, 'level'...",notifications.notification,9952
44500,2019-01-15,3,,42,19,False,"{'meta': None, 'task': 42, 'user': 19, 'level'...",notifications.notification,10147
45752,2019-01-18,3,,13,34,False,"{'meta': None, 'task': 13, 'user': 34, 'level'...",notifications.notification,11399
45788,2019-01-18,3,,14,68,False,"{'meta': None, 'task': 14, 'user': 68, 'level'...",notifications.notification,11435
41137,2018-12-02,4,,8,59,False,"{'meta': None, 'task': 8, 'user': 59, 'level':...",notifications.notification,6654
43035,2018-12-14,1,,42,18,False,"{'meta': None, 'task': 42, 'user': 18, 'level'...",notifications.notification,8652
36391,2018-11-14,3,,11,78,True,"{'meta': None, 'task': 11, 'user': 78, 'level'...",notifications.notification,1758


In [9]:
notification_fields["meta"].value_counts()

2018-11-11T01:44:00Z    45
2018-11-14T12:33:00Z     6
2018-12-01T00:00:00Z     4
2018-11-12T13:33:00Z     3
2018-11-27T22:34:00Z     3
2018-11-14T23:00:00Z     3
2018-11-11T23:22:00Z     2
2018-11-19T02:30:00Z     1
2018-11-07T02:46:00Z     1
2018-12-02T02:26:00Z     1
2018-11-13T15:19:00Z     1
2018-11-23T14:05:00Z     1
2018-11-07T03:40:00Z     1
Name: meta, dtype: int64

In [10]:
notification_fields.dtypes

creation_date    object
level             int64
meta             object
task              int64
user              int64
viewed             bool
dtype: object

Como dá pra ver o pandas não consegue inferir muito bem os data types das colunas, vou utilizar meu conhecimento sobre elas para colocar valores mais baratos e que correspondem melhor aos seus valores.m

In [11]:
notification_fields = notification_fields.astype({"level": pd.CategoricalDtype()})

In [12]:
notification_fields["level"].describe()

count     13219
unique        4
top           3
freq       6554
Name: level, dtype: int64

In [13]:
notification_fields.dtypes

creation_date      object
level            category
meta               object
task                int64
user                int64
viewed               bool
dtype: object

In [14]:
notification_fields.shape

(13219, 6)

# Users Data
Como os usuários são o principal objeto de pesquisa deste relatório, eu preciso coletar os IDs deles para juntar as notificações e "dar à um responsável".

In [15]:
users_df = df[df["model"] == "users.user"]
users_df

Unnamed: 0,fields,model,pk
34528,{'password': 'pbkdf2_sha256$30000$5DROf4Pf3BPR...,users.user,1
34529,{'password': 'pbkdf2_sha256$30000$u3Lg5la328P5...,users.user,2
34530,{'password': 'pbkdf2_sha256$30000$Tg55KDswGl6t...,users.user,3
34531,{'password': 'pbkdf2_sha256$30000$sN16VkfYDIiS...,users.user,4
34532,{'password': 'pbkdf2_sha256$30000$x85db28ZviCz...,users.user,6
34533,{'password': 'pbkdf2_sha256$30000$YuhP7bGSlXwn...,users.user,7
34534,{'password': 'pbkdf2_sha256$30000$l7WAB2nBwfTg...,users.user,8
34535,{'password': 'pbkdf2_sha256$30000$DOj6RbuvrpDK...,users.user,9
34536,{'password': 'pbkdf2_sha256$30000$wbwdFEajjHed...,users.user,10
34537,{'password': 'pbkdf2_sha256$30000$DOPLvJQYYOz4...,users.user,11


In [16]:
users_df["fields"][34528]

{'password': 'pbkdf2_sha256$30000$5DROf4Pf3BPR$tPd1JbDlLqrUzoY/JcUSfparBrp7n0ERZKWgpfnbG/Y=',
 'last_login': '2019-01-22T12:36:36.447Z',
 'is_superuser': True,
 'email': 'admin@amadeus.br',
 'username': 'Administrador',
 'last_name': 'Geral',
 'social_name': None,
 'description': '',
 'image': '',
 'date_created': '2018-10-19T16:55:27.084Z',
 'last_update': '2018-11-04T03:31:13.036Z',
 'show_email': 1,
 'is_staff': True,
 'is_active': True,
 'groups': [],
 'user_permissions': []}

# Dicionário dos dados
1. Password: Senha criptografada
2. Last_Login: a última vez que o usuário logou no sistema
3. is_superuser: Significa que o usuário é um super usuário, tem privilégios ou admin.
4. email: e-mail pertecente ao usuário 
5. username: nome que o usuário quer que outros usuários o vejam
6. last_name: Sobrenome do usuário
7. social_name: Nome social que o usuário quer escolher para ser demonstrado 
8. description: ?
9. image: Caminho para a imagem do usuário
10. date_created: Data em que o usuário foi criado
11. last_update: a última vez que os dados do usuário foram modificados
12. show_email: Uma booleana que informa se o e-mail é visiível para outros usuários
13. is_staff: se ele é do tipo admin
14. is_active: Se o usuário está ativo, caso sim, ele pode entrar no sistema, senão, é impossível
15. groups: Grupos de permissão ao qual esse usuário pertence ("professor", "estudante"...)
16. user_permissions = permissões individuais que ele possui ("criar tópico" , "deletar tópico", "editar tópico")

In [17]:
# fields:  
user_fields = ["password", "last_login", "is_superuser", "email", "username", "last_name", "social_name", "description", 
               "imagem", "date_created", "last_update", "show_email", "is_staff", "is_active", "groups", "user_permissions"]

In [30]:
user_df_flatten = extract_flatten_dataframe(users_df, column="fields",meta_list=user_fields).astype({"pk": "int64"})

In [31]:
user_df_flatten.dtypes

date_created        object
description         object
email               object
groups              object
image               object
is_active             bool
is_staff              bool
is_superuser          bool
last_login          object
last_name           object
last_update         object
password            object
show_email           int64
social_name         object
user_permissions    object
username            object
fields              object
model               object
pk                   int64
dtype: object

# Vou remover as seguintes colunas:
1. Groups, pois não possui nenhum valor diferente de vazio (ou nulo).

In [19]:
user_df_flatten["groups"].value_counts()

[]    106
Name: groups, dtype: int64

In [60]:
user_df_flatten_clean = user_df_flatten.drop(["groups", "fields", "password"], axis=1)

In [61]:
user_df_flatten_clean.sample(10)

Unnamed: 0,date_created,description,email,image,is_active,is_staff,is_superuser,last_login,last_name,last_update,show_email,social_name,user_permissions,username,model,pk
34570,2018-11-03T13:56:08.739Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,11070929492@r1.br,,True,False,False,2019-01-21T13:33:35.363Z,.,2018-11-03T13:56:08.776Z,1,STEFANY OLIVEIRA JORGE,[],STEFANY OLIVEIRA JORGE,users.user,44
34592,2018-11-03T14:07:26.283Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,10029996422@r1.br,users/Amadeus_.jpg,True,False,False,2019-01-21T17:40:33.319Z,.,2018-11-28T18:31:24.918Z,1,JOAO CAETANO RODRIGUES NASCIMENTO,[],JOAO CAETANO RODRIGUES NASCIMENTO,users.user,66
34562,2018-11-03T13:51:53.092Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,06649884588@r1.br,,True,False,False,2019-01-21T17:21:56.167Z,.,2018-11-03T13:51:53.126Z,1,MATHEUS MORAIS BELEM,[],MATHEUS MORAIS BELEM,users.user,36
34559,2018-11-03T13:50:05.686Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,04443072578@r1.br,,True,False,False,2019-01-21T13:22:15.455Z,.,2018-11-03T13:50:05.720Z,1,LARISSA VIEIRA DA SILVA,[],LARISSA VIEIRA DA SILVA,users.user,33
34541,2018-11-03T13:40:04.839Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,06352736599@r1.br,,True,False,False,2018-12-11T16:11:43.967Z,.,2018-11-03T13:40:04.873Z,1,BRENDO SANTOS SILVA,[],BRENDO SANTOS SILVA,users.user,15
34530,2018-10-22T18:21:31.704Z,<p>Docente vinculado ao Colegiado Acadêmico de...,joao.sedraz@univasf.edu.br,users/sedraz.jpg,True,False,False,2019-01-22T13:45:10.245Z,Silva,2018-12-06T12:03:28.281Z,1,João Sedraz,[],João Carlos Sedraz,users.user,3
34552,2018-11-03T13:46:23.653Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,05228647597@r1.br,users/Iphone_05042016_047.JPG,True,False,False,2019-01-22T14:12:03.326Z,.,2019-01-19T02:13:23.766Z,1,HIAGO TEIXEIRA BAVOSA,[],HIAGO TEIXEIRA BAVOSA,users.user,26
34608,2018-11-03T14:16:02.137Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,08703735443@r1.br,,True,False,False,2019-01-22T03:02:02.440Z,.,2018-11-03T14:16:02.183Z,1,RICARDO ANDRADE MACEDO,[],RICARDO ANDRADE MACEDO,users.user,82
34589,2018-11-03T14:05:46.801Z,<p>\r\n\r\n\r\n\r\n\r\n<style>\r\n<!--table\r\...,06503100558@r1.br,,True,False,False,2018-12-05T02:22:17.107Z,.,2018-11-03T14:05:46.839Z,1,IGOR FHILIPE DE CASTRO ALMEIDA NEVES,[],IGOR FHILIPE DE CASTRO ALMEIDA NEVES,users.user,63
34529,2018-10-21T05:32:02.734Z,,egz@cin.ufpe.br,users/kaneki.jpg,True,True,False,2019-01-19T00:28:17.924Z,Zambom,2018-10-22T14:00:03.256Z,1,,[],Erik,users.user,2


# Junção entre usuários e notificações
Para otimizar, irei coletar somente o username dos usuários durante o join com as notificações, depois eu irei buscar mais informações sobre eles.m

In [57]:
notification_df_flatted.dtypes

creation_date    object
level             int64
meta             object
task              int64
user              int64
viewed             bool
fields           object
model            object
pk               object
dtype: object

In [62]:
user_df_flatten_clean.dtypes

date_created        object
description         object
email               object
image               object
is_active             bool
is_staff              bool
is_superuser          bool
last_login          object
last_name           object
last_update         object
show_email           int64
social_name         object
user_permissions    object
username            object
model               object
pk                   int64
dtype: object

In [64]:
notification_df_flatted = notification_df_flatted.reset_index(drop=True)
user_df_flatten_clean = user_df_flatten_clean.reset_index(drop=True)

In [66]:
join_selection = (notification_df_flatted["user"] == user_df_flatten_clean["pk"])

ValueError: Can only compare identically-labeled Series objects

In [51]:
optimal_df = notification_df_flatted.join(user_df_flatten_clean, notification_df_flatted["user"] == user_df_flatten_clean["pk"])

ValueError: Can only compare identically-labeled Series objects