In [1]:
import pandas as pd
from pandas.io.json import json_normalize


In [42]:
raw_df = pd.read_json("../../data/raw_data/base_18032019.json")

In [3]:
raw_df.columns

Index(['fields', 'model', 'pk'], dtype='object')

In [4]:
def extract_flatten_dataframe(df, column, meta_list):
    df_fields = json_normalize(data=df[column], meta=meta_list)
    df_fields.index = df.index
    return df_fields.join(df, how="outer")

# Clean data

## drop models that won't be useful for the analysis

In [105]:
exclude_models = ["oauth2_provider.application", "oauth2_provider.refreshtoken", "oauth2_provider.accesstoken",
                 "themes.themes", "security.security", "mailsender.mailsender"]
cleaned_raw_df = raw_df[~raw_df["model"].isin(exclude_models)]

In [106]:
df_extracted = extract_flatten_dataframe(cleaned_raw_df, "fields", None)

In [107]:
df_extracted.columns

Index(['_my_subclass', 'action', 'active', 'all_students', 'alt_img', 'answer',
       'begin_date', 'brief_description', 'categories', 'category',
       ...
       'user_quest', 'user_two', 'username', 'value', 'viewed', 'visible',
       'xls_data', 'fields', 'model', 'pk'],
      dtype='object', length=178)

In [148]:
drop_columns = ["username", "fields", "alt_img", "context.user_name", "social_name", 
                "user_email", "last_name", "question_img", "password", "csv_data", "xls_data", "image",
                "session_data", "user", "context.new_creator", "content", "context.new_creator_user",
               "ran_at_time", "email", "user_email", "context.student", "context.user_email", "device_id"]

In [149]:
df_cleaned = df_extracted.drop(columns=drop_columns)

In [150]:
for column in df_cleaned.columns:
    print(column)

_my_subclass
action
active
all_students
answer
begin_date
brief_description
categories
category
code
comment
component
context
context.bulletin_id
context.bulletin_name
context.bulletin_slug
context.category_id
context.category_name
context.category_slug
context.comment_id
context.condition
context.dependencies
context.goals_id
context.goals_name
context.goals_slug
context.group_id
context.group_name
context.group_slug
context.history_page
context.is_correct
context.new_slug
context.new_title
context.notification
context.notification_id
context.pdffile_id
context.pdffile_name
context.pdffile_slug
context.post_id
context.post_space_id
context.post_space_name
context.post_space_slug
context.question_content
context.question_id
context.questionary_id
context.questionary_name
context.questionary_slug
context.resource_id
context.resource_name
context.resource_slug
context.search_by
context.search_for
context.searched
context.subject_id
context.subject_name
context.subject_slug
context.talk_

In [174]:
df_cleaned["context.timestamp_end"].value_counts()

1549897206    301
1542315016    270
1542155773    248
1541802642    240
1541802643    194
1542315017    193
1542026526    155
1542304122    155
1542027382    154
1541803863    150
1542154953    146
1542166959    143
1541813396    135
1549887746    128
1541787923    112
1542304121    100
1549887128     98
1544058652     93
1541802641     89
1544464922     86
1549807071     83
1542301583     82
1541639233     80
1544291973     78
1542250988     78
1542026739     76
1542250987     76
1544277469     76
1544055012     70
1544449373     65
             ... 
1544144333      1
1543931924      1
1543714863      1
1542323853      1
1544367797      1
1548079157      1
1544909447      1
1548092412      1
1543349432      1
1547468278      1
1549899938      1
1541471395      1
1552597037      1
1548444319      1
1541606949      1
1550427857      1
1552161509      1
1542199432      1
1547553626      1
1549810848      1
1540234782      1
1544734408      1
1549803910      1
1541779800      1
1549898972

In [146]:
df_cleaned[df_cleaned["context.condition"] == "session_expire"]

Unnamed: 0,_my_subclass,action,active,all_students,answer,begin_date,brief_description,categories,category,code,...,user_id,user_one,user_permissions,user_quest,user_two,value,viewed,visible,model,pk
130677,,logout,True,True,,NaT,,,,,...,1.0,,,,,,,,log.log,2
130974,,logout,True,True,,NaT,,,,,...,1.0,,,,,,,,log.log,299
131007,,logout,True,True,,NaT,,,,,...,2.0,,,,,,,,log.log,332
131022,,logout,True,True,,NaT,,,,,...,1.0,,,,,,,,log.log,347
131210,,logout,True,True,,NaT,,,,,...,1.0,,,,,,,,log.log,535
131626,,logout,True,True,,NaT,,,,,...,4.0,,,,,,,,log.log,951
131762,,logout,True,True,,NaT,,,,,...,2.0,,,,,,,,log.log,1087
131937,,logout,True,True,,NaT,,,,,...,4.0,,,,,,,,log.log,1262
132273,,logout,True,True,,NaT,,,,,...,4.0,,,,,,,,log.log,1598
132880,,logout,True,True,,NaT,,,,,...,3.0,,,,,,,,log.log,2205


# Send cleaned data without privacy concern variables

In [136]:
df_cleaned.dtypes

_my_subclass              object
action                    object
active                    object
all_students              object
answer                   float64
begin_date                object
brief_description         object
categories                object
category                 float64
code                      object
comment                   object
component                 object
context                   object
context.bulletin_id      float64
context.bulletin_name     object
context.bulletin_slug     object
context.category_id      float64
context.category_name     object
context.category_slug     object
context.comment_id        object
context.condition         object
context.dependencies      object
context.goals_id         float64
context.goals_name        object
context.goals_slug        object
context.group_id         float64
context.group_name        object
context.group_slug        object
context.history_page      object
context.is_correct        object
          

In [168]:
dtypes_mapping = {
    "_my_subclass": "category",
    "action": "category",
    "active": "bool",
    "all_students": "bool",
    "begin_date": "datetime64",
    "context.is_correct": "bool",
    "visible": "bool",
    "show_email": "category",
    "show_window": "bool",
    "start_time": "datetime64",
    "update_date": "datetime64"
}

In [169]:
for column in dtypes_mapping.keys():
    df_cleaned[column] = df_cleaned[column].astype(dtypes_mapping[column])

In [172]:
df_cleaned.dtypes

_my_subclass                   category
action                         category
active                             bool
all_students                       bool
answer                          float64
begin_date               datetime64[ns]
brief_description                object
categories                       object
category                        float64
code                             object
comment                          object
component                        object
context                          object
context.bulletin_id             float64
context.bulletin_name            object
context.bulletin_slug            object
context.category_id             float64
context.category_name            object
context.category_slug            object
context.comment_id               object
context.condition                object
context.dependencies             object
context.goals_id                float64
context.goals_name               object
context.goals_slug               object


In [175]:
df_cleaned.to_csv("../../data/privacy_protected_data/cleaned_data.csv")