O objetivo deste notebook é criar o dataset final para teste e treinamento da classificação entre sucesso ou insucesso do aluno (sucesso = passar por média, acima ou igual à 7,0).

As features deste modelo são baseadas em notificações e a quantidade pendências apenas

In [1]:
import pandas as pd
from pandas.io.json import json_normalize
%matplotlib inline
from datetime import datetime, timedelta

# Load de dados

In [2]:
df = pd.read_json("../../../data/raw_data/base_22012019.json")
grades_df = pd.read_excel("../../../data/raw_data/nota_provas.xls")

In [13]:
date_v1 = datetime(day=10, year=2018, month=12)
date_v2 = datetime(day=11, month=2, year=2019)
date_v3 = datetime(day=18, month=3, year=2019)

# Definição de datasets principais

In [3]:
notifications_df = df[df["model"] == "notifications.notification"]
resources_df = df[df["model"] == "topics.resource"]
pendencies_df = df[df["model"] == "pendencies.pendencies"]

In [4]:
def extract_flatten_dataframe(df, column, meta_list):
    df_fields = json_normalize(data=df[column], meta=meta_list)
    df_fields.index = df.index
    return df_fields.join(df, how="outer")

# Limpeza dos dados

In [5]:
def clean_notifications_df(dirty_notifications_df):
    notification_fields = json_normalize(data=dirty_notifications_df["fields"], meta=["meta", "task", "user", "level", "viewed", "creation_date"] )
    notification_fields.index = dirty_notifications_df.index
    notification_fields = notification_fields.astype({"level": pd.CategoricalDtype()})
    notification_fields["creation_date"] = pd.to_datetime(notification_fields["creation_date"])
    notification_df_flatted = notification_fields.join(dirty_notifications_df, how="outer")
    return notification_df_flatted

In [6]:
def clean_resources_df(dirty_resources_df):
    resource_meta_list = ["_my_subclass", "visible", "students", "groups", "tags", "all_students", "order", "topic", 
             "show_window", "brief_description", "slug", "name"]
    resources_flatten_df = extract_flatten_dataframe(dirty_resources_df, "fields", resource_meta_list).drop("fields", axis=1)
    return resources_flatten_df

In [7]:
def clean_pendencies_df(dirty_pendencies_df):
    pendencies_meta_list = ["action", "begin_date", "end_date", "limit_date", "resource"]
    flattned_dirty_pendencies_df = extract_flatten_dataframe(dirty_pendencies_df, "fields", pendencies_meta_list)
    date_columns = ["begin_date", "end_date", "limit_date"]
    for column in date_columns:
        flattned_dirty_pendencies_df[column] = pd.to_datetime(flattned_dirty_pendencies_df[column])
        flattned_dirty_pendencies_df[column] = flattned_dirty_pendencies_df[column].apply(lambda timestamp: timestamp.replace(tzinfo=None))
    flattned_dirty_pendencies_df["action"] = flattned_dirty_pendencies_df["action"].astype("category")
    return flattned_dirty_pendencies_df

In [9]:
cleaned_notification_df = clean_notifications_df(notifications_df)
cleaned_pendencies_df = clean_pendencies_df(pendencies_df)
cleaned_resources_df = clean_resources_df(resources_df)

# Junções para os datasets

In [18]:
notifications_with_user_grades_df = pd.merge(cleaned_notification_df, grades_df, left_on=["user"], right_on=["id_estudante"])
pendencies_with_resources = pd.merge(cleaned_pendencies_df, cleaned_resources_df, left_on=["resource"], right_on=["pk"])
notications_with_resources_pendencies = pd.merge(notifications_with_user_grades_df, pendencies_with_resources, left_on=["task"], right_on=["pk_x"])
v1_notifications = notications_with_resources_pendencies[notications_with_resources_pendencies["end_date"] <= date_v1]\
.drop(columns=["v2"])
v1_notifications.index = v1_notifications["creation_date"]

In [26]:
train_columns = ["user", "v1", "creation_date", "viewed"]
train_dataset = v1_notifications[train_columns]

# Adicionar Features no dataset

In [34]:
def set_weeks_apart(df, date, date_column, week_diff_column):
    """
        Create a new column week_diff_column that contains the amount of weeks between the dates on the date_column
        of a given df to a related date
    """
    return df[date_column].apply(lambda dt: (date - dt).days//7)

def create_user_notification_count_features(dirty_v1_notifications_df, v1_date):
    v1_notifications_viewed_date = v1_notifications.groupby(["user", pd.Grouper(freq="W")])\
    .agg({"viewed": ["mean", "sum"]}).reset_index()
    viewed_df = v1_notifications_viewed_date[["user", "creation_date"]]
    viewed_df["mean"] = v1_notifications_viewed_date["viewed"]["mean"]
    viewed_df["count"] = v1_notifications_viewed_date["viewed"]["sum"]
    viewed_df["amount_of_weeks_prior_v1"] = set_weeks_apart(viewed_df, v1_date, "creation_date", "test")
    viewed_df = viewed_df.rename(columns={"count": "notifications_viewed", "mean": "average_amount_viewed"})
    return viewed_df

In [32]:
temp_features_df = create_user_notification_count_features(train_dataset, date_v1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [33]:
temp_features_df.head(10)

Unnamed: 0,user,creation_date,average_amount_viewed,notifications_viewed,week_count_v1
,,,,,
0.0,11.0,2018-11-11,0.0,0.0,-5.0
1.0,11.0,2018-11-18,0.0,0.0,-4.0
2.0,11.0,2018-11-25,0.147541,9.0,-3.0
3.0,11.0,2018-12-02,0.0,0.0,-2.0
4.0,11.0,2018-12-09,0.157895,9.0,-1.0
5.0,11.0,2018-12-16,0.0,0.0,0.0
6.0,11.0,2018-12-23,0.0,0.0,1.0
7.0,11.0,2019-01-06,0.0,0.0,3.0
8.0,11.0,2019-01-13,0.0,0.0,4.0
