O objetivo deste notebook é criar o dataset final para teste e treinamento da classificação entre sucesso ou insucesso do aluno (sucesso = passar por média, isto é, nota maior ou igual a 7,0).

As features deste modelo são baseadas apenas nas notificações e na quantidade de pendências.

In [1]:
import pandas as pd
from pandas.io.json import json_normalize
%matplotlib inline
from datetime import datetime, timedelta

# Load de dados

In [2]:
# Raw JSON export; its rows are split by their "model" value a few cells below.
df = pd.read_json("../../../data/raw_data/base_22012019.json")
# Grades spreadsheet; merged later on "id_estudante" and the source of the "v1"/"v2" columns.
grades_df = pd.read_excel("../../../data/raw_data/nota_provas.xls")

In [3]:
# Reference dates (year, month, day). date_v1 is the cut-off applied to pendency
# end dates when the v1 dataset is built further down.
# NOTE(review): presumably the dates of the v1, v2 and v3 exams -- confirm.
date_v1 = datetime(2018, 12, 10)
date_v2 = datetime(2019, 2, 11)
date_v3 = datetime(2019, 3, 18)

# Definição de datasets principais

In [4]:
# One frame per record type, selected by the value stored in the "model" column.
notifications_df = df.loc[df["model"].eq("notifications.notification")]
resources_df = df.loc[df["model"].eq("topics.resource")]
pendencies_df = df.loc[df["model"].eq("pendencies.pendencies")]

In [5]:
def extract_flatten_dataframe(df, column, meta_list):
    """Expand a column of dict records into columns and attach them to the frame.

    Args:
        df: Source DataFrame.
        column: Name of the column in ``df`` that holds the records to expand.
        meta_list: Forwarded to ``json_normalize`` as its ``meta`` argument.

    Returns:
        A DataFrame holding the expanded fields followed by every original
        column of ``df`` (``column`` included), outer-joined on ``df``'s index.
    """
    expanded = json_normalize(data=df[column], meta=meta_list)
    # Re-use the caller's index so the join pairs each expanded row with the
    # row it came from.
    expanded.index = df.index
    combined = expanded.join(df, how="outer")
    return combined

# Limpeza dos dados

In [6]:
def clean_notifications_df(dirty_notifications_df):
    """Flatten the notification records and give their columns usable dtypes.

    The dicts stored in the ``fields`` column are expanded into columns,
    ``level`` is converted to a categorical and ``creation_date`` is parsed
    with ``pd.to_datetime``.

    Args:
        dirty_notifications_df: Notification rows with a ``fields`` column of
            dict records.

    Returns:
        A DataFrame with the expanded fields followed by every original column
        (``fields`` included), sharing the input's index.
    """
    meta_columns = ["meta", "task", "user", "level", "viewed", "creation_date"]
    flat = json_normalize(data=dirty_notifications_df["fields"], meta=meta_columns)
    flat.index = dirty_notifications_df.index
    # assign() on existing column names replaces them in place, so the column
    # order is unchanged.
    typed = flat.assign(
        level=flat["level"].astype(pd.CategoricalDtype()),
        creation_date=pd.to_datetime(flat["creation_date"]),
    )
    return typed.join(dirty_notifications_df, how="outer")

In [7]:
def clean_resources_df(dirty_resources_df):
    """Flatten the resource records and discard the raw ``fields`` column.

    Args:
        dirty_resources_df: Resource rows with a ``fields`` column of dict
            records.

    Returns:
        A DataFrame with the expanded fields plus the remaining original
        columns, without ``fields``.
    """
    resource_meta_list = [
        "_my_subclass", "visible", "students", "groups", "tags", "all_students",
        "order", "topic", "show_window", "brief_description", "slug", "name",
    ]
    flattened = extract_flatten_dataframe(dirty_resources_df, "fields", resource_meta_list)
    return flattened.drop(columns="fields")

In [8]:
def clean_pendencies_df(dirty_pendencies_df):
    """Flatten the pendency records and normalise their date and action columns.

    ``begin_date``, ``end_date`` and ``limit_date`` are parsed with
    ``pd.to_datetime`` and stripped of timezone information (wall-clock time
    is kept); ``action`` becomes a categorical.

    Args:
        dirty_pendencies_df: Pendency rows with a ``fields`` column of dict
            records.

    Returns:
        A DataFrame with the expanded fields followed by every original column
        (``fields`` included).
    """
    pendencies_meta_list = ["action", "begin_date", "end_date", "limit_date", "resource"]
    pendencies = extract_flatten_dataframe(dirty_pendencies_df, "fields", pendencies_meta_list)

    def _drop_timezone(timestamp):
        # Element-wise on purpose: works one value at a time, so it does not
        # depend on the column having a datetime dtype.
        return timestamp.replace(tzinfo=None)

    for date_column in ("begin_date", "end_date", "limit_date"):
        parsed = pd.to_datetime(pendencies[date_column])
        pendencies[date_column] = parsed.apply(_drop_timezone)

    pendencies["action"] = pendencies["action"].astype("category")
    return pendencies

In [9]:
# Run each cleaner on its own slice of the raw export. The three calls do not
# depend on one another.
cleaned_notification_df = clean_notifications_df(notifications_df)
cleaned_pendencies_df = clean_pendencies_df(pendencies_df)
cleaned_resources_df = clean_resources_df(resources_df)

# Junções para os datasets

In [21]:
# Attach grades to notifications: "user" on the notification side is matched
# with "id_estudante" on the grades side (inner join, pd.merge default).
notifications_with_user_grades_df = pd.merge(cleaned_notification_df, grades_df, left_on=["user"], right_on=["id_estudante"])
# Pair each pendency with the resource it points to. Both inputs carry a "pk"
# column, so the result exposes them as "pk_x" (pendency) and "pk_y" (resource).
pendencies_with_resources = pd.merge(cleaned_pendencies_df, cleaned_resources_df, left_on=["resource"], right_on=["pk"])
# A notification's "task" is matched against the pendency primary key ("pk_x").
notications_with_resources_pendencies = pd.merge(notifications_with_user_grades_df, pendencies_with_resources, left_on=["task"], right_on=["pk_x"])
# Keep only rows whose pendency "end_date" is on or before date_v1, then drop
# the "v2" column.
v1_notifications = notications_with_resources_pendencies[notications_with_resources_pendencies["end_date"] <= date_v1]\
.drop(columns=["v2"])
# Index by creation_date (the column itself is kept); the weekly pd.Grouper
# used further down groups on this index.
v1_notifications.index = v1_notifications["creation_date"]
# (user, v1) pairs; still one row per notification at this point.
user_df = pd.DataFrame(v1_notifications[["user", "v1"]])
# NOTE(review): user_df has no column labelled 0, so this rename looks like a
# no-op left over from an earlier version.
user_df = user_df.rename(columns={0: "user"})

In [25]:
# Collapse to one row per distinct (user, v1) pair.
user_df = user_df.drop_duplicates()

# Adicionar Features no dataset

In [11]:
# Per-notification boolean flags combining the resource name with the
# notification level. These columns are added to v1_notifications in place.
# NOTE(review): levels 3 and 4 are read as "delayed" and "incompleted" going by
# the column names only -- confirm against the notification model.
v1_notifications["simulado_delayed"] = v1_notifications["name"].str.contains("Simulado") & (v1_notifications["level"] == 3)
v1_notifications["simulado_incompleted"] = v1_notifications["name"].str.contains("Simulado") & (v1_notifications["level"] == 4)
v1_notifications["notes_delayed"] = v1_notifications["name"].str.contains("Aula") & (v1_notifications["level"] == 3)

In [12]:
def set_weeks_apart(df, date, date_column, week_diff_column):
    """Return the number of whole weeks between ``date`` and ``df[date_column]``.

    Nothing is written to ``df``; the caller decides where to store the result.

    Args:
        df: DataFrame containing ``date_column``.
        date: Reference date the column values are subtracted from.
        date_column: Name of the datetime column in ``df``.
        week_diff_column: Unused. Kept so existing calls keep working.

    Returns:
        A Series aligned with ``df``'s index holding
        ``(date - value).days // 7`` for every value (floor division, so dates
        after ``date`` give negative numbers).
    """
    # Vectorised timedelta arithmetic instead of a per-row apply.
    elapsed = date - pd.to_datetime(df[date_column])
    return elapsed.dt.days // 7


def create_user_notification_count_features(dirty_v1_notifications_df, v1_date):
    """Build weekly per-user statistics of the ``viewed`` flag.

    Args:
        dirty_v1_notifications_df: Notifications with ``user`` and ``viewed``
            columns, indexed by a DatetimeIndex named ``creation_date`` (the
            weekly ``pd.Grouper`` groups on the index).
        v1_date: Reference date used to compute ``amount_of_weeks_prior_v1``.

    Returns:
        A DataFrame with one row per (user, week) and the columns ``user``,
        ``creation_date`` (weekly bin label), ``average_amount_viewed``,
        ``notifications_viewed`` and ``amount_of_weeks_prior_v1``. As before,
        the column index is two-level with an empty second level, because the
        aggregation is requested as a dict of lists.
    """
    # Use the frame that was passed in; the previous version silently read the
    # notebook-level ``v1_notifications`` instead and ignored its argument.
    weekly_views = (
        dirty_v1_notifications_df
        .groupby(["user", pd.Grouper(freq="W")])
        .agg({"viewed": ["mean", "sum"]})
        .reset_index()
    )

    # Explicit copy: the columns added below must not be written to a slice of
    # ``weekly_views`` (this was the source of the SettingWithCopyWarning).
    viewed_df = weekly_views[["user", "creation_date"]].copy()
    viewed_df["mean"] = weekly_views["viewed"]["mean"]
    viewed_df["count"] = weekly_views["viewed"]["sum"]
    viewed_df["amount_of_weeks_prior_v1"] = set_weeks_apart(viewed_df, v1_date, "creation_date", "test")

    return viewed_df.rename(columns={"count": "notifications_viewed", "mean": "average_amount_viewed"})

In [13]:
# Weekly "viewed" statistics per user, measured relative to the v1 date.
# NOTE(review): in the cells shown here time_features_related_df is not merged
# into train_dataset -- confirm whether that is intentional.
time_related_columns = ["user", "v1", "creation_date", "viewed"]
time_features_related_df = create_user_notification_count_features(v1_notifications[time_related_columns], date_v1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [14]:
def notifications_level_features(notifications_dataset):
    """Count, per user, how many notifications carry each flag.

    Args:
        notifications_dataset: DataFrame with a ``user`` column and the boolean
            flag columns ``simulado_delayed``, ``simulado_incompleted`` and
            ``notes_delayed``. Extra columns (for example ``v1``) are ignored.

    Returns:
        A DataFrame with one row per user and the columns ``user``,
        ``simulado_delayed_amount``, ``simulado_incompleted_amount`` and
        ``notes_delayed_amount``.
    """
    renamed = {
        "simulado_delayed": "simulado_delayed_amount",
        "simulado_incompleted": "simulado_incompleted_amount",
        "notes_delayed": "notes_delayed_amount",
    }
    # Scalar function names keep the column index flat. The previous version
    # also aggregated "v1" without ever using the result, which made the
    # function fail on inputs lacking that column.
    totals = (
        notifications_dataset
        .groupby("user")
        .agg({flag: "sum" for flag in renamed})
        .reset_index()
        .rename(columns=renamed)
    )
    # Fix the column order explicitly rather than relying on dict ordering.
    return totals[["user"] + list(renamed.values())]
    

In [15]:
# Per-user totals of the three flag columns created above.
level_columns = ["user", "v1", "simulado_delayed", "simulado_incompleted", "notes_delayed"]
notification_level_features_df = notifications_level_features(v1_notifications[level_columns])

In [26]:
# One row per user: the v1 grade plus the per-user notification counts
# (inner join on "user", pd.merge default).
train_dataset = pd.merge(user_df, notification_level_features_df, left_on=["user"], right_on=["user"])


In [27]:
# Display the assembled dataset.
train_dataset

Unnamed: 0,user,v1,simulado_delayed_amount,simulado_incompleted_amount,notes_delayed_amount
0,11,0.00,4.0,52.0,152.0
1,12,4.00,0.0,0.0,1.0
2,13,6.50,0.0,0.0,10.0
3,14,6.50,0.0,0.0,0.0
4,15,6.00,0.0,0.0,38.0
5,16,6.00,1.0,0.0,26.0
6,17,0.50,2.0,0.0,158.0
7,18,1.50,1.0,0.0,26.0
8,19,7.50,0.0,0.0,46.0
9,20,2.00,0.0,0.0,4.0


# Export dataset 
Here I export the dataset so it can be used by our classifiers.

In [28]:
# Persist the dataset as CSV. The DataFrame index is written as well (to_csv
# default), so it comes back as an unnamed first column unless the reader
# passes index_col=0.
train_dataset.to_csv("../data/train_dataset.csv")