O objetivo deste notebook é criar o dataset final para teste e treinamento da classificação entre sucesso e insucesso do aluno (sucesso = passar por média, isto é, obter nota igual ou superior a 7,0).

As features deste modelo são baseadas apenas nas notificações e na quantidade de pendências.

In [1]:
import pandas as pd
from pandas.io.json import json_normalize
%matplotlib inline
from datetime import datetime, timedelta

# Load de dados

In [2]:
# Raw export read as a single frame; its rows are told apart by their "model" value further down.
df = pd.read_json("../../../data/raw_data/base_22012019.json")
# Grades spreadsheet; it is joined to the notifications through its "id_estudante" column later on.
grades_df = pd.read_excel("../../../data/raw_data/nota_provas.xls")

In [3]:
# Reference dates, presumably of the v1/v2/v3 exams (TODO confirm); only date_v1 is used by the cells shown here.
date_v1 = datetime(2018, 12, 10)
date_v2 = datetime(2019, 2, 11)
date_v3 = datetime(2019, 3, 18)

# Definição de datasets principais

In [4]:
# Partition the raw export by the value of its "model" column: one frame per record type.
notifications_df = df.loc[df["model"].eq("notifications.notification")]
resources_df = df.loc[df["model"].eq("topics.resource")]
pendencies_df = df.loc[df["model"].eq("pendencies.pendencies")]

In [5]:
def extract_flatten_dataframe(df, column, meta_list):
    """
    Expand the mapping stored in each row of ``df[column]`` into separate columns
    and join those columns back onto ``df``.

    Args:
        df: frame that contains ``column``.
        column: label of the column whose entries are handed to ``json_normalize``.
        meta_list: forwarded unchanged as the ``meta`` argument of ``json_normalize``.

    Returns:
        A frame with the expanded columns first, followed by every original
        column of ``df`` (``column`` itself included), outer-joined on the index.
    """
    expanded = json_normalize(data=df[column], meta=meta_list)
    # Give the expanded rows the source frame's index so the join pairs them row by row.
    expanded.index = df.index
    combined = expanded.join(df, how="outer")
    return combined

# Limpeza dos dados

In [6]:
def clean_notifications_df(dirty_notifications_df):
    """
    Flatten the ``fields`` payload of the notification records and give two of
    the resulting columns a proper dtype.

    Args:
        dirty_notifications_df: frame with a ``fields`` column holding one mapping
            per row; the mappings must provide ``level`` and ``creation_date``.

    Returns:
        A frame with the flattened field columns (``level`` as a categorical,
        ``creation_date`` parsed with ``pd.to_datetime``) followed by every
        original column, outer-joined on the original index.
    """
    meta_columns = ["meta", "task", "user", "level", "viewed", "creation_date"]
    flat_fields = json_normalize(data=dirty_notifications_df["fields"], meta=meta_columns)
    # Give the flattened rows the source index so the join below pairs them row by row.
    flat_fields.index = dirty_notifications_df.index
    typed_fields = flat_fields.astype({"level": pd.CategoricalDtype()}).assign(
        creation_date=lambda frame: pd.to_datetime(frame["creation_date"])
    )
    return typed_fields.join(dirty_notifications_df, how="outer")

In [7]:
def clean_resources_df(dirty_resources_df):
    """
    Flatten the ``fields`` payload of the resource records and discard the raw
    payload column.

    Args:
        dirty_resources_df: frame with a ``fields`` column holding one mapping per row.

    Returns:
        The frame produced by ``extract_flatten_dataframe`` without its
        ``fields`` column.
    """
    meta_columns = [
        "_my_subclass", "visible", "students", "groups", "tags", "all_students",
        "order", "topic", "show_window", "brief_description", "slug", "name",
    ]
    flattened = extract_flatten_dataframe(dirty_resources_df, "fields", meta_columns)
    return flattened.drop(columns=["fields"])

In [8]:
def clean_pendencies_df(dirty_pendencies_df):
    """
    Flatten the ``fields`` payload of the pendency records and normalise the
    dtypes of its date and action columns.

    Args:
        dirty_pendencies_df: frame with a ``fields`` column holding one mapping
            per row; the mappings must provide ``begin_date``, ``end_date``,
            ``limit_date`` and ``action``.

    Returns:
        The flattened frame (raw ``fields`` column kept) where the three date
        columns are parsed with ``pd.to_datetime`` and stripped of their
        timezone info, and ``action`` is a categorical.
    """
    meta_columns = ["action", "begin_date", "end_date", "limit_date", "resource"]
    pendencies = extract_flatten_dataframe(dirty_pendencies_df, "fields", meta_columns)

    def without_timezone(timestamp):
        # Drops tzinfo while keeping the wall-clock value unchanged.
        return timestamp.replace(tzinfo=None)

    for date_column in ("begin_date", "end_date", "limit_date"):
        parsed = pd.to_datetime(pendencies[date_column])
        pendencies[date_column] = parsed.apply(without_timezone)

    pendencies["action"] = pendencies["action"].astype("category")
    return pendencies

In [9]:
# Run each record type through its cleaner; the three calls are independent of one another.
cleaned_notification_df = clean_notifications_df(notifications_df)
cleaned_pendencies_df = clean_pendencies_df(pendencies_df)
cleaned_resources_df = clean_resources_df(resources_df)

# Junções para os datasets

In [10]:
# Attach each student's grades to their notifications (default inner join: user == id_estudante).
notifications_with_user_grades_df = pd.merge(
    cleaned_notification_df, grades_df, left_on=["user"], right_on=["id_estudante"]
)
# Attach to every pendency the resource it points at (resource == resource pk).
pendencies_with_resources = pd.merge(
    cleaned_pendencies_df, cleaned_resources_df, left_on=["resource"], right_on=["pk"]
)
# A notification's "task" is matched against "pk_x": the left-hand (pendency) "pk" after pandas
# suffixed the clashing column names in the previous merge.
notications_with_resources_pendencies = pd.merge(
    notifications_with_user_grades_df, pendencies_with_resources, left_on=["task"], right_on=["pk_x"]
)
# Keep only the rows whose end_date is not after the v1 date, and discard the v2 column.
v1_notifications = (
    notications_with_resources_pendencies
    .loc[lambda frame: frame["end_date"] <= date_v1]
    .drop(columns=["v2"])
)
# Index by creation time so later cells can bucket the rows by week with pd.Grouper.
v1_notifications.index = v1_notifications["creation_date"]
# One (user, v1) row per notification for now; duplicates are removed in the next cell.
# The selection already carries the "user" and "v1" labels, so no renaming is needed; the copy
# keeps user_df independent of v1_notifications.
user_df = v1_notifications[["user", "v1"]].copy()

In [11]:
# Collapse the per-notification rows down to the distinct (user, v1) pairs.
user_df = user_df.drop_duplicates()

# Adicionar Features no dataset

In [12]:
# Boolean flags per notification, combining a substring test on the resource name with the notification level.
# NOTE(review): the flag names suggest level 3 means "delayed" and level 4 means "incompleted" — confirm
# against the notification model before relying on these codes.
v1_notifications["simulado_delayed"] = v1_notifications["name"].str.contains("Simulado") & (v1_notifications["level"] == 3)
v1_notifications["simulado_incompleted"] = v1_notifications["name"].str.contains("Simulado") & (v1_notifications["level"] == 4)
v1_notifications["notes_delayed"] = v1_notifications["name"].str.contains("Aula") & (v1_notifications["level"] == 3)

In [13]:
def set_weeks_apart(df, date, date_column, week_diff_column):
    """
        Compute, for every row of df, how many whole weeks separate the value in date_column from date.

        Each entry is (date - value).days // 7, so values later than date yield negative numbers.
        Nothing is written to df: the week counts are returned as a new Series and it is up to the
        caller to store them. week_diff_column is accepted but not used.
    """
    def whole_weeks_until_reference(moment):
        elapsed = date - moment
        return elapsed.days // 7

    return df[date_column].apply(whole_weeks_until_reference)

def create_user_notification_count_features(dirty_v1_notifications_df, v1_date):
    """
        Summarise, per user and per calendar week, how often notifications were viewed.

        dirty_v1_notifications_df must carry a "user" column, a "viewed" column and a datetime index
        named "creation_date": the weekly buckets come from pd.Grouper(freq="W") applied to that index,
        and the index name becomes the "creation_date" column of the result.
        v1_date is the reference date from which the week offset of every bucket is measured.

        Returns a frame with one row per (user, week) holding: user, creation_date (the weekly bucket
        label), average_amount_viewed (mean of "viewed"), notifications_viewed (sum of "viewed") and
        amount_of_weeks_prior_v1 ((v1_date - creation_date).days // 7).
        The columns keep the two-level header left behind by reset_index() on the two-level aggregate
        (second level is the empty string); later cells index that level explicitly, so it is not
        flattened here.
    """
    # Aggregate the frame that was passed in, not a notebook-level global.
    weekly_views = dirty_v1_notifications_df.groupby(["user", pd.Grouper(freq="W")])\
        .agg({"viewed": ["mean", "sum"]}).reset_index()
    # Explicit copy: the columns added below must land on an independent frame, not on a slice of
    # weekly_views (which is what triggers pandas' SettingWithCopyWarning).
    viewed_df = weekly_views[["user", "creation_date"]].copy()
    viewed_df["mean"] = weekly_views["viewed"]["mean"]
    viewed_df["count"] = weekly_views["viewed"]["sum"]
    viewed_df["amount_of_weeks_prior_v1"] = set_weeks_apart(viewed_df, v1_date, "creation_date", "test")

    return viewed_df.rename(columns={"count": "notifications_viewed", "mean": "average_amount_viewed"})

In [14]:
# Columns passed to the weekly view-count aggregation; the row index travels along with the selection.
time_related_columns = ["user", "v1", "creation_date", "viewed"]
time_features_related_df = create_user_notification_count_features(v1_notifications[time_related_columns], date_v1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [24]:
# Inspect the column header of the weekly feature frame (it is two-level, see the output).
time_features_related_df.columns

MultiIndex(levels=[['amount_of_weeks_prior_v1', 'average_amount_viewed', 'creation_date', 'notifications_viewed', 'user'], ['']],
           codes=[[4, 2, 1, 3, 0], [0, 0, 0, 0, 0]])

In [28]:
# Preview only (result is not stored): one row per user, one column per week offset, each cell
# aggregated with pivot_table's default (mean) over average_amount_viewed.
pd.pivot_table(time_features_related_df, index=["user"], columns=["amount_of_weeks_prior_v1"], values=["average_amount_viewed"])

  new_axis = axis.drop(labels, errors=errors)


Unnamed: 0_level_0,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed,average_amount_viewed
amount_of_weeks_prior_v1,-7,-6,-5,-4,-2,-1,0,1,2,3,4
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
11,0.0,0.000000,0.0,0.0,0.0,0.0,0.157895,0.000000,0.147541,0.000000,0.000000
12,,,,,,,,0.200000,0.142857,0.714286,0.625000
13,0.5,0.000000,0.0,0.0,0.0,0.0,0.142857,0.545455,0.083333,0.562500,0.444444
14,0.0,0.285714,0.0,0.0,0.0,0.2,0.428571,0.142857,0.600000,0.666667,0.333333
15,0.0,0.000000,0.0,0.0,0.0,0.0,0.142857,0.250000,0.521739,0.145833,0.000000
16,0.0,0.000000,0.0,0.0,0.0,0.2,0.375000,0.393939,0.156250,0.187500,0.375000
17,0.5,0.000000,0.0,0.0,0.0,0.0,0.780488,0.000000,0.000000,0.200000,0.071429
18,,,,,,,0.428571,0.000000,0.000000,0.500000,0.277778
19,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.157895,0.000000,0.800000
20,,,,,,,1.000000,0.250000,0.166667,0.500000,0.285714


In [112]:
# Wide table of notifications_viewed: one row per user, one column per week offset;
# (user, week) combinations with no data are filled with 0.
time_features_df = pd.pivot_table(
    time_features_related_df,
    index=["user"],
    columns=["amount_of_weeks_prior_v1"],
    values=["notifications_viewed"],
    fill_value=0,
)

  new_axis = axis.drop(labels, errors=errors)


In [113]:
# Inspect the pivot's column header: three levels, the middle one being the empty string.
time_features_df.columns

MultiIndex(levels=[['amount_of_weeks_prior_v1', 'average_amount_viewed', 'creation_date', 'notifications_viewed', 'user'], [''], [-7, -6, -5, -4, -2, -1, 0, 1, 2, 3, 4]],
           codes=[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]],
           names=[None, None, 'amount_of_weeks_prior_v1'])

In [114]:
# Display the pivoted notifications_viewed table.
time_features_df

Unnamed: 0_level_0,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed,notifications_viewed
amount_of_weeks_prior_v1,-7,-6,-5,-4,-2,-1,0,1,2,3,4
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
11,0,0,0,0,0,0,9,0,9,0,0
12,0,0,0,0,0,0,0,1,1,5,5
13,1,0,0,0,0,0,1,6,2,9,4
14,0,2,0,0,0,1,3,1,6,6,3
15,0,0,0,0,0,0,1,3,12,7,0
16,0,0,0,0,0,1,3,13,5,3,3
17,3,0,0,0,0,0,32,0,0,7,1
18,0,0,0,0,0,0,3,0,0,5,5
19,0,0,0,0,0,0,0,0,3,0,4
20,0,0,0,0,0,0,1,1,1,5,4


In [87]:
# Spot-check the series for week offset 3; the "" key steps through the blank middle level of the column header.
# NOTE(review): this cell's execution count (87) is lower than the pivot cell's (112), so the stored output
# (floats with NaN) may come from an earlier version of time_features_df — re-run in order to refresh it.
time_features_df["notifications_viewed"][""][3]

user
11     0.0
12     5.0
13     9.0
14     6.0
15     7.0
16     3.0
17     7.0
18     5.0
19     0.0
20     5.0
21     1.0
22    11.0
23    10.0
24     7.0
25     4.0
26     2.0
27     6.0
28     3.0
29     0.0
30     7.0
31     8.0
32     7.0
33     2.0
34     7.0
35     3.0
36     4.0
37     4.0
38     7.0
39     5.0
40     0.0
      ... 
59     8.0
60     4.0
61     1.0
62     NaN
63     8.0
64     2.0
65    16.0
66     9.0
67     4.0
68     3.0
69     0.0
70     5.0
71     2.0
72     0.0
73     6.0
74     5.0
75    12.0
76     7.0
77     5.0
78     8.0
79     8.0
80     9.0
81     3.0
82     8.0
83     4.0
84     0.0
85     4.0
86     5.0
87    11.0
88     1.0
Name: 3, Length: 78, dtype: float64

In [115]:
# Re-check the column header of the weekly (long-format) feature frame.
time_features_related_df.columns

MultiIndex(levels=[['amount_of_weeks_prior_v1', 'average_amount_viewed', 'creation_date', 'notifications_viewed', 'user'], ['']],
           codes=[[4, 2, 1, 3, 0], [0, 0, 0, 0, 0]])

In [116]:
# The pivot is indexed by user id; the final feature frame built below is given the same index.
time_features_df.index

Int64Index([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
            45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
            62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
            79, 80, 81, 82, 83, 84, 85, 86, 87, 88],
           dtype='int64', name='user')

In [117]:
# Skeleton of the final weekly-feature frame: one row per distinct user, indexed by the user id
# (the "user" column is kept for now).
time_features_final_df = pd.DataFrame({"user": time_features_related_df["user"].unique()})
time_features_final_df.index = time_features_final_df["user"]

In [118]:
# Check that the skeleton's index matches the pivot's user index before copying columns across.
time_features_final_df.index

Int64Index([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
            45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
            62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
            79, 80, 81, 82, 83, 84, 85, 86, 87, 88],
           dtype='int64', name='user')

In [119]:
# Copy the viewed-notification counts for week offsets 0 to 4 out of the pivot, one flat column per offset.
# The pairs are listed in the order the columns must appear in the result; values align on the user index.
for feature_name, weeks_ago in [
    ("notifications_viewed_three_weeks_ago", 3),
    ("notifications_viewed_four_weeks_ago", 4),
    ("notifications_viewed_two_weeks_ago", 2),
    ("notifications_viewed_one_week_ago", 1),
    ("notifications_viewed_zero_weeks_ago", 0),
]:
    time_features_final_df[feature_name] = time_features_df["notifications_viewed"][""][weeks_ago]

In [120]:
# "user" is already the index, so the duplicate column can go.
# NOTE(review): not idempotent — running this cell a second time fails because the "user" column no longer exists.
time_features_final_df = time_features_final_df.drop("user", axis=1)
time_features_final_df.head(10)

Unnamed: 0_level_0,notifications_viewed_three_weeks_ago,notifications_viewed_four_weeks_ago,notifications_viewed_two_weeks_ago,notifications_viewed_one_week_ago,notifications_viewed_zero_weeks_ago
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11,0,0,9,0,9
12,5,5,1,1,0
13,9,4,2,6,1
14,6,3,6,1,3
15,7,0,12,3,1
16,3,3,5,13,3
17,7,1,0,0,32
18,5,5,0,0,3
19,0,4,3,0,0
20,5,4,1,1,1


In [121]:
def notifications_level_features(notifications_dataset):
    """
    Count, for every user, how many notifications carry each of the three flags.

    Args:
        notifications_dataset: frame with the columns ``user``, ``v1``,
            ``simulado_delayed``, ``simulado_incompleted`` and ``notes_delayed``.

    Returns:
        A frame with one row per user and the columns ``user``,
        ``simulado_delayed_amount``, ``simulado_incompleted_amount`` and
        ``notes_delayed_amount`` (each the per-user sum of the matching flag).
        ``v1`` is aggregated as well but is not part of the result.
    """
    aggregation_spec = {
        "simulado_delayed": ["sum"],
        "simulado_incompleted": ["sum"],
        "notes_delayed": ["sum"],
        "v1": ["first"],
    }
    per_user = notifications_dataset.groupby("user").agg(aggregation_spec).reset_index()

    # Start from the user column alone, then add one flat "<flag>_amount" column per flag.
    features_df = pd.DataFrame(per_user["user"])
    for flag in ("simulado_delayed", "simulado_incompleted", "notes_delayed"):
        features_df[flag + "_amount"] = per_user[flag]["sum"]
    return features_df
    

In [122]:
# Columns needed to count the flagged notifications per user.
level_columns = ["user", "v1", "simulado_delayed", "simulado_incompleted", "notes_delayed"]
notification_level_features_df = notifications_level_features(v1_notifications[level_columns])

In [123]:
# Attach the per-user flag counts to the distinct (user, v1) pairs.
train_dataset = pd.merge(user_df, notification_level_features_df, on=["user"])


In [124]:
# Add the weekly view counts. "user" is the index name of time_features_final_df (its column was dropped
# earlier) and a regular column of train_dataset; merge resolves the label on both sides.
# NOTE(review): this cell rebinds train_dataset from itself, so running it twice merges twice — re-run the
# previous cell first.
train_dataset = pd.merge(time_features_final_df, train_dataset, left_on=["user"], right_on=["user"])

In [125]:
# Preview the assembled dataset: weekly view counts, the v1 grade and the per-user flag counts.
train_dataset.head(10)

Unnamed: 0,user,notifications_viewed_three_weeks_ago,notifications_viewed_four_weeks_ago,notifications_viewed_two_weeks_ago,notifications_viewed_one_week_ago,notifications_viewed_zero_weeks_ago,v1,simulado_delayed_amount,simulado_incompleted_amount,notes_delayed_amount
0,11,0,0,9,0,9,0.0,4.0,52.0,152.0
1,12,5,5,1,1,0,4.0,0.0,0.0,1.0
2,13,9,4,2,6,1,6.5,0.0,0.0,10.0
3,14,6,3,6,1,3,6.5,0.0,0.0,0.0
4,15,7,0,12,3,1,6.0,0.0,0.0,38.0
5,16,3,3,5,13,3,6.0,1.0,0.0,26.0
6,17,7,1,0,0,32,0.5,2.0,0.0,158.0
7,18,5,5,0,0,3,1.5,1.0,0.0,26.0
8,19,0,4,3,0,0,7.5,0.0,0.0,46.0
9,20,5,4,1,1,1,2.0,0.0,0.0,4.0


In [126]:
# Same preview as the previous cell; train_dataset has not changed in between.
train_dataset.head(10)

Unnamed: 0,user,notifications_viewed_three_weeks_ago,notifications_viewed_four_weeks_ago,notifications_viewed_two_weeks_ago,notifications_viewed_one_week_ago,notifications_viewed_zero_weeks_ago,v1,simulado_delayed_amount,simulado_incompleted_amount,notes_delayed_amount
0,11,0,0,9,0,9,0.0,4.0,52.0,152.0
1,12,5,5,1,1,0,4.0,0.0,0.0,1.0
2,13,9,4,2,6,1,6.5,0.0,0.0,10.0
3,14,6,3,6,1,3,6.5,0.0,0.0,0.0
4,15,7,0,12,3,1,6.0,0.0,0.0,38.0
5,16,3,3,5,13,3,6.0,1.0,0.0,26.0
6,17,7,1,0,0,32,0.5,2.0,0.0,158.0
7,18,5,5,0,0,3,1.5,1.0,0.0,26.0
8,19,0,4,3,0,0,7.5,0.0,0.0,46.0
9,20,5,4,1,1,1,2.0,0.0,0.0,4.0


In [127]:
# After the merges the frame has a plain positional index; the user id lives in the "user" column.
train_dataset.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77],
           dtype='int64')

# Export dataset 
Here I export the dataset so that it can be used by our classifiers.

In [128]:
# Persist the assembled features. With default arguments to_csv also writes the positional row index as the
# first, unnamed column.
# NOTE(review): this path starts at "../data", unlike the "../../../data" prefix used when loading — confirm
# that it is the intended location.
train_dataset.to_csv("../data/train_dataset.csv")