In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training event log into df_raw and preview its first rows
df_raw = pd.read_csv(filepath_or_buffer="../data/train_logs.csv")
df_raw.head(n=5)

**id** - The unique ID of the essay

**event_id** - The index of the event, ordered chronologically

**down_time** - The time of the down event in milliseconds

**up_time** - The time of the up event in milliseconds

**action_time** - The duration of the event (the difference between down_time and up_time)

**activity** - The category of activity which the event belongs to

- **Nonproduction** - The event does not alter the text in any way

- **Input** - The event adds text to the essay

- **Remove/Cut** - The event removes text from the essay

- **Paste** - The event changes the text through a paste input

- **Replace** - The event replaces a section of text with another string

- **Move From [x1, y1] To [x2, y2]** - The event moves the section of text spanning character indices x1 to y1 to a new location spanning x2 to y2

**down_event** - The name of the event when the key/mouse is pressed

**up_event** - The name of the event when the key/mouse is released

**text_change** - The text that changed as a result of the event (if any)

**cursor_position** - The character index of the text cursor after the event

**word_count** - The word count of the essay after the event

In [None]:
# Per-essay feature table: start with one row per distinct essay id.
# Later cells append feature columns to this frame one at a time.
essay_ids = df_raw["id"].unique()
df = pd.DataFrame(data={"id": essay_ids})
df.head()

In [None]:
# Final word count for each essay: the word_count recorded at the essay's
# last event (event_id is ordered chronologically). Taking the maximum
# instead would overstate the length whenever text was deleted after the peak.
df_word = (
    df_raw.sort_values(["id", "event_id"])
    .groupby("id")["word_count"]
    .last()
)

# Look the value up by id instead of merging so this cell is safe to re-run:
# a second merge would produce word_count_x / word_count_y columns.
df["word_count"] = df["id"].map(df_word)

In [None]:
# Minute index of every event: whole minutes elapsed at key-down
# (down_time is recorded in milliseconds)
df_raw["current_min"] = df_raw["down_time"].floordiv(60_000)

In [None]:
# Number of logged events per essay
df_event = df_raw.groupby("id")['event_id'].count()

# Assign by id lookup rather than merge-then-rename. Re-running the merge
# version appends a second "event_count" column, after which
# df["event_count"] returns a DataFrame instead of a Series.
df["event_count"] = df["id"].map(df_event)

In [None]:
# Time spent writing each essay, in whole minutes: the minute index
# (current_min) of the essay's last event.
df_time = df_raw.groupby("id")['current_min'].max()

# Assign by id lookup rather than merge-then-rename so the cell can be
# re-run without leaving duplicate "writing_time_min" columns behind.
df["writing_time_min"] = df["id"].map(df_time)

In [None]:
# Events per minute of writing.
# writing_time_min is floored to whole minutes, so it is 0 for any essay whose
# last event falls within the first minute; dividing by it would give inf.
# Non-positive durations are masked to NaN so the rate is missing, not infinite.
minutes_written = df['writing_time_min'].where(df['writing_time_min'] > 0)
df['events_per_min'] = df["event_count"] / minutes_written

In [None]:
# Number of events per essay that changed the text
# (rows whose text_change is anything other than "NoChange")
filter_text_change = df_raw["text_change"]!="NoChange"
df_filtered = df_raw[filter_text_change]
df_text_change = df_filtered.groupby("id")['text_change'].count()

# Drop the results of a previous run so that re-running this cell (and the
# rename that follows it) cannot leave duplicate columns in df.
df = df.drop(columns=["text_change", "text_change_count"], errors="ignore")

# Essays without any text-changing event are absent from df_text_change;
# record them as 0 changes rather than NaN.
df["text_change"] = df["id"].map(df_text_change).fillna(0).astype("int64")

In [None]:
# Give the per-essay text-change count a descriptive feature name
df = df.rename({"text_change": "text_change_count"}, axis="columns")

In [None]:
# Text changes per minute of writing.
# writing_time_min is floored to whole minutes and can therefore be 0;
# mask non-positive durations to NaN so the rate is missing instead of inf.
minutes_written = df['writing_time_min'].where(df['writing_time_min'] > 0)
df['text_changes_per_min'] = df["text_change_count"] / minutes_written

In [None]:
# Display the per-essay feature table built so far
df

In [None]:
# Per essay: the earliest and latest key-down timestamps, plus the
# activity label that occurs most often
df_freq = (
    df_raw
    .groupby("id")
    .agg({
        'down_time': ['min', 'max'],
        'activity': lambda acts: acts.value_counts().index[0],
    })
    .reset_index()
)

In [None]:
# Replace the column labels left by the aggregation with flat feature names
df_freq.columns = [
    'id',
    'start_time',
    'end_time',
    'most_frequent_activity',
]

In [None]:
# Display the per-essay start/end times and most frequent activity
df_freq

In [None]:
# Attach start/end time and most frequent activity to the feature table.
# Columns left behind by a previous run are dropped first; otherwise
# re-running the merge yields suffixed duplicates (start_time_x / start_time_y).
stale_cols = [col for col in df_freq.columns if col != "id" and col in df.columns]
df = pd.merge(df.drop(columns=stale_cols), df_freq, on="id", how='left')
df.head()

In [None]:
# Span between each essay's first and last key-down, and that span divided
# by the essay's number of events (mean elapsed time per event)
df['total_writing_time'] = df['end_time'].sub(df['start_time'])
df['Avg_time_between_events'] = df['total_writing_time'].div(df['event_count'])
df.head()

# Train labels (essay grades)


In [None]:
# Read the training scores into df_labels and preview the first rows
df_labels = pd.read_csv(filepath_or_buffer="../data/train_scores.csv")
df_labels.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Histogram of essay scores. An explicit Axes is used so the figure can be
# titled and labelled and still reads on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(df_labels['score'], bins=12)
ax.set(
    title="Distribution of essay scores",
    xlabel="score",
    ylabel="number of essays",
)
plt.show()

In [None]:
# Bar chart of how many essays received each score
essays_per_score = df_labels.groupby('score')['id'].count()
essays_per_score.plot.bar(width=0.8)