In [1]:
import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_csv("./data/train_logs.csv")
df_raw.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


**id** - The unique ID of the essay

**event_id** - The index of the event, ordered chronologically

**down_time** - The time of the down event in milliseconds

**up_time** - The time of the up event in milliseconds

**action_time** - The duration of the event (the difference between down_time and up_time)

**activity** - The category of activity which the event belongs to

- **Nonproduction** - The event does not alter the text in any way

- **Input** - The event adds text to the essay

- **Remove/Cut** - The event removes text from the essay

- **Paste** - The event changes the text through a paste input

- **Replace** - The event replaces a section of text with another string

- **Move From [x1, y1] To [x2, y2]** - The event moves a section of text spanning character index x1, y1 to a new location x2, y2

**down_event** - The name of the event when the key/mouse is pressed

**up_event** - The name of the event when the key/mouse is released

**text_change** - The text that changed as a result of the event (if any)

**cursor_position** - The character index of the text cursor after the event

**word_count** - The word count of the essay after the event

In [3]:
# Creating a new df based on the number of essays
# More features (columns) will be added to this df

df = pd.DataFrame({
    'id': df_raw["id"].unique()
})
df.head()

Unnamed: 0,id
0,001519c8
1,0022f953
2,0042269b
3,0059420b
4,0075873a


In [4]:
# Calculating the final word count for each essay
df_word = df_raw.groupby("id")['word_count'].max()
df = pd.merge(df, df_word, on="id", how="left")

In [5]:
# Turning milliseconds into minutes
df_raw['current_min'] = df_raw["down_time"]//60000

In [6]:
#Counting the number of events per essay
df_event = df_raw.groupby("id")['event_id'].count()

df = pd.merge(df, df_event, on="id", how="left")
df = df.rename(columns={"event_id": "event_count"})

In [7]:
# Calculating the time spend on writing the essay
df_time = df_raw.groupby("id")['current_min'].max()
df = pd.merge(df, df_time, on="id", how="left")
df = df.rename(columns={"current_min": "writing_time_min"})

In [8]:
# Calculating events per minute
df['events_per_min'] = df["event_count"] / df['writing_time_min']

In [9]:
# Calculating the number of TEXT changes made during the writing process
filter_text_change = df_raw["text_change"]!="NoChange"
df_filtered = df_raw[filter_text_change]
df_text_change = df_filtered.groupby("id")['text_change'].count()

df = pd.merge(df, df_text_change, on="id", how="left")

In [10]:
df = df.rename(columns={"text_change": "text_change_count"})

In [11]:
# Calculating text changes per minute
df['text_changes_per_min'] = df["text_change_count"] / df['writing_time_min']

In [12]:
# Calculating the number of sentences (based on the number of full stops)
sentence_filter = df_raw[(df_raw["text_change"].str.match('\.')) & (df_raw['activity'] != 'Remove/Cut')]
sentence_filter = df_raw[df_raw["up_event"] == '.']
df_sentence_count = sentence_filter.groupby("id")['text_change'].count()

df = pd.merge(df, df_sentence_count, on="id", how="left")

In [13]:
df = df.rename(columns={"text_change": "sentence_count"})

In [14]:
# Calculating total number of different activities
for activity in ['Input', 'Remove/Cut', 'Paste', 'Replace', 'Nonproduction']:
    df_activity_count = df_raw[df_raw['activity'] == activity].groupby("id")['activity'].count()
    column_name = f'{activity}_count'
    df_activity_count.name = column_name
    df = pd.merge(df, df_activity_count, on="id", how="left")
    df[column_name] = df[column_name].fillna(0)

In [15]:
# Pauses
df_raw['IKI'] = df_raw['down_time'] - df_raw.groupby('id')['up_time'].shift(1)
df_raw['IKI'] = df_raw['IKI'].fillna(0)
df_raw['IKI'] = df_raw['IKI'].clip(0)

PAUSE_THRESHOLD = 2000

# Total number of pauses (over 2000 ms) during writing process
pauses_count = df_raw[df_raw['IKI'] > PAUSE_THRESHOLD].groupby('id')['IKI'].count()
pauses_count.name = 'pauses_count'
df = pd.merge(df, pauses_count, on="id", how="left")

In [16]:
# Average duration of a pause
pauses_mean_duration = df_raw[df_raw['IKI'] > PAUSE_THRESHOLD].groupby('id')['IKI'].mean().round(0)
pauses_mean_duration.name = 'pause_mean_duration'
df = pd.merge(df, pauses_mean_duration, on="id", how="left")

In [17]:
# Calculating total time of pauses
pauses_time_sum = df_raw[df_raw['IKI'] > PAUSE_THRESHOLD].groupby('id')['IKI'].sum()
pauses_time_sum.name = 'pauses_time_sum'
df = pd.merge(df, pauses_time_sum, on="id", how="left")

In [18]:
# Calculating writing time in ms
writing_time_ms = df_raw.groupby('id')['up_time'].max()
writing_time_ms.name = 'writing_time_ms'
df = pd.merge(df, writing_time_ms, on="id", how="left")

In [19]:
# Average number of pauses per minute
df['pause_per_min'] = (df["pauses_count"] / df['writing_time_min']).round(2)

# Proportion of pauses during the writing process
df['pauses_share'] = (df["pauses_time_sum"] / df['writing_time_ms']).round(2)

In [20]:
df

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share
0,001519c8,256,2557,30,85.233333,2437,81.233333,21.0,2010,417.0,0.0,7.0,120,124,9537.0,1182600.0,1801969,4.13,0.66
1,0022f953,323,2454,29,84.620690,2200,75.862069,15.0,1938,260.0,1.0,1.0,254,80,14678.0,1174268.0,1788969,2.76,0.66
2,0042269b,404,4136,29,142.620690,3961,136.586207,21.0,3515,439.0,0.0,7.0,175,77,14431.0,1111167.0,1771669,2.66,0.63
3,0059420b,206,1556,23,67.652174,1457,63.347826,13.0,1304,151.0,1.0,1.0,99,87,8828.0,768075.0,1404469,3.78,0.55
4,0075873a,252,2531,27,93.740741,2459,91.074074,23.0,1942,517.0,0.0,0.0,72,88,11880.0,1045463.0,1662472,3.26,0.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,461,4739,29,163.413793,4550,156.896552,43.0,3588,960.0,0.0,2.0,189,41,23605.0,967796.0,1791649,1.41,0.54
2467,ffbef7e5,438,2604,29,89.793103,2456,84.689655,31.0,2395,60.0,0.0,1.0,148,90,10771.0,969383.0,1799174,3.10,0.54
2468,ffccd6fd,201,3063,32,95.718750,2937,91.781250,5.0,2849,88.0,0.0,0.0,126,87,13101.0,1139801.0,1959363,2.72,0.58
2469,ffec5b38,413,3242,25,129.680000,3171,126.840000,31.0,2895,276.0,0.0,0.0,71,63,11720.0,738365.0,1508504,2.52,0.49


In [21]:
# calcullating the start and end time of writing and most frequent activity
df_freq = df_raw.groupby("id").agg({
    'down_time': ['min', 'max'],
    'activity': lambda x: x.value_counts().index[0]
}).reset_index()

In [22]:
df_freq.columns = ['id', 'start_time', 'end_time', 'most_frequent_activity']

In [23]:
df_freq

Unnamed: 0,id,start_time,end_time,most_frequent_activity
0,001519c8,4526,1801877,Input
1,0022f953,30623,1788842,Input
2,0042269b,4441,1771219,Input
3,0059420b,41395,1404394,Input
4,0075873a,78470,1662390,Input
...,...,...,...,...
2466,ffb8c745,22467,1791581,Input
2467,ffbef7e5,21732,1799124,Input
2468,ffccd6fd,23482,1959273,Input
2469,ffec5b38,19885,1508335,Input


In [24]:
df = pd.merge(df, df_freq, on="id", how='left')
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share,start_time,end_time,most_frequent_activity
0,001519c8,256,2557,30,85.233333,2437,81.233333,21.0,2010,417.0,...,120,124,9537.0,1182600.0,1801969,4.13,0.66,4526,1801877,Input
1,0022f953,323,2454,29,84.62069,2200,75.862069,15.0,1938,260.0,...,254,80,14678.0,1174268.0,1788969,2.76,0.66,30623,1788842,Input
2,0042269b,404,4136,29,142.62069,3961,136.586207,21.0,3515,439.0,...,175,77,14431.0,1111167.0,1771669,2.66,0.63,4441,1771219,Input
3,0059420b,206,1556,23,67.652174,1457,63.347826,13.0,1304,151.0,...,99,87,8828.0,768075.0,1404469,3.78,0.55,41395,1404394,Input
4,0075873a,252,2531,27,93.740741,2459,91.074074,23.0,1942,517.0,...,72,88,11880.0,1045463.0,1662472,3.26,0.63,78470,1662390,Input


In [25]:
# Calculating Average action time
df['total_writing_time'] = df['end_time'] - df['start_time']
df['Avg_time_between_events'] = df['total_writing_time'] / df['event_count']
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events
0,001519c8,256,2557,30,85.233333,2437,81.233333,21.0,2010,417.0,...,9537.0,1182600.0,1801969,4.13,0.66,4526,1801877,Input,1797351,702.913962
1,0022f953,323,2454,29,84.62069,2200,75.862069,15.0,1938,260.0,...,14678.0,1174268.0,1788969,2.76,0.66,30623,1788842,Input,1758219,716.47066
2,0042269b,404,4136,29,142.62069,3961,136.586207,21.0,3515,439.0,...,14431.0,1111167.0,1771669,2.66,0.63,4441,1771219,Input,1766778,427.170696
3,0059420b,206,1556,23,67.652174,1457,63.347826,13.0,1304,151.0,...,8828.0,768075.0,1404469,3.78,0.55,41395,1404394,Input,1362999,875.963368
4,0075873a,252,2531,27,93.740741,2459,91.074074,23.0,1942,517.0,...,11880.0,1045463.0,1662472,3.26,0.63,78470,1662390,Input,1583920,625.807981


In [26]:
# If overtime writing (if the writing time exceeded 30 minutes)
df['overtime_writing'] = df['writing_time_min'] > 30

In [27]:
# Average Action Time - average duration of the actions for each essay.
# Summing the durations of all actions in an essay and then dividing by the total number of actions.
# I leave values in the milliseconds due to it is very short in duration

total_action_time = df_raw.groupby('id')['action_time'].sum()
df = pd.merge(df, total_action_time, on='id', how='left')

df['average_action_time'] = df['action_time'] / df['event_count']
df = df.drop(columns=['action_time'])

In [28]:
# Total num of characters in the essay, including spaces etc:

# 'Input' and 'Paste' events, count each event as one character added
# "Remove/Cut" event as one character removed.
# I think the "Replace" action does not affect the total character count, as it's essentially a one-for-one substitution.

df_raw['chars_added'] = df_raw['activity'].apply(lambda x: 1 if x in ['Input', 'Paste'] else 0)
df_raw['chars_removed'] = df_raw['activity'].apply(lambda x: 1 if x == 'Remove/Cut' else 0)

total_chars_added = df_raw.groupby('id')['chars_added'].sum()
total_chars_removed = df_raw.groupby('id')['chars_removed'].sum()

df = pd.merge(df, total_chars_added, on='id', how='left')
df = pd.merge(df, total_chars_removed, on='id', how='left')

df['total_characters'] = df['chars_added'] - df['chars_removed']
df.drop(columns=['chars_added', 'chars_removed'], inplace=True)

In [29]:
# Average Characters per Minute - dividion of the total number of characters by the total writing time in minutes
# I left these values in float type, not rounded to the int, to keep the precision, but it can be converted to the int if needed
df['avg_characters_per_min'] = df['total_characters'] / df['writing_time_min']
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,pauses_share,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min
0,001519c8,256,2557,30,85.233333,2437,81.233333,21.0,2010,417.0,...,0.66,4526,1801877,Input,1797351,702.913962,False,116.246774,1593,53.1
1,0022f953,323,2454,29,84.62069,2200,75.862069,15.0,1938,260.0,...,0.66,30623,1788842,Input,1758219,716.47066,False,112.221271,1679,57.896552
2,0042269b,404,4136,29,142.62069,3961,136.586207,21.0,3515,439.0,...,0.63,4441,1771219,Input,1766778,427.170696,False,101.837766,3076,106.068966
3,0059420b,206,1556,23,67.652174,1457,63.347826,13.0,1304,151.0,...,0.55,41395,1404394,Input,1362999,875.963368,False,121.848329,1154,50.173913
4,0075873a,252,2531,27,93.740741,2459,91.074074,23.0,1942,517.0,...,0.63,78470,1662390,Input,1583920,625.807981,False,123.943896,1425,52.777778


# **Multicorrelation**

In [30]:
df_labels = pd.read_csv("./data/train_scores.csv")
df = pd.merge(df, df_labels, on='id', how='left')


In [31]:
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min,score
0,001519c8,256,2557,30,85.233333,2437,81.233333,21.0,2010,417.0,...,4526,1801877,Input,1797351,702.913962,False,116.246774,1593,53.1,3.5
1,0022f953,323,2454,29,84.62069,2200,75.862069,15.0,1938,260.0,...,30623,1788842,Input,1758219,716.47066,False,112.221271,1679,57.896552,3.5
2,0042269b,404,4136,29,142.62069,3961,136.586207,21.0,3515,439.0,...,4441,1771219,Input,1766778,427.170696,False,101.837766,3076,106.068966,6.0
3,0059420b,206,1556,23,67.652174,1457,63.347826,13.0,1304,151.0,...,41395,1404394,Input,1362999,875.963368,False,121.848329,1154,50.173913,2.0
4,0075873a,252,2531,27,93.740741,2459,91.074074,23.0,1942,517.0,...,78470,1662390,Input,1583920,625.807981,False,123.943896,1425,52.777778,4.0


In [32]:
#Pearson correlation matrix

matrix = df.corr()
matrix


  matrix = df.corr()


Unnamed: 0,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,Paste_count,...,pauses_share,start_time,end_time,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min,score
word_count,1.0,0.801048,0.043363,0.767413,0.852903,0.808038,0.717335,0.922701,0.326836,0.049686,...,-0.517431,-0.081238,0.045839,0.105272,-0.644409,-0.035874,-0.054972,0.964983,0.885028,0.635948
event_count,0.801048,1.0,0.107846,0.932891,0.949,0.873101,0.672225,0.929272,0.712464,0.04517,...,-0.623569,-0.096051,0.108531,0.184902,-0.752627,0.018104,-0.166823,0.848116,0.749144,0.590769
writing_time_min,0.043363,0.107846,1.0,-0.170657,0.096823,-0.189232,0.068898,0.082463,0.116487,0.005996,...,0.080535,0.459837,0.997523,0.802278,0.167552,0.401424,-0.012288,0.058069,-0.239333,0.021556
events_per_min,0.767413,0.932891,-0.170657,1.0,0.88919,0.951885,0.634233,0.876203,0.648027,0.042465,...,-0.661923,-0.119585,-0.169109,-0.108192,-0.796024,-0.079862,-0.160299,0.807348,0.857168,0.572806
text_change_count,0.852903,0.949,0.096823,0.88919,1.0,0.927472,0.69785,0.98235,0.739912,0.042662,...,-0.619109,-0.096413,0.098008,0.173439,-0.735131,0.007513,-0.105449,0.900828,0.803225,0.610146
text_changes_per_min,0.808038,0.873101,-0.189232,0.951885,0.927472,1.0,0.651913,0.916605,0.666706,0.039959,...,-0.653532,-0.119514,-0.187202,-0.128365,-0.773279,-0.090058,-0.098781,0.848186,0.907571,0.584598
sentence_count,0.717335,0.672225,0.068898,0.634233,0.69785,0.651913,1.0,0.723481,0.380592,0.039848,...,-0.455678,-0.070832,0.071212,0.126541,-0.532014,-0.022321,-0.049407,0.71662,0.647562,0.438797
Input_count,0.922701,0.929272,0.082463,0.876203,0.98235,0.916605,0.723481,1.0,0.601039,0.046006,...,-0.606202,-0.095602,0.084029,0.157348,-0.727669,-0.003069,-0.089933,0.966139,0.870202,0.643891
Remove/Cut_count,0.326836,0.712464,0.116487,0.648027,0.739912,0.666706,0.380592,0.601039,1.0,0.012376,...,-0.465483,-0.068012,0.115919,0.174384,-0.523695,0.042398,-0.127228,0.374471,0.302723,0.290776
Paste_count,0.049686,0.04517,0.005996,0.042465,0.042662,0.039959,0.039848,0.046006,0.012376,1.0,...,-0.055578,0.00576,0.004636,0.001308,-0.037851,0.010601,-0.015522,0.050543,0.04625,0.033638



Multicollinearity is indicated by the fact that the correlation between some pairs of features is stronger than their correlation with the dependent variable (score). For example the matrix shows the maximum
correlation between the characteristics total_writing_time and writing_time_min (0.99) and it is greater than the correlation of either argument with the investigated characteristic, so we can draw the conclusion that these 2 characteristics are too strongly related to each other and in the same model both should not appear at the same time.

To simplify the task we will choose a treshold of 0.9, considering that to be a very strong correlation.

We will start with writing down pairs of features that are highly correlated.

#total_characters - word_count

events_per_min - event_count

text_change_count - event_count

Input_count - event_count

#end_time - writing_time_min

#writing_time_ms - writing_time_min

text_changes_per_min - events_per_min

total_characters - text_change_count

text_changes_per_min - text_change_count

Input_count - text_change_count

text_changes_per_min - Input_count

text_changes_per_min - avg_characters_per_min

#Input_count - word_count

Input_count - total_characters

#writing_time_ms - end_time

#total_characters - avg_characters_per_min

#pause_per_min - pause_count


In [33]:
import copy
df_model = copy.deepcopy(df)


In [34]:
#We will drop Input_count, writing_time_min, end_time, avg_characters_per_min, pause_per_min,
#events_per_min, total_characters,text_change_count.


#Drop one feature from the pair.

columns_to_drop = ['Input_count', 'writing_time_min', 'end_time', 'avg_characters_per_min',  'pause_per_min', 'events_per_min', 'total_characters', 'text_change_count']
df_model.drop(columns=columns_to_drop , inplace=True)
df_model.corr()



  df_model.corr()


Unnamed: 0,word_count,event_count,text_changes_per_min,sentence_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pauses_share,start_time,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,score
word_count,1.0,0.801048,0.808038,0.717335,0.326836,0.049686,0.105441,0.102451,0.015488,-0.23175,-0.403282,0.046095,-0.517431,-0.081238,0.105272,-0.644409,-0.035874,-0.054972,0.635948
event_count,0.801048,1.0,0.873101,0.672225,0.712464,0.04517,0.229385,0.45482,0.16217,-0.323404,-0.461638,0.108684,-0.623569,-0.096051,0.184902,-0.752627,0.018104,-0.166823,0.590769
text_changes_per_min,0.808038,0.873101,1.0,0.651913,0.666706,0.039959,0.167368,0.117737,0.016333,-0.330331,-0.611577,-0.187051,-0.653532,-0.119514,-0.128365,-0.773279,-0.090058,-0.098781,0.584598
sentence_count,0.717335,0.672225,0.651913,1.0,0.380592,0.039848,0.109955,0.136633,0.057748,-0.224486,-0.337745,0.071457,-0.455678,-0.070832,0.126541,-0.532014,-0.022321,-0.049407,0.438797
Remove/Cut_count,0.326836,0.712464,0.666706,0.380592,1.0,0.012376,0.121154,0.143867,0.21776,-0.270896,-0.326896,0.115913,-0.465483,-0.068012,0.174384,-0.523695,0.042398,-0.127228,0.290776
Paste_count,0.049686,0.04517,0.039959,0.039848,0.012376,1.0,0.087775,0.021122,0.022311,-0.03318,-0.043121,0.004605,-0.055578,0.00576,0.001308,-0.037851,0.010601,-0.015522,0.033638
Replace_count,0.105441,0.229385,0.167368,0.109955,0.121154,0.087775,1.0,0.144756,0.254517,-0.14884,-0.038076,0.073827,-0.090996,-0.033816,0.10477,-0.1867,0.062119,-0.008482,0.186259
Nonproduction_count,0.102451,0.45482,0.117737,0.136633,0.143867,0.021122,0.144756,1.0,0.083941,-0.107784,-0.144461,0.06348,-0.206377,-0.028826,0.089839,-0.283309,0.035542,-0.225209,0.128851
pauses_count,0.015488,0.16217,0.016333,0.057748,0.21776,0.022311,0.254517,0.083941,1.0,-0.606652,-0.071044,0.296913,-0.260803,-0.00041,0.33053,-0.131413,0.126449,-0.084425,0.097238
pause_mean_duration,-0.23175,-0.323404,-0.330331,-0.224486,-0.270896,-0.03318,-0.14884,-0.107784,-0.606652,1.0,0.550616,0.086853,0.592782,-0.046102,0.127407,0.471069,0.109446,0.060685,-0.190997


In [44]:
df_model = df_model.drop(columns='score')
df_model.head()

Unnamed: 0,id,word_count,event_count,text_changes_per_min,sentence_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pauses_share,start_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time
0,001519c8,256,2557,81.233333,21.0,417.0,0.0,7.0,120,124,9537.0,1182600.0,1801969,0.66,4526,Input,1797351,702.913962,False,116.246774
1,0022f953,323,2454,75.862069,15.0,260.0,1.0,1.0,254,80,14678.0,1174268.0,1788969,0.66,30623,Input,1758219,716.47066,False,112.221271
2,0042269b,404,4136,136.586207,21.0,439.0,0.0,7.0,175,77,14431.0,1111167.0,1771669,0.63,4441,Input,1766778,427.170696,False,101.837766
3,0059420b,206,1556,63.347826,13.0,151.0,1.0,1.0,99,87,8828.0,768075.0,1404469,0.55,41395,Input,1362999,875.963368,False,121.848329
4,0075873a,252,2531,91.074074,23.0,517.0,0.0,0.0,72,88,11880.0,1045463.0,1662472,0.63,78470,Input,1583920,625.807981,False,123.943896


In [39]:
#Aleksandra
# Activity column group by activities
# Pauses (total number, per minute, average)

#Yaroslava:
# Number of characters (total in the essay, avg per minute)
# Overtime (yes or no?)
# Average action time

#Ehab
# Most frequent activity
# Average time between events

#Anastastija
# Move From explore futher..
# Number of sentences based on the number of full stops/excl. marks

# Train labels (essay grades)


In [40]:
df_labels = pd.read_csv("../data/train_scores.csv")
df_labels.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/train_scores.csv'

In [None]:
import matplotlib.pyplot as plt

plt.hist(df_labels['score'], bins = 12)
plt.show()

In [None]:
df_labels.groupby('score')['id'].count().plot.bar(width =0.8)