In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw event log: one row per logged key/mouse event, keyed by essay `id`.
# NOTE(review): absolute "/content/..." path (looks like a Colab working dir) — confirm/adjust before running elsewhere.
df_raw = pd.read_csv("/content/train_logs.csv")
# Preview the first five rows to check the columns loaded as expected.
df_raw.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31.0,Nonproduction,Leftclick,Leftclick,NoChange,0.0,0.0
1,001519c8,2,4558,4962,404.0,Nonproduction,Leftclick,Leftclick,NoChange,0.0,0.0
2,001519c8,3,106571,106571,0.0,Nonproduction,Shift,Shift,NoChange,0.0,0.0
3,001519c8,4,106686,106777,91.0,Input,q,q,q,1.0,1.0
4,001519c8,5,107196,107323,127.0,Input,q,q,q,2.0,1.0


**id** - The unique ID of the essay

**event_id** - The index of the event, ordered chronologically

**down_time** - The time of the down event in milliseconds

**up_time** - The time of the up event in milliseconds

**action_time** - The duration of the event (the difference between down_time and up_time)

**activity** - The category of activity which the event belongs to

- **Nonproduction** - The event does not alter the text in any way

- **Input** - The event adds text to the essay

- **Remove/Cut** - The event removes text from the essay

- **Paste** - The event changes the text through a paste input

- **Replace** - The event replaces a section of text with another string

- **Move From [x1, y1] To [x2, y2]** - The event moves a section of text spanning character index x1, y1 to a new location x2, y2

**down_event** - The name of the event when the key/mouse is pressed

**up_event** - The name of the event when the key/mouse is released

**text_change** - The text that changed as a result of the event (if any)

**cursor_position** - The character index of the text cursor after the event

**word_count** - The word count of the essay after the event

In [None]:
# Essay-level feature table: one row per distinct essay id, in order of first
# appearance in the log. Every feature computed below is merged in as a new column.

essay_ids = df_raw["id"].unique()
df = pd.DataFrame({'id': essay_ids})
df.head()

Unnamed: 0,id
0,001519c8
1,0022f953
2,0042269b
3,0059420b
4,0075873a


In [None]:
# Word count per essay, taken as the largest word_count logged for that essay
# (used as the essay's final word count).
df_word = df_raw.groupby("id")['word_count'].max()
df = df.merge(df_word, on="id", how="left")

In [None]:
# Minute index of every event: down_time (ms) floor-divided into whole minutes
MS_PER_MINUTE = 60000
df_raw['current_min'] = df_raw["down_time"].floordiv(MS_PER_MINUTE)

In [None]:
# Number of logged events per essay, stored as `event_count`
df_event = df_raw.groupby("id")['event_id'].count()

df = (
    df.merge(df_event, on="id", how="left")
      .rename(columns={"event_id": "event_count"})
)

In [None]:
# Time spent writing, in whole minutes: the largest minute index reached by any
# event of the essay, stored as `writing_time_min`
df_time = df_raw.groupby("id")['current_min'].max()
df = (
    df.merge(df_time, on="id", how="left")
      .rename(columns={"current_min": "writing_time_min"})
)

In [None]:
# Event rate: total events divided by the (whole-minute) writing time
df['events_per_min'] = df["event_count"].div(df['writing_time_min'])

In [None]:
# Number of events per essay that actually changed the text
# (every row whose text_change is anything other than the "NoChange" marker)
filter_text_change = df_raw["text_change"].ne("NoChange")
df_filtered = df_raw.loc[filter_text_change]
df_text_change = df_filtered.groupby("id")['text_change'].count()

df = df.merge(df_text_change, on="id", how="left")

In [None]:
# Give the merged per-essay count a descriptive column name
df = df.rename({"text_change": "text_change_count"}, axis="columns")

In [None]:
# Text-change rate: text-changing events divided by the (whole-minute) writing time
df['text_changes_per_min'] = df["text_change_count"].div(df['writing_time_min'])

In [None]:
# Number of sentences per essay, approximated by the number of events whose
# released key (up_event) is a full stop.
# A regex filter on text_change used to be assigned to `sentence_filter` first, but
# it was immediately overwritten by the filter below, so it never affected the
# result; it has been removed.
sentence_filter = df_raw[df_raw["up_event"] == '.']
# Counting any always-present column gives the number of matching rows per essay;
# the merged column is still called `text_change` here and is renamed in the next cell.
df_sentence_count = sentence_filter.groupby("id")['text_change'].count()

df = pd.merge(df, df_sentence_count, on="id", how="left")

In [None]:
# Give the merged full-stop count a descriptive column name
df = df.rename({"text_change": "sentence_count"}, axis="columns")

In [None]:
# Per-essay number of events for each listed activity category, one
# `<activity>_count` column per category. Essays with no event of a category get 0.
# Activities outside this list (e.g. "Move From ...") are not counted.
activities = ('Input', 'Remove/Cut', 'Paste', 'Replace', 'Nonproduction')
for activity in activities:
    column_name = f'{activity}_count'
    is_activity = df_raw['activity'] == activity
    df_activity_count = (
        df_raw[is_activity].groupby("id")['activity'].count().rename(column_name)
    )
    df = df.merge(df_activity_count, on="id", how="left")
    df[column_name] = df[column_name].fillna(0)

In [None]:
# Pauses
# IKI (inter-key interval): gap between this event's down_time and the previous
# event's up_time within the same essay. The first event of an essay has no
# predecessor (gap set to 0); negative gaps (overlapping events) are clipped to 0.
previous_up_time = df_raw.groupby('id')['up_time'].shift(1)
df_raw['IKI'] = (df_raw['down_time'] - previous_up_time).fillna(0).clip(lower=0)

# A gap counts as a pause when it is strictly longer than this many milliseconds
PAUSE_THRESHOLD = 2000

# Total number of pauses (over 2000 ms) during writing process
is_pause = df_raw['IKI'].gt(PAUSE_THRESHOLD)
pauses_count = df_raw[is_pause].groupby('id')['IKI'].count().rename('pauses_count')
df = df.merge(pauses_count, on="id", how="left")

In [None]:
# Mean length (ms, rounded to a whole number) of the gaps that exceed PAUSE_THRESHOLD
is_pause = df_raw['IKI'].gt(PAUSE_THRESHOLD)
pauses_mean_duration = (
    df_raw[is_pause]
    .groupby('id')['IKI']
    .mean()
    .round(0)
    .rename('pause_mean_duration')
)
df = df.merge(pauses_mean_duration, on="id", how="left")

In [None]:
# Total time (ms) spent in pauses: sum of all gaps that exceed PAUSE_THRESHOLD
is_pause = df_raw['IKI'].gt(PAUSE_THRESHOLD)
pauses_time_sum = df_raw[is_pause].groupby('id')['IKI'].sum().rename('pauses_time_sum')
df = df.merge(pauses_time_sum, on="id", how="left")

In [None]:
# Writing time in milliseconds: the latest up_time logged for the essay
writing_time_ms = df_raw.groupby('id')['up_time'].max().rename('writing_time_ms')
df = df.merge(writing_time_ms, on="id", how="left")

In [None]:
# Pauses per (whole) minute of writing, rounded to 2 decimals
df['pause_per_min'] = df["pauses_count"].div(df['writing_time_min']).round(2)

# Share of the writing time (ms) that was spent pausing, rounded to 2 decimals
df['pauses_share'] = df["pauses_time_sum"].div(df['writing_time_ms']).round(2)

In [None]:
# Display the essay-level feature table assembled so far.
df

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share
0,001519c8,256.0,2557,30,85.233333,2437,81.233333,21,2010,417,0.0,7.0,120,124,9537.0,1182600.0,1801969,4.13,0.66
1,0022f953,323.0,2454,29,84.62069,2200,75.862069,15,1938,260,1.0,1.0,254,80,14678.0,1174268.0,1788969,2.76,0.66
2,0042269b,404.0,4136,29,142.62069,3961,136.586207,21,3515,439,0.0,7.0,175,77,14431.0,1111167.0,1771669,2.66,0.63
3,0059420b,206.0,1556,23,67.652174,1457,63.347826,13,1304,151,1.0,1.0,99,87,8828.0,768075.0,1404469,3.78,0.55
4,0075873a,252.0,2531,27,93.740741,2459,91.074074,23,1942,517,0.0,0.0,72,88,11880.0,1045463.0,1662472,3.26,0.63
5,0081af50,275.0,2211,29,76.241379,2135,73.62069,11,1794,338,0.0,3.0,76,49,23577.0,1155282.0,1778916,1.69,0.65
6,0093f095,242.0,1765,29,60.862069,1731,59.689655,15,1583,148,0.0,0.0,34,39,35156.0,1371097.0,1768197,1.34,0.78
7,009e23ab,308.0,2353,29,81.137931,2198,75.793103,18,1975,222,0.0,1.0,155,56,20261.0,1134643.0,1799303,1.93,0.63
8,00e048f1,223.0,1585,29,54.655172,1544,53.241379,23,1426,118,0.0,0.0,41,95,13017.0,1236616.0,1797498,3.28,0.69
9,00e1f05a,739.0,7826,30,260.866667,7598,253.266667,29,6145,1446,0.0,7.0,228,88,5416.0,476630.0,1853697,2.93,0.26


In [None]:
# Per essay: first and last down_time (start / end of writing) and the activity
# label that occurs most often (top entry of value_counts)
df_freq = (
    df_raw
    .groupby("id")
    .agg({
        'down_time': ['min', 'max'],
        'activity': lambda acts: acts.value_counts().index[0],
    })
    .reset_index()
)

In [None]:
# Replace the two-level column labels produced by the multi-function agg with flat names.
# The assignment is positional, so the order must match the aggregation:
# id, min(down_time), max(down_time), most common activity.
df_freq.columns = ['id', 'start_time', 'end_time', 'most_frequent_activity']

In [None]:
# Display the per-essay start/end times and most frequent activity.
df_freq

Unnamed: 0,id,start_time,end_time,most_frequent_activity
0,001519c8,4526,1801877,Input
1,0022f953,30623,1788842,Input
2,0042269b,4441,1771219,Input
3,0059420b,41395,1404394,Input
4,0075873a,78470,1662390,Input
5,0081af50,42636,1778845,Input
6,0093f095,6572,1768065,Input
7,009e23ab,106549,1799221,Input
8,00e048f1,5273,1797305,Input
9,00e1f05a,10731,1853533,Input


In [None]:
# Attach start_time, end_time and most_frequent_activity to the feature table
df = df.merge(df_freq, on="id", how='left')
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share,start_time,end_time,most_frequent_activity
0,001519c8,256.0,2557,30,85.233333,2437,81.233333,21,2010,417,...,120,124,9537.0,1182600.0,1801969,4.13,0.66,4526,1801877,Input
1,0022f953,323.0,2454,29,84.62069,2200,75.862069,15,1938,260,...,254,80,14678.0,1174268.0,1788969,2.76,0.66,30623,1788842,Input
2,0042269b,404.0,4136,29,142.62069,3961,136.586207,21,3515,439,...,175,77,14431.0,1111167.0,1771669,2.66,0.63,4441,1771219,Input
3,0059420b,206.0,1556,23,67.652174,1457,63.347826,13,1304,151,...,99,87,8828.0,768075.0,1404469,3.78,0.55,41395,1404394,Input
4,0075873a,252.0,2531,27,93.740741,2459,91.074074,23,1942,517,...,72,88,11880.0,1045463.0,1662472,3.26,0.63,78470,1662390,Input


In [None]:
# Span (ms) between the first and last key press of the essay, and that span
# divided by the number of events, i.e. the average time between events
df['total_writing_time'] = df['end_time'].sub(df['start_time'])
df['Avg_time_between_events'] = df['total_writing_time'].div(df['event_count'])
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,pause_mean_duration,pauses_time_sum,writing_time_ms,pause_per_min,pauses_share,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events
0,001519c8,256.0,2557,30,85.233333,2437,81.233333,21,2010,417,...,9537.0,1182600.0,1801969,4.13,0.66,4526,1801877,Input,1797351,702.913962
1,0022f953,323.0,2454,29,84.62069,2200,75.862069,15,1938,260,...,14678.0,1174268.0,1788969,2.76,0.66,30623,1788842,Input,1758219,716.47066
2,0042269b,404.0,4136,29,142.62069,3961,136.586207,21,3515,439,...,14431.0,1111167.0,1771669,2.66,0.63,4441,1771219,Input,1766778,427.170696
3,0059420b,206.0,1556,23,67.652174,1457,63.347826,13,1304,151,...,8828.0,768075.0,1404469,3.78,0.55,41395,1404394,Input,1362999,875.963368
4,0075873a,252.0,2531,27,93.740741,2459,91.074074,23,1942,517,...,11880.0,1045463.0,1662472,3.26,0.63,78470,1662390,Input,1583920,625.807981


In [None]:
# Overtime flag: True when the writing session ran past the 30-minute mark.
# `writing_time_min` is a floored minute index (down_time // 60000), so a session whose
# last key press falls anywhere in 30:00-30:59 has the value 30. Comparing with `> 30`
# therefore only fired from minute 31 onwards and left the flag constant; `>= 30`
# marks every essay whose last key press is at or beyond 30:00.
df['overtime_writing'] = df['writing_time_min'] >= 30

In [None]:
# Average action time per essay: the summed duration (action_time) of all events
# divided by the number of events.
# Values stay in milliseconds because individual actions are very short.

total_action_time = df_raw.groupby('id')['action_time'].sum()

# The summed action_time is only needed as an intermediate, so it is dropped again
df = (
    df.merge(total_action_time, on='id', how='left')
      .assign(average_action_time=lambda d: d['action_time'] / d['event_count'])
      .drop(columns=['action_time'])
)

In [None]:
# Approximate total number of characters in the essay (spaces etc. included):

# every 'Input' or 'Paste' event is counted as one character added,
# every "Remove/Cut" event as one character removed.
# "Replace" is treated as a one-for-one substitution and left out of the count.

df_raw['chars_added'] = df_raw['activity'].isin(['Input', 'Paste']).astype('int64')
df_raw['chars_removed'] = df_raw['activity'].eq('Remove/Cut').astype('int64')

total_chars_added = df_raw.groupby('id')['chars_added'].sum()
total_chars_removed = df_raw.groupby('id')['chars_removed'].sum()

df = (
    df.merge(total_chars_added, on='id', how='left')
      .merge(total_chars_removed, on='id', how='left')
)

# Net characters = added - removed; the two helper columns are not kept as features
df['total_characters'] = df['chars_added'] - df['chars_removed']
df = df.drop(columns=['chars_added', 'chars_removed'])

In [None]:
# Average characters per minute: total number of characters divided by the
# writing time in (whole) minutes.
# Kept as float to preserve precision; cast to int downstream if needed.
df['avg_characters_per_min'] = df['total_characters'].div(df['writing_time_min'])
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,pauses_share,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min
0,001519c8,256.0,2557,30,85.233333,2437,81.233333,21,2010,417,...,0.66,4526,1801877,Input,1797351,702.913962,False,116.246774,1593,53.1
1,0022f953,323.0,2454,29,84.62069,2200,75.862069,15,1938,260,...,0.66,30623,1788842,Input,1758219,716.47066,False,112.221271,1679,57.896552
2,0042269b,404.0,4136,29,142.62069,3961,136.586207,21,3515,439,...,0.63,4441,1771219,Input,1766778,427.170696,False,101.837766,3076,106.068966
3,0059420b,206.0,1556,23,67.652174,1457,63.347826,13,1304,151,...,0.55,41395,1404394,Input,1362999,875.963368,False,121.848329,1154,50.173913
4,0075873a,252.0,2531,27,93.740741,2459,91.074074,23,1942,517,...,0.63,78470,1662390,Input,1583920,625.807981,False,123.943896,1425,52.777778


# **Multicollinearity**

In [None]:
# Load the essay scores and attach them to the feature table by essay id
df_labels = pd.read_csv("/content/train_scores.csv")
df = df.merge(df_labels, on='id', how='left')


In [None]:
# Check that the `score` column was merged in.
df.head()

Unnamed: 0,id,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,...,start_time,end_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min,score
0,001519c8,256.0,2557,30,85.233333,2437,81.233333,21,2010,417,...,4526,1801877,Input,1797351,702.913962,False,116.246774,1593,53.1,3.5
1,0022f953,323.0,2454,29,84.62069,2200,75.862069,15,1938,260,...,30623,1788842,Input,1758219,716.47066,False,112.221271,1679,57.896552,3.5
2,0042269b,404.0,4136,29,142.62069,3961,136.586207,21,3515,439,...,4441,1771219,Input,1766778,427.170696,False,101.837766,3076,106.068966,6.0
3,0059420b,206.0,1556,23,67.652174,1457,63.347826,13,1304,151,...,41395,1404394,Input,1362999,875.963368,False,121.848329,1154,50.173913,2.0
4,0075873a,252.0,2531,27,93.740741,2459,91.074074,23,1942,517,...,78470,1662390,Input,1583920,625.807981,False,123.943896,1425,52.777778,4.0




In [None]:
# Pearson correlation matrix of the numeric (and boolean) feature columns.
# `df` also holds string columns (`id`, `most_frequent_activity`); without
# numeric_only=True pandas warns on older versions and raises on pandas >= 2.0.

matrix = df.corr(numeric_only=True)
matrix


  matrix = df.corr()


Unnamed: 0,word_count,event_count,writing_time_min,events_per_min,text_change_count,text_changes_per_min,sentence_count,Input_count,Remove/Cut_count,Paste_count,...,pauses_share,start_time,end_time,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,total_characters,avg_characters_per_min,score
word_count,1.0,0.894081,0.164396,0.892945,0.869069,0.862468,0.317088,0.929413,0.483971,-0.256089,...,-0.561314,-0.399665,0.1739,0.236583,-0.802892,,-0.315138,0.974479,0.950923,0.503667
event_count,0.894081,1.0,0.264917,0.974457,0.990029,0.958161,0.483413,0.987472,0.793825,-0.135049,...,-0.605653,-0.427107,0.28277,0.343631,-0.851215,,-0.182103,0.934385,0.874521,0.449735
writing_time_min,0.164396,0.264917,1.0,0.051664,0.241574,0.022116,0.192683,0.237625,0.202819,-0.207168,...,0.047853,-0.28135,0.994983,0.983962,0.014419,,-0.395341,0.220486,-0.046724,0.151154
events_per_min,0.892945,0.974457,0.051664,1.0,0.968602,0.989302,0.473635,0.968955,0.767404,-0.115913,...,-0.632582,-0.371209,0.070994,0.134936,-0.896668,,-0.098438,0.921041,0.924618,0.445174
text_change_count,0.869069,0.990029,0.241574,0.968602,1.0,0.972923,0.444017,0.987038,0.837002,-0.12554,...,-0.595896,-0.427028,0.256696,0.31919,-0.84457,,-0.155621,0.91825,0.86396,0.491823
text_changes_per_min,0.862468,0.958161,0.022116,0.989302,0.972923,1.0,0.43123,0.962751,0.806517,-0.107226,...,-0.620475,-0.366214,0.038583,0.103653,-0.886391,,-0.068135,0.899239,0.910452,0.486107
sentence_count,0.317088,0.483413,0.192683,0.473635,0.444017,0.43123,1.0,0.423846,0.421181,0.077316,...,-0.476449,-0.101822,0.212675,0.218003,-0.450397,,-0.048604,0.372065,0.34199,0.126413
Input_count,0.929413,0.987472,0.237625,0.968955,0.987038,0.962751,0.423846,1.0,0.738345,-0.186745,...,-0.587845,-0.438259,0.249768,0.31477,-0.850079,,-0.223007,0.969899,0.920358,0.537189
Remove/Cut_count,0.483971,0.793825,0.202819,0.767404,0.837002,0.806517,0.421181,0.738345,1.0,0.107956,...,-0.502886,-0.297945,0.225323,0.266006,-0.650569,,0.104839,0.551892,0.493214,0.233326
Paste_count,-0.256089,-0.135049,-0.207168,-0.115913,-0.12554,-0.107226,0.077316,-0.186745,0.107956,1.0,...,0.076094,-0.144183,-0.210752,-0.170853,0.010059,,0.437146,-0.2694,-0.255446,-0.209807



Multicollinearity is indicated when some pairs of features correlate more strongly with each other than either of them does with the dependent variable (score). For example, the matrix shows a very strong correlation between the features total_writing_time and writing_time_min (≈0.98),
which is far greater than the correlation of either feature with score. We can therefore conclude that these two features are too strongly related to each other and should not both appear in the same model.

To simplify the task, we will choose a threshold of 0.9 and treat any correlation above it as very strong.

We will start with writing down pairs of features that are highly correlated.

#total_characters - word_count

events_per_min - event_count

text_change_count - event_count

Input_count - event_count

#end_time - writing_time_min

#writing_time_ms - writing_time_min

text_changes_per_min - events_per_min

total_characters - text_change_count

text_changes_per_min - text_change_count

Input_count - text_change_count

text_changes_per_min - Input_count

text_changes_per_min - avg_characters_per_min

#Input_count - word_count

Input_count - total_characters

#writing_time_ms - end_time

#total_characters - avg_characters_per_min

#pause_per_min - pauses_count


In [None]:
# Independent (deep) copy of the feature table for modelling, so that dropping
# columns below does not touch `df`. DataFrame.copy() deep-copies by default.
df_model = df.copy()


In [None]:
# Drop one feature from every highly correlated pair listed above:
# Input_count, writing_time_min, end_time, avg_characters_per_min, pause_per_min,
# events_per_min, total_characters, text_change_count.

columns_to_drop = ['Input_count', 'writing_time_min', 'end_time', 'avg_characters_per_min',  'pause_per_min', 'events_per_min', 'total_characters', 'text_change_count']

# Derive df_model from `df` without mutating anything in place: the cell can be
# re-run safely (an in-place drop raises KeyError the second time) and `df` keeps
# all of its columns.
df_model = df.drop(columns=columns_to_drop)

# numeric_only=True skips the string columns (`id`, `most_frequent_activity`),
# which make corr() fail on pandas >= 2.0.
df_model.corr(numeric_only=True)



  df_model.corr()


Unnamed: 0,word_count,event_count,text_changes_per_min,sentence_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,pause_mean_duration,pauses_time_sum,writing_time_ms,pauses_share,start_time,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,score
word_count,1.0,0.894081,0.862468,0.317088,0.483971,-0.256089,0.371655,0.44825,-0.088885,-0.183476,-0.444764,0.1738,-0.561314,-0.399665,0.236583,-0.802892,,-0.315138,0.503667
event_count,0.894081,1.0,0.958161,0.483413,0.793825,-0.135049,0.544695,0.383459,-0.072749,-0.205628,-0.453384,0.282755,-0.605653,-0.427107,0.343631,-0.851215,,-0.182103,0.449735
text_changes_per_min,0.862468,0.958161,1.0,0.43123,0.806517,-0.107226,0.514918,0.207788,-0.099412,-0.263138,-0.555919,0.038562,-0.620475,-0.366214,0.103653,-0.886391,,-0.068135,0.486107
sentence_count,0.317088,0.483413,0.43123,1.0,0.421181,0.077316,0.217099,0.411983,0.361234,-0.402452,-0.351691,0.212887,-0.476449,-0.101822,0.218003,-0.450397,,-0.048604,0.126413
Remove/Cut_count,0.483971,0.793825,0.806517,0.421181,1.0,0.107956,0.55596,-0.030475,0.006713,-0.226553,-0.39006,0.225277,-0.502886,-0.297945,0.266006,-0.650569,,0.104839,0.233326
Paste_count,-0.256089,-0.135049,-0.107226,0.077316,0.107956,1.0,-0.003602,-0.105182,0.249839,-0.219881,-0.02286,-0.210706,0.076094,-0.144183,-0.170853,0.010059,,0.437146,-0.209807
Replace_count,0.371655,0.544695,0.514918,0.217099,0.55596,-0.003602,1.0,-0.088043,0.054943,-0.088719,0.094317,0.359542,-0.045663,-0.399632,0.410487,-0.479666,,0.075032,0.456675
Nonproduction_count,0.44825,0.383459,0.207788,0.411983,-0.030475,-0.105182,-0.088043,1.0,-0.116491,0.062121,-0.131502,0.261667,-0.256432,-0.136632,0.270265,-0.314074,,-0.231835,-0.132911
pauses_count,-0.088885,-0.072749,-0.099412,0.361234,0.006713,0.249839,0.054943,-0.116491,1.0,-0.800815,-0.274899,0.102774,-0.339452,-0.060669,0.107364,0.020973,,0.063546,-0.047066
pause_mean_duration,-0.183476,-0.205628,-0.263138,-0.402452,-0.226553,-0.219881,-0.088719,0.062121,-0.800815,1.0,0.676482,0.194204,0.659159,-0.038102,0.189035,0.319414,,-0.132422,0.027737


In [None]:
# Preview the reduced feature table that is used for modelling.
df_model.head()

Unnamed: 0,id,word_count,event_count,text_changes_per_min,sentence_count,Remove/Cut_count,Paste_count,Replace_count,Nonproduction_count,pauses_count,...,pauses_time_sum,writing_time_ms,pauses_share,start_time,most_frequent_activity,total_writing_time,Avg_time_between_events,overtime_writing,average_action_time,score
0,001519c8,256.0,2557,81.233333,21,417,0.0,7.0,120,124,...,1182600.0,1801969,0.66,4526,Input,1797351,702.913962,False,116.246774,3.5
1,0022f953,323.0,2454,75.862069,15,260,1.0,1.0,254,80,...,1174268.0,1788969,0.66,30623,Input,1758219,716.47066,False,112.221271,3.5
2,0042269b,404.0,4136,136.586207,21,439,0.0,7.0,175,77,...,1111167.0,1771669,0.63,4441,Input,1766778,427.170696,False,101.837766,6.0
3,0059420b,206.0,1556,63.347826,13,151,1.0,1.0,99,87,...,768075.0,1404469,0.55,41395,Input,1362999,875.963368,False,121.848329,2.0
4,0075873a,252.0,2531,91.074074,23,517,0.0,0.0,72,88,...,1045463.0,1662472,0.63,78470,Input,1583920,625.807981,False,123.943896,4.0


In [None]:
from sklearn.model_selection import train_test_split

# Only essays that have a score can be used for supervised training.
labelled = df_model.dropna(subset=['score'])

# Take the target from the same frame as the features so that rows are guaranteed
# to be aligned and equal in number (df_labels may differ in order or length).
y = labelled['score']

# Features: drop the identifier, the text-valued column and the target itself.
# Leaving `score` in X would leak the answer into the model.
X = labelled.drop(columns=['id', 'most_frequent_activity', 'score'])
X = X.fillna(0)

# Stratified splitting needs at least two essays per score value; fall back to a
# plain random split when the data cannot satisfy that.
stratify = y if y.value_counts().min() >= 2 else None

X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=0, test_size=0.20, stratify=stratify
)



ValueError: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Fixed seed for reproducibility. np.random.seed() returns None, so its return value
# must not be used as `random_state`; pass the integer seed to the estimator instead.
random_state = 0
np.random.seed(random_state)

# Shallow random-forest regressor as a baseline model for the essay score
RFR = RandomForestRegressor(max_depth=2, random_state=random_state)

RFR.fit(X_train, y_train)

# Predict on the DataFrame itself: the model was fitted with feature names, and
# passing a bare NumPy array triggers a "does not have valid feature names" warning.
predictions_RFR = RFR.predict(X_val)

# RMSE computed as sqrt(MSE); the `squared=False` keyword has been removed from
# recent scikit-learn releases, while this form works on every version.
rmse_RFR = np.sqrt(mean_squared_error(y_val, predictions_RFR))
print(rmse_RFR)

In [None]:
import statsmodels.api as sm

# Ordinary least squares: score explained by word count and total writing time.
predictors = ['word_count', 'total_writing_time']

# Independent variables, with a constant column added so the model fits an intercept
X = sm.add_constant(df[predictors])

# Dependent variable
y = df['score']

# Fit the model and print the regression summary
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
#Aleksandra
# Activity column group by activities
# Pauses (total number, per minute, average)

#Yaroslava:
# Number of characters (total in the essay, avg per minute)
# Overtime (yes or no?)
# Average action time

#Ehab
# Most frequent activity
# Average time between events

#Anastastija
# Move From explore further..
# Number of sentences based on the number of full stops/excl. marks

# Train labels (essay grades)


In [None]:
# Load the essay scores (rebinds `df_labels`) and preview the first rows.
# NOTE(review): this relative "../data/..." path differs from the "/content/..." path
# used when the scores were loaded earlier in the notebook — confirm which is current.
df_labels = pd.read_csv("../data/train_scores.csv")
df_labels.head()

In [None]:
import matplotlib.pyplot as plt

# Distribution of essay scores. A title and axis labels are set so the figure can be
# read on its own when the notebook is skimmed.
# NOTE(review): bins=12 presumably matches the number of distinct score values — confirm.
fig, ax = plt.subplots()
ax.hist(df_labels['score'], bins=12)
ax.set(title='Distribution of essay scores', xlabel='score', ylabel='number of essays')
plt.show()

In [None]:
# Bar chart of the number of essays for each score value
essays_per_score = df_labels.groupby('score')['id'].count()
essays_per_score.plot.bar(width=0.8)