In [None]:
# Mount Google Drive at /content/drive; the dataset paths read below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
import re
warnings.filterwarnings("ignore")

In [None]:
# Load the training and test keystroke logs and the training scores.
# The dataset directory is named once so that relocating the data is a one-line change.
DATA_DIR = '/content/drive/Shareddrives/CSCI567/CSCI567/writing_quality'
train_logs = pd.read_csv(f'{DATA_DIR}/train_logs.csv')
test_logs = pd.read_csv(f'{DATA_DIR}/test_logs.csv')
train_scores = pd.read_csv(f'{DATA_DIR}/train_scores.csv')

In [None]:
#num_backspaces: number of 'Backspace' key-down events per essay.
# Essays without any Backspace event get no row in this frame.
is_backspace = train_logs['down_event'] == 'Backspace'
dhruv = (
    train_logs[is_backspace]
    .groupby('id')
    .size()
    .reset_index(name='num_backspaces')
)

(2465, 2)

In [None]:
#duration_backspaces: summed action_time of 'Backspace' events per essay.
remove_cut_data = train_logs.loc[train_logs['down_event'] == 'Backspace']
# One-column frame indexed by essay id; the merge below joins on that index level.
result = (
    remove_cut_data.groupby('id')['action_time']
    .sum()
    .to_frame(name='duration_backspaces')
)
dhruv = dhruv.merge(result, on='id', how="outer")

In [None]:
#num_cut/copy/paste: number of "Remove/Cut" plus "Paste" events per essay.
# Label every event; "Remove/Cut" takes precedence over "Paste", anything else is "Other".
# The label column is written onto train_logs itself.
activity = train_logs['activity']
train_logs['activity_type'] = np.select(
    [
        activity.str.contains("Remove/Cut", regex=False, na=False).to_numpy(),
        activity.str.contains("Paste", regex=False, na=False).to_numpy(),
    ],
    ["Remove/Cut", "Paste"],
    default="Other",
)

# Count labels per essay. reindex guarantees both columns exist even when a label never
# occurs in the log, which would otherwise raise a KeyError on the column lookup below.
type_counts = (
    train_logs.groupby('id')['activity_type']
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=["Remove/Cut", "Paste"], fill_value=0)
)
activity_count = (
    (type_counts["Remove/Cut"] + type_counts["Paste"])
    .astype(int)
    .to_frame(name='num_cut/copy/paste')
)
dhruv = pd.merge(dhruv, activity_count, on='id', how="outer")

In [None]:
#Discarded_text: summed length of the text taken from "Remove/Cut" and "Replace" events, per essay.
removal_mask = train_logs["activity"].isin(["Remove/Cut", "Replace"])
filtered_df = train_logs[removal_mask].assign(
    # Text before ' =>'; a string without that separator is kept whole.
    left_text=lambda events: events['text_change'].str.split(' =>').str[0],
    discarded_text=lambda events: events['left_text'].apply(len),
)
char_count_by_id = filtered_df.groupby('id', as_index=False)['discarded_text'].sum()

dhruv = pd.merge(dhruv, char_count_by_id, on='id', how="outer")

In [None]:
#D/I Ratio: per essay, "deletion" character count divided by the number of 'Input' events.
# NOTE(review): 'Paste' rows are included in the "deletion" count here, whereas the
# Discarded_text feature only uses 'Remove/Cut' and 'Replace' - confirm this is intended.
# .copy() so the new columns are added to an independent frame, not a slice of train_logs.
filtered_data = train_logs[train_logs['activity'].isin(['Remove/Cut', 'Paste', 'Replace'])].copy()

# Split on ' =>' and keep the left part (the whole string when there is no separator).
filtered_data['left_text'] = filtered_data['text_change'].str.split(' =>').str[0]
filtered_data['deletion'] = filtered_data['left_text'].apply(len)

char_count_by_id = filtered_data.groupby('id')['deletion'].sum().reset_index()

input_activity_count = train_logs[train_logs['activity'] == 'Input'].groupby('id').size().reset_index(name='input')

# Left join: essays with no 'Input' events get an input count of 0.
result = pd.merge(char_count_by_id, input_activity_count, on='id', how='left')
result['input'] = result['input'].fillna(0)

# A zero denominator would produce inf, which a later fillna(0) on the feature table does
# not remove. Dividing by NaN instead yields NaN, which that fillna does handle.
result['D/I Ratio'] = result['deletion'] / result['input'].replace(0, np.nan)

result = result.drop(columns=["deletion", "input"])

dhruv = pd.merge(dhruv, result, on='id', how="outer")

In [None]:
#Proportion of deletions: "deletion" characters per unit of summed action_time, times 100.
revision_mask = train_logs['activity'].isin(['Remove/Cut', 'Paste', 'Replace'])
filtered_data = train_logs[revision_mask].assign(
    # Text before ' =>'; a string without that separator is kept whole.
    left_text=lambda events: events['text_change'].str.split(' =>').str[0],
    deletion=lambda events: events['left_text'].apply(len),
)

char_count_by_id = filtered_data.groupby('id', as_index=False)['deletion'].sum()
total_writing_time = train_logs.groupby('id', as_index=False)['action_time'].sum()

result = char_count_by_id.merge(total_writing_time, on='id', how='left').fillna(0)
result['Proportion_of_Deletions'] = result['deletion'] / result['action_time'] * 100
result = result.drop(columns=["deletion", "action_time"])
dhruv = pd.merge(dhruv, result, on='id', how="outer")

In [None]:
#No of distant revisions: events whose cursor position differs from the previous event's.
filtered_data = train_logs  # alias: the helper column below is written onto train_logs

# Diff within each essay. An ungrouped .diff() would compare the first event of an essay
# with the last event of the essay stored just before it in the log.
filtered_data['cursor_position_diff'] = (
    filtered_data.groupby('id')['cursor_position'].diff().abs().fillna(0)
)

condition = (filtered_data['cursor_position_diff'] > 0)

# Count the number of rows that satisfy the condition
count_per_id = filtered_data[condition].groupby('id').size().reset_index(name='no_distant_revision')

dhruv = pd.merge(dhruv, count_per_id, on='id', how="outer")

In [None]:
#No of immediate revisions: events whose cursor position equals the previous event's
# (the first event of each essay has no predecessor and is counted here as well).
filtered_data = train_logs  # alias: the helper column below is written onto train_logs

# Diff within each essay. An ungrouped .diff() would compare the first event of an essay
# with the last event of the essay stored just before it in the log.
filtered_data['cursor_position_diff'] = (
    filtered_data.groupby('id')['cursor_position'].diff().abs().fillna(0)
)

# Rows where the cursor did not move relative to the previous event of the same essay.
condition = (filtered_data['cursor_position_diff'] == 0)

count_per_id = filtered_data[condition].groupby('id').size().reset_index(name='no_immediate_revision')

dhruv = pd.merge(dhruv, count_per_id, on='id', how="outer")

In [None]:
def count_major_edits(group):
    """Count "major edits" in one essay's event log and the time spent on them.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single essay, in order, with the columns ``activity``,
        ``text_change`` and ``action_time``.

    Returns
    -------
    tuple
        ``(major_edit_count, total_time)`` where

        * every run of three 'Remove/Cut' events whose ``text_change`` is ``'q'``
          counts as one major edit and contributes the ``action_time`` of those events;
        * every 'Replace' event contributes one major edit per three non-whitespace
          characters on the left of ``' =>'`` and always contributes its ``action_time``.
    """
    major_edits = 0
    streak_len = 0        # consecutive single-'q' removals seen so far
    streak_times = []     # action_time of the removals in the current streak
    edit_time = 0         # time accumulated since the last non-editing event
    total_time = 0

    events = zip(group['activity'], group['text_change'], group['action_time'])
    for activity, text_change, action_time in events:
        if activity == 'Remove/Cut':
            if text_change == 'q':
                streak_len += 1
                streak_times.append(action_time)
            if streak_len > 2:
                major_edits += 1
                streak_len = 0
                edit_time += sum(streak_times)
                streak_times.clear()

        elif activity == 'Replace':
            replaced_words = text_change.split(' =>')[0].split()
            removed_chars = sum(len(word) for word in replaced_words)
            major_edits += removed_chars // 3
            edit_time += action_time
        else:
            # Any other activity ends the current editing episode.
            total_time += edit_time
            edit_time = 0
            streak_len = 0
            # Drop the times of an unfinished streak too; otherwise they would be
            # added to the next major edit even though the streak was abandoned.
            streak_times.clear()
    total_time += edit_time

    return major_edits, total_time


In [None]:
# One (major_edits_count, multi_word_deletion_time) tuple per essay, from count_major_edits.
edit_stats = train_logs.groupby('id').apply(count_major_edits)

# Spread the tuples over two named columns and bring the essay id back as a column.
major_edits_count = pd.DataFrame(
    edit_stats.tolist(),
    index=edit_stats.index,
    columns=['major_edits_count', 'multi_word_deletion_time'],
).reset_index()

# Highest event_id logged per essay, used as the denominator below.
max_event_id_per_essay = train_logs.groupby('id')['event_id'].max().reset_index(name='max_event_id')

result = major_edits_count.merge(max_event_id_per_essay, on='id')

# Frequency of major edits: major edits divided by the essay's highest event_id.
result['major_edits_freq'] = result['major_edits_count'] / result['max_event_id']
result = result.drop(columns='max_event_id')
dhruv = pd.merge(dhruv, result, on='id', how="outer")

In [None]:
# Missing-value check on the merged feature table: first "is there any NaN?", then the
# total number of NaN cells. A notebook cell only displays its last expression, so the
# boolean on the first line is computed but not shown.
dhruv.isnull().values.any()
dhruv.isnull().sum().sum()

In [None]:
# Replace every missing feature value with 0 (fillna does not touch +/-inf values).
dhruv=dhruv.fillna(0)

In [None]:
# Save the feature table built so far, without the row index.
dhruv.to_csv("dhruv_anuranjan_df.csv", index=False)

In [None]:
#-------KRITI----------
# Work only with the four activity types listed here.
text_activities = ['Remove/Cut', 'Input', 'Paste', 'Replace']
df_filtered = train_logs[train_logs['activity'].isin(text_activities)]

# 'q' characters in the text of 'Remove/Cut' events, summed per essay.
remove_cut_df = df_filtered[df_filtered['activity'] == 'Remove/Cut'].assign(
    q_removed_count=lambda events: events['text_change'].str.count('q')
)
total_qs_removed_cut = remove_cut_df.groupby('id')['q_removed_count'].sum()

# 'q' characters in the text before ' =>' of 'Replace' events, summed per essay.
replace_df = df_filtered[df_filtered['activity'] == 'Replace'].assign(
    text_before_replace=lambda events: events['text_change'].str.split(' =>').str[0],
    q_removed_count=lambda events: events['text_before_replace'].str.count('q'),
)
total_qs_removed_replace = replace_df.groupby('id')['q_removed_count'].sum()

# Add both sources; an essay missing from one of them contributes 0 on that side.
total_qs_removed = total_qs_removed_cut.add(total_qs_removed_replace, fill_value=0)
total_qs_removed_df = total_qs_removed.reset_index(name='total_q_removed')

# 'q' characters in the text of 'Paste' events, summed per essay.
paste_df = df_filtered[df_filtered['activity'] == 'Paste'].assign(
    q_added_count=lambda events: events['text_change'].str.count('q')
)
total_qs_added_paste = paste_df.groupby('id')['q_added_count'].sum()

# Alphanumeric characters in the text after '=>' of 'Replace' events, summed per essay
# (an empty string is used when there is no text after the separator).
replace_df = df_filtered[df_filtered['activity'] == 'Replace'].assign(
    text_after_replace=lambda events: events['text_change'].str.split('=>').str[1].fillna(''),
    alphanumeric_added_count=lambda events: events['text_after_replace'].str.count('[a-zA-Z0-9]'),
)
total_alphanumeric_added_replace = replace_df.groupby('id')['alphanumeric_added_count'].sum()

# 'Input' events whose text contains a 'q' count once each.
is_q_input = (df_filtered['activity'] == 'Input') & (df_filtered['text_change'].str.contains('q'))
input_df = df_filtered[is_q_input].assign(q_added_count=1)
total_qs_added_input = input_df.groupby('id').size()

# Combine the three sources; an essay missing from a source contributes 0 for it.
total_qs_added = (
    total_qs_added_paste
    .add(total_alphanumeric_added_replace, fill_value=0)
    .add(total_qs_added_input, fill_value=0)
)
total_qs_added_df = total_qs_added.reset_index(name='total_q_added')

# Net characters: added minus removed.
net_qs_added = total_qs_added.subtract(total_qs_removed, fill_value=0)
net_qs_added_df = net_qs_added.reset_index(name='num_chars')
net_qs_added_df.head()

# Order the events by essay and, within an essay, by event_id.
df_sorted = df_filtered.sort_values(by=['id', 'event_id'])

# word_count of the last event of each essay in that order.
final_word_counts = df_sorted.groupby('id')['word_count'].last()
final_word_counts_df = final_word_counts.reset_index(name='final_word_count')

# Net characters divided by the final word count, aligned on essay id.
mean_word_length = net_qs_added.div(final_word_counts)
mean_word_length_df = mean_word_length.reset_index(name='mean_word_length')

# Events that are not 'Nonproduction', and how many of them each essay has.
production_df = df_sorted[df_sorted['activity'] != 'Nonproduction']
num_keystrokes = production_df.groupby('id').size().reset_index(name='num_keystrokes')

# A gap larger than this between one event's up_time and the next event's down_time
# (same units as those columns) ends the current burst.
BURST_PAUSE_THRESHOLD = 2000

burst_data = []

for essay_id, group in production_df.groupby('id'):
    # Pull the two time columns out once; row-by-row .iloc lookups inside the loop are
    # far slower and give the same numbers.
    down_times = group['down_time'].to_numpy()
    up_times = group['up_time'].to_numpy()

    # pauses[k] is the gap between event k (key up) and event k + 1 (key down).
    pauses = down_times[1:] - up_times[:-1]

    # Positions of the first event of every burst; position 0 always starts one.
    burst_starts = np.concatenate(([0], np.flatnonzero(pauses > BURST_PAUSE_THRESHOLD) + 1))
    # Exclusive end of every burst; the last burst runs to the end of the essay.
    burst_ends = np.concatenate((burst_starts[1:], [len(group)]))

    for burst_num, (start, end) in enumerate(zip(burst_starts, burst_ends)):
        burst_data.append({
            'id': essay_id,
            'burst_num': burst_num,
            'burst_len': int(end - start),
            'burst_duration': up_times[end - 1] - down_times[start]
        })

burst_df = pd.DataFrame(burst_data)

# num_bursts: number of distinct burst numbers per essay.
num_bursts = burst_df.groupby('id')['burst_num'].nunique().reset_index(name='num_bursts')
num_bursts.head()

# Mean and standard deviation of burst length and duration, each indexed by essay id.
bursts_by_essay = burst_df.groupby('id')[['burst_len', 'burst_duration']]
mean_burst_metrics = bursts_by_essay.mean().add_prefix('mean_')
std_burst_metrics = bursts_by_essay.std().add_prefix('std_')

burst_metrics = pd.merge(mean_burst_metrics, std_burst_metrics, on='id', how="outer")

# Outer-join the remaining feature tables onto net_qs_added_df, one after another, on essay id.
combined_df_kriti = net_qs_added_df
for feature_table in (mean_word_length_df, num_keystrokes, num_bursts, burst_metrics):
    combined_df_kriti = pd.merge(combined_df_kriti, feature_table, on='id', how="outer")

In [None]:
# Missing-value check: "is there any NaN?" followed by the total number of NaN cells.
# Only the last expression of a cell is displayed, so the first result is not shown.
combined_df_kriti.isnull().values.any()
combined_df_kriti.isnull().sum().sum()

In [None]:
# Save this feature table without the row index.
combined_df_kriti.to_csv("kriti_df.csv", index=False)

In [None]:
#Pratyush
#keystrokes_per_essay: the highest event_id logged for each essay.
keystrokes_per_essay = train_logs.groupby('id')['event_id'].max()
keystrokes_per_essay_df = keystrokes_per_essay.reset_index(name='keystrokes_per_essay')

#total time per essay: the largest up_time logged for each essay.
total_time_per_essay = train_logs.groupby('id')['up_time'].max()

#total pause time per essay
# Pause before an event = |its down_time - up_time of the previous event of the same essay|.
# Shifting within each essay leaves the first event of every essay without a predecessor,
# so it gets a pause of 0 - including the very first essay in the log, which an ungrouped
# shift would instead charge with its full first down_time.
previous_up_time = train_logs.groupby('id')['up_time'].shift()
pause_time_per_row = (train_logs['down_time'] - previous_up_time).abs().fillna(0)

total_pause_time_per_essay = pause_time_per_row.groupby(train_logs['id']).sum()

#time_spent_typing_per_essay
time_spent_typing_per_essay = total_time_per_essay - total_pause_time_per_essay

parent_df_time = total_time_per_essay.reset_index(name='total_time_per_essay')
parent_df_pause_time = total_pause_time_per_essay.reset_index(name='total_pause_time_per_essay')

time_df = pd.merge(parent_df_time, parent_df_pause_time, on='id')

# Calculate time_spent_typing_per_essay
time_df['time_spent_typing_per_essay'] = time_df['total_time_per_essay'] - time_df['total_pause_time_per_essay']

# Calculate Rate of Keystrokes for each essay.
# The keystroke count is looked up by essay id instead of relying on both frames
# happening to list the essays in the same row order.
keystrokes_by_id = keystrokes_per_essay_df.set_index('id')['keystrokes_per_essay']
time_df['rate_of_keystrokes'] = time_df['id'].map(keystrokes_by_id) / time_df['total_time_per_essay']

#non production time column: summed action_time of 'Nonproduction' events per essay.
nonproduction_time_per_essay = train_logs.loc[train_logs['activity'] == 'Nonproduction'].groupby('id')['action_time'].sum()
nonproduction_time_df = nonproduction_time_per_essay.reset_index(name='nonproduction_time_per_essay')

#merging the previous column
# Left join: an inner join would silently drop every essay that has no 'Nonproduction'
# event. Such essays have no nonproduction time, hence 0.
time_df = pd.merge(time_df, nonproduction_time_df, on='id', how='left')
time_df['nonproduction_time_per_essay'] = time_df['nonproduction_time_per_essay'].fillna(0)

#cursor_move_count: per essay, the number of events whose cursor_position differs from
# that of the previous event of the same essay.

# Change in cursor position relative to the previous event of the same essay
# (NaN for the first event of an essay, which therefore never counts as a move).
cursor_step = train_logs.groupby('id')['cursor_position'].diff()
cursor_move_count = cursor_step.fillna(0).ne(0).groupby(train_logs['id']).sum()

# One row per essay, built in a single step instead of one tiny DataFrame per essay.
parent_df = cursor_move_count.reset_index(name='cursor_move_count')

#cursor_move_distance: per essay, the summed absolute change in cursor_position between
# consecutive events of the same essay.

# Absolute change relative to the previous event of the same essay; the first event of
# an essay has no predecessor (NaN) and is skipped by the sum.
cursor_jump = train_logs.groupby('id')['cursor_position'].diff().abs()
cursor_move_distance = (
    cursor_jump.groupby(train_logs['id']).sum()
    # diff() yields floats; go back to the dtype of the cursor_position column.
    .astype(train_logs['cursor_position'].dtype)
)

# One row per essay, built in a single step instead of one tiny DataFrame per essay.
parent_df_1 = cursor_move_distance.reset_index(name='cursor_move_distance')

#merging cursor move, dist and mean dist
parent_df_2 = parent_df.merge(parent_df_1, on='id')

# Mean distance per cursor move: total distance divided by the number of moves.
parent_df_2 = parent_df_2.assign(
    cursor_move_mean_dist=parent_df_2['cursor_move_distance'] / parent_df_2['cursor_move_count']
)

#merging all columns until now
time_df = time_df.merge(parent_df_2, on='id')

#cursor move from features
# The five plain activity labels; events with any other activity value are counted below.
allowed_activities = ['Input', 'Nonproduction', 'Remove/Cut', 'Replace', 'Paste']
# Per essay, the number of events whose activity is not one of those labels.
is_other_activity = ~train_logs['activity'].isin(allowed_activities)
cursor_move_count_new = (
    train_logs[is_other_activity]
    .groupby('id')['activity']
    .count()
    .fillna(0)
)

#cursor distance (y-diff)

# Helper for activities written as 'Move From [x1, y1] To [x2, y2]'.
def extract_y_difference(activity):
    """Return ``abs(y2 - y1)`` for the first 'Move From [x1, y1] To [x2, y2]' in ``activity``.

    Returns 0 when the string contains no such pattern.
    """
    found = re.search(r'Move From \[\d+, (\d+)\] To \[\d+, (\d+)\]', activity)
    if found is None:
        return 0
    first_y, second_y = (int(value) for value in found.groups())
    return abs(second_y - first_y)

# y difference of every 'Move From ...' event (0 for all other events), summed per essay.
# The per-event value is stored on train_logs as 'y_difference'.
train_logs['y_difference'] = [
    extract_y_difference(activity) if 'Move From' in activity else 0
    for activity in train_logs['activity']
]
y_difference_per_essay = train_logs.groupby('id')['y_difference'].sum().fillna(0)

#merging both: the two per-essay series side by side, aligned on essay id.
result_df_new = pd.concat(
    [
        cursor_move_count_new.rename('cursor_move_count_new'),
        y_difference_per_essay.rename('y_difference_per_essay'),
    ],
    axis=1,
)

#merging all features until now
time_df = time_df.merge(result_df_new, on='id')

#no of insertions: per essay, the number of 'Input'/'Replace' events that happen while the
# cursor is behind the furthest cursor position reached so far in that essay.

# Running maximum of cursor_position within each essay (current event included),
# never below 0.
furthest_position = train_logs.groupby('id')['cursor_position'].cummax().clip(lower=0)

is_insertion = (
    (train_logs['cursor_position'] < furthest_position)
    & train_logs['activity'].isin(['Input', 'Replace'])
)
num_insertions_per_essay = is_insertion.groupby(train_logs['id']).sum()

# One row per essay (0 for essays without insertions), built in a single step.
parent_df_3 = num_insertions_per_essay.reset_index(name='num_insertions_per_essay')

#merging with all features
parent_df_4 = parent_df_3.merge(time_df, on='id')
# Proportion of insertions: insertions divided by the time spent typing.
parent_df_4['proportion_of_insertions'] = (
    parent_df_4['num_insertions_per_essay'] / parent_df_4['time_spent_typing_per_essay']
)

#extracting features in new df
new_df = parent_df_4[['id', 'num_insertions_per_essay', 'proportion_of_insertions']]

#merging all final features
time_df = time_df.merge(new_df, on='id')

#extract the required features
final_feature_columns = [
    'id',
    'rate_of_keystrokes',
    'nonproduction_time_per_essay',
    'cursor_move_count',
    'cursor_move_distance',
    'cursor_move_mean_dist',
    'cursor_move_count_new',
    'y_difference_per_essay',
    'num_insertions_per_essay',
    'proportion_of_insertions',
]
time_df_final = time_df[final_feature_columns]


In [None]:
# Missing-value check: "is there any NaN?" followed by the total number of NaN cells.
# Only the last expression of a cell is displayed, so the first result is not shown.
time_df_final.isnull().values.any()
time_df_final.isnull().sum().sum()

In [None]:
# Replace every missing feature value with 0 (fillna does not touch +/-inf values).
time_df_final=time_df_final.fillna(0)

In [None]:
# Save this feature table without the row index.
time_df_final.to_csv("pratyush_df.csv", index=False)

In [None]:
# Nesar - cell 1/6
# Inter-key interval (IKI): difference between the down_time of consecutive events of
# the same essay; the first event of an essay gets 0.

df_pause = train_logs  # alias: the new column is written onto train_logs

df_pause['interkey_interval'] = df_pause.groupby('id')['down_time'].diff().fillna(0)

# Sum, mean and standard deviation of the intervals per essay.
iki_summary_data = (
    df_pause.groupby('id')['interkey_interval']
    .agg(sum_IKI='sum', mean_IKI='mean', SD_IKI='std')
    .reset_index()
)

dhruv = dhruv.merge(iki_summary_data, on='id')

            id    sum_IKI    mean_IKI       SD_IKI
0     001519c8  1797351.0  702.913962  4295.447374
1     0022f953  1758219.0  716.470660  4894.385161
2     0042269b  1766778.0  427.170696  3939.226278
3     0059420b  1362999.0  875.963368  4247.568454
4     0075873a  1583920.0  625.807981  3896.405072
...        ...        ...         ...          ...
2466  ffb8c745  1769114.0  373.309559  3457.675123
2467  ffbef7e5  1777392.0  682.562212  5632.013483
2468  ffccd6fd  1935791.0  631.991838  5399.385611
2469  ffec5b38  1488450.0  459.114744  3460.439398
2470  fff05981  2030338.0  561.021829  2987.199508

[2471 rows x 4 columns]


Unnamed: 0,id,num_backspaces,sum_IKI,mean_IKI,SD_IKI
0,001519c8,417,1797351.0,702.913962,4295.447374
1,0022f953,260,1758219.0,716.47066,4894.385161
2,0042269b,439,1766778.0,427.170696,3939.226278
3,0059420b,152,1362999.0,875.963368,4247.568454
4,0075873a,517,1583920.0,625.807981,3896.405072


In [None]:
# Nesar Cell 2/6
# Per-essay summary of the pauses between consecutive events.

df = train_logs  # alias: the columns added below are written onto train_logs

# Pause before an event = its down_time minus the up_time of the previous event of the
# same essay. The first event of an essay has no predecessor and 0 is used in its place.
# NOTE(review): that makes the first "pause" of every essay equal to its first down_time -
# confirm this is intended.
df['pause'] = df['down_time'] - df.groupby('id')['up_time'].shift(1).fillna(0)

df['abs_pause'] = np.abs(df['pause'])

# 'count' is kept temporarily as the per-essay denominator for the long-pause share.
summary_data = df.groupby('id')['abs_pause'].agg(['sum', 'max', 'mean', 'var', 'count']).reset_index()
summary_data.columns = ['id', 'Total_Pause_Time', 'Longest_Pause_Time', 'Mean_Pause_Time', 'Variance_of_total_pause_time', '_pause_count']

# A pause is "long" when it exceeds 5000 (same units as down_time/up_time).
df['Num_Long_Pauses'] = np.where(df['abs_pause'] > 5000, 1, 0)
num_long_pauses = df.groupby('id')['Num_Long_Pauses'].sum().reset_index(name='Num_Long_Pauses')

summary_data = pd.merge(summary_data, num_long_pauses, on='id', how='outer').fillna(0)

# Share of long pauses among the essay's own pauses. The denominator must be the per-essay
# count: dividing by a row-level transform('count') Series aligns by row position and ends
# up dividing every essay by the event count of the first essay in the log.
summary_data['Percentage_of_Longest_Pauses'] = summary_data['Num_Long_Pauses'] / summary_data['_pause_count']
summary_data = summary_data.drop(columns='_pause_count')
summary_data.to_csv("outputcompare.csv", index=False)

dhruv = pd.merge(dhruv, summary_data, on='id')

            id  Total_Pause_Time  Longest_Pause_Time  Mean_Pause_Time  \
0     001519c8         1566710.0            154136.0       612.714118   
1     0022f953         1552368.0            145899.0       632.586797   
2     0042269b         1495492.0            153886.0       361.579304   
3     0059420b         1234009.0            101690.0       793.064910   
4     0075873a         1403502.0            110688.0       554.524694   
...        ...               ...                 ...              ...   
2466  ffb8c745         1392075.0            128570.0       293.748681   
2467  ffbef7e5         1585157.0            267869.0       608.739247   
2468  ffccd6fd         1729045.0            229804.0       564.493960   
2469  ffec5b38         1254237.0            127733.0       386.871376   
2470  fff05981         1793116.0            137607.0       495.472783   

      Variance_of_total_pause_time  Num_Long_Pauses  \
0                     1.840835e+07               60   
1            

Unnamed: 0,id,num_backspaces,sum_IKI,mean_IKI,SD_IKI,Total_Pause_Time,Longest_Pause_Time,Mean_Pause_Time,Variance_of_total_pause_time,Num_Long_Pauses,Percentage_of_Longest_Pauses
0,001519c8,417,1797351.0,702.913962,4295.447374,1566710.0,154136.0,612.714118,18408350.0,60,0.023465
1,0022f953,260,1758219.0,716.47066,4894.385161,1552368.0,145899.0,632.586797,24321250.0,48,0.018772
2,0042269b,439,1766778.0,427.170696,3939.226278,1495492.0,153886.0,361.579304,15479010.0,32,0.012515
3,0059420b,152,1362999.0,875.963368,4247.568454,1234009.0,101690.0,793.06491,19026380.0,31,0.012124
4,0075873a,517,1583920.0,625.807981,3896.405072,1403502.0,110688.0,554.524694,17552740.0,48,0.018772


In [None]:
# Nesar Cell 3/6
# Pause time accumulated inside each completed sentence; a sentence is closed by an
# event whose down_event is '.'.

df = train_logs

is_period = df['down_event'].eq('.')
period_flag = is_period.astype(int)

# Gap between this event's up_time and the down_time of the next event of the SAME essay.
# The last event of an essay has no successor and contributes 0.
next_down_time = df.groupby('id')['down_time'].shift(-1)
pause_after = (next_down_time - df['up_time']).fillna(0)

# Sentence number of an event = number of '.' events strictly before it in its essay.
# The '.' event itself closes that sentence and adds no pause of its own.
sentence_idx = period_flag.groupby(df['id']).cumsum() - period_flag
periods_in_essay = period_flag.groupby(df['id']).transform('sum')

sentence_events = pd.DataFrame({
    'id': df['id'],
    'sentence_idx': sentence_idx,
    'pause': pause_after.where(~is_period, 0),
})
# Events after the final '.' of an essay belong to an unfinished sentence and are left out.
sentence_events = sentence_events[sentence_idx < periods_in_essay]
sentence_pause_totals = sentence_events.groupby(['id', 'sentence_idx'])['pause'].sum()

# One row per essay (every essay, not only the last one processed): total, mean and
# sample standard deviation (ddof=1) of the per-sentence pause totals.
results_df = (
    sentence_pause_totals.groupby(level='id')
    .agg(['sum', 'mean', 'std'])
    .rename(columns={
        'sum': 'Total_Pause_Time_in_sentences',
        'mean': 'Mean_Pause_Time_in_sentences',
        'std': 'SD_Pause_Time_in_sentences',
    })
    .reindex(pd.Index(df['id'].unique(), name='id'))
)
# Essays without any '.' have a total of 0; their mean and SD stay NaN.
results_df['Total_Pause_Time_in_sentences'] = results_df['Total_Pause_Time_in_sentences'].fillna(0)
results_df = results_df.reset_index()

dhruv = pd.merge(dhruv, results_df, on='id', how='outer')

Unnamed: 0,id,num_backspaces,sum_IKI,mean_IKI,SD_IKI,Total_Pause_Time,Longest_Pause_Time,Mean_Pause_Time,Variance_of_total_pause_time,Num_Long_Pauses,Percentage_of_Longest_Pauses,Total_Pause_Time_in_sentences,Mean_Pause_Time_in_sentences,SD_Pause_Time_in_sentences
0,001519c8,417,1797351.0,702.913962,4295.447374,1566710.0,154136.0,612.714118,18408350.0,60,0.023465,,,
1,0022f953,260,1758219.0,716.47066,4894.385161,1552368.0,145899.0,632.586797,24321250.0,48,0.018772,,,
2,0042269b,439,1766778.0,427.170696,3939.226278,1495492.0,153886.0,361.579304,15479010.0,32,0.012515,,,
3,0059420b,152,1362999.0,875.963368,4247.568454,1234009.0,101690.0,793.06491,19026380.0,31,0.012124,,,
4,0075873a,517,1583920.0,625.807981,3896.405072,1403502.0,110688.0,554.524694,17552740.0,48,0.018772,,,


In [None]:
# Nesar cell 4/6
# Pause between a '.' event and the next event of the same essay.

df = train_logs

# |down_time of the next event in the same essay - up_time of this event|.
# Grouping by essay keeps the "next" event from being the first event of another essay;
# a '.' that is the last event of its essay has no such pause and is skipped.
next_down_time = df.groupby('id')['down_time'].shift(-1)
period_pauses = pd.DataFrame({
    'id': df['id'],
    'pause': (next_down_time - df['up_time']).abs(),
})
period_pauses = period_pauses[df['down_event'].eq('.')].dropna(subset=['pause'])

# Total, mean and sample standard deviation (ddof=1) per essay.
results_df = (
    period_pauses.groupby('id')['pause']
    .agg(['sum', 'mean', 'std'])
    .rename(columns={
        'sum': 'Total_Pause_Time_Before_Sentences',
        'mean': 'Mean_Pause_Time_Before_Sentences',
        'std': 'SD_Pause_Time_Before_Sentences',
    })
    .reindex(pd.Index(df['id'].unique(), name='id'))
)
# Essays without any such pause have a total of 0; their mean and SD stay NaN.
results_df['Total_Pause_Time_Before_Sentences'] = results_df['Total_Pause_Time_Before_Sentences'].fillna(0)
results_df = results_df.reset_index()


dhruv = pd.merge(dhruv, results_df, on='id', how='outer')
dhruv.head()

In [None]:
# Nesar cell 5/ 6
# Pause time accumulated inside each completed word; a word is closed by an event whose
# down_event is 'Space'.

df = train_logs

is_space = df['down_event'].eq('Space')
space_flag = is_space.astype(int)

# |down_time of the next event in the SAME essay - up_time of this event|.
# The last event of an essay has no successor and contributes 0.
next_down_time = df.groupby('id')['down_time'].shift(-1)
pause_after = (next_down_time - df['up_time']).abs().fillna(0)

# Word number of an event = number of 'Space' events strictly before it in its essay.
# The 'Space' event itself closes that word and adds no pause of its own.
word_idx = space_flag.groupby(df['id']).cumsum() - space_flag
spaces_in_essay = space_flag.groupby(df['id']).transform('sum')

word_events = pd.DataFrame({
    'id': df['id'],
    'word_idx': word_idx,
    'pause': pause_after.where(~is_space, 0),
})
# Events after the final 'Space' of an essay belong to an unfinished word and are left out.
word_events = word_events[word_idx < spaces_in_essay]
word_pause_totals = word_events.groupby(['id', 'word_idx'])['pause'].sum()

# One row per essay (every essay, not only the last one processed): total, mean and
# sample standard deviation (ddof=1) of the per-word pause totals.
results_df = (
    word_pause_totals.groupby(level='id')
    .agg(['sum', 'mean', 'std'])
    .rename(columns={
        'sum': 'Total_Pause_Time_within_word',
        'mean': 'Mean_Pause_Time_within_word',
        'std': 'SD_Pause_Time_within_word',
    })
    .reindex(pd.Index(df['id'].unique(), name='id'))
)
# Essays without any 'Space' have a total of 0; their mean and SD stay NaN.
results_df['Total_Pause_Time_within_word'] = results_df['Total_Pause_Time_within_word'].fillna(0)
results_df = results_df.reset_index()

dhruv = pd.merge(dhruv, results_df, on='id', how='outer')

In [None]:
# Nesar cell 6 / 6
# For every 'Space' event: time between the up_time of the previous event and the
# down_time of the next event of the same essay.

df = train_logs

# Neighbours are taken within each essay, so a 'Space' at the start or end of an essay is
# never paired with an event from a different essay; such a 'Space' is skipped.
previous_up_time = df.groupby('id')['up_time'].shift(1)
next_down_time = df.groupby('id')['down_time'].shift(-1)
space_gaps = pd.DataFrame({
    'id': df['id'],
    'pause': (next_down_time - previous_up_time).abs(),
})
space_gaps = space_gaps[df['down_event'].eq('Space')].dropna(subset=['pause'])

# Total, mean and population standard deviation (ddof=0) per essay.
# NOTE(review): the other pause features use the sample standard deviation (ddof=1);
# ddof=0 is kept here as written - confirm which one is intended.
gaps_by_essay = space_gaps.groupby('id')['pause']
results_df = pd.DataFrame({
    'Total_Pause_Time_Before_Word': gaps_by_essay.sum(),
    'Mean_Pause_Time_Before_Word': gaps_by_essay.mean(),
    'SD_Pause_Time_Before_Word': gaps_by_essay.std(ddof=0),
}).reindex(pd.Index(df['id'].unique(), name='id'))
# Essays without any such gap have a total of 0; their mean and SD stay NaN.
results_df['Total_Pause_Time_Before_Word'] = results_df['Total_Pause_Time_Before_Word'].fillna(0)
results_df = results_df.reset_index()

dhruv = pd.merge(dhruv, results_df, on='id', how='outer')
dhruv.head()


In [None]:
# Save the accumulated feature table (everything merged into `dhruv` so far), without the row index.
dhruv.to_csv("nesar_df.csv", index=False)

In [None]:
# Outer-join the training scores onto the feature table by essay id and save the result.
dhruv = pd.merge(dhruv, train_scores , on='id', how='outer')
dhruv.to_csv('final_df.csv', index=False)