In [1]:
! pip install transformers
! pip install datasets

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 56.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [2]:
import pandas as pd
import numpy as np
import pdb
import os
os.chdir('/content/drive/MyDrive/CAMemBERT2')
import re
from datasets import Dataset,DatasetDict,load_dataset,concatenate_datasets
from transformers import AutoTokenizer
from torch import FloatTensor
from torch.cuda import is_available
from math import ceil,floor
import random

In [3]:
class LinkGedDatasetToEssayDataset:

    _map_labels_2_ids = {'c':0,'i':1}
    _map_ids_2_labels = {0:'c',1:'i'}

    def __init__(self,set_type='train'):
        self.errors_count=0
        self.set_type = 'dev'
        essays = pd.read_json(f'data/fce.{set_type}.json',lines=True)
        essays['text'] = essays.text.str.replace('\n',' ')
        essays['text_no_ws'] = essays.text.str.split().str.join('')
        essays['essay_char_len'] = essays['text_no_ws'].apply(len)
        essays['end_word_ind'] = essays['essay_char_len'].cumsum()
        essays['start_ind'] = essays['end_word_ind'] - essays['essay_char_len']
        essays['ind_combined'] = essays.apply(lambda x: list([x['start_ind'],x['end_word_ind']]),axis=1)
        # essays['text_no_ws_from'] = essays['ind_combined'].apply(lambda x:''.join(df.loc[(df['end_word_ind']>x[0])&(df['end_word_ind']<=x[1])].word.tolist()))
        self.essays = essays
        self.essay_col_index = {col:i+1 for i,col in enumerate(essays)}
        ged = pd.DataFrame(pd.read_csv(f'data/fce-public.{set_type}.original.tsv',sep='  ',names=['word']).word.str.split('\t',1).tolist(),columns = ['word','correct'])
        ged['correct'] = ged['correct'].map(self._map_labels_2_ids)
        ged['end_word_ind'] = ged.word.apply(len).cumsum()
        self.ged = ged
        self.all_words_no_ws = ''.join(ged.word.tolist())
        essay_indexes_to_keep,matched_essays = zip(*[(i,re.search(re.escape(ess_no_ws),self.all_words_no_ws)) for i,ess_no_ws in enumerate(essays.text_no_ws.tolist()) if ess_no_ws in self.all_words_no_ws ])
        essays_from_ged = [' '.join(self.ged.loc[(self.ged['end_word_ind']>m.start())& (self.ged['end_word_ind']<=m.end())]['word'].tolist()) for m in matched_essays]
        tags_from_ged = [self.ged.loc[(self.ged['end_word_ind']>m.start())& (self.ged['end_word_ind']<=m.end())]['correct'].tolist() for m in matched_essays]
        essays = essays.iloc[list(essay_indexes_to_keep)].reset_index()
        self.updated_df = pd.concat([pd.DataFrame({'essays':essays_from_ged,'tags':tags_from_ged}),essays[['answer-s','script-s','id']]],axis=1)
    
    def get_updated_df(self):
        return self.updated_df

class CreateHuggingFaceDictGed:

    _set_types = ['train','test','dev']
    _cols_to_keep = ['attention_mask','labels','input_ids','script_scores']

    def __init__(self,pretrained_model= 'distilroberta-base',max_length=512):
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        combined_df = pd.concat([self.add_col(LinkGedDatasetToEssayDataset(set_type).get_updated_df(),'set_type',set_type) for set_type in self._set_types],axis=0)
        df = combined_df.rename(columns={'essays':'text','script-s':'script_scores','tags':'labels'})
        df['script_scores'] = df['script_scores'].astype(int)        
        df = df.groupby('id').agg({'text':list,'labels':list,'script_scores':list,'set_type':list})
        df[ 'text' ] = df[ 'text' ].str.join(' ')
        df['labels'] = df['labels'].apply(lambda x : x[0] + x[1] if len(x)>1 else x[0])
        df[ 'script_scores' ] = df[ 'script_scores' ].apply(lambda x : x[0])
        df[ 'set_type' ] = df[ 'set_type' ].apply(lambda x : x[0])
        df = df.reset_index()[['text','labels','script_scores','set_type']]
        self.df_for_sent = df
        dataset_dict = DatasetDict({set_type:Dataset.from_pandas(df.groupby('set_type').get_group(set_type)) for set_type in self._set_types})
        dataset_dict = dataset_dict.map(self.extend_labels_for_tokenizer).map(self.preprocessing_func)
        cols_to_drop = set(dataset_dict.column_names['train']) - set(self._cols_to_keep)
        self.dataset_dict = dataset_dict.remove_columns(list(cols_to_drop))
        self.set_weights()

    def add_col(self,df,col,val):
        df[col] = val
        return df

    def get_df(self):
        return self.df_for_sent

    def get_dataset_dict(self):
        return self.dataset_dict

    def get_weights(self):
        return self.class_weights

    def extend_labels_for_tokenizer(self,example):
        tokens,labels = example['text'].split(),example['labels']
        r_tags , token2word = [] , []
        count = 0
        for index, token in enumerate( self.tokenizer.tokenize( ' '.join( tokens ) , truncation = True , padding = False , add_special_tokens = False , max_length = self.max_length ) ):

            if ( ( ( ( token.startswith( "Ġ" ) == False and index != 0 ) or ( token in tokens[ index - count - 1 ].lower() and index - count - 1 >= 0 ) ) and self.tokenizer.sep_token == '</s>' ) 
                or ( ( token.startswith( "##" ) or ( token in tokens[index - count - 1].lower() and index - count - 1 >= 0 ) ) and self.tokenizer.sep_token == '[SEP]' ) ):

                r_tags.append( -100 )
                
                count += 1

            else:

                try:
                    r_tags.append(labels[index - count])
                except:
                    pdb.set_trace()

            token2word.append( index - count )
        return {'labels':np.pad( r_tags , ( 0 , 512 - len( r_tags ) ) , 'constant' , constant_values = ( 0 , -100 ) )[:self.max_length]}

    def preprocessing_func(self,example):
        return self.tokenizer( example['text'] , truncation=True , padding = 'max_length' , max_length = self.max_length )

    def set_weights(self):
        dataset = self.get_dataset_dict()
        padding,n_c,n_i = np.unique(np.concatenate(dataset['train']['labels']),return_counts=True)[1]
        class_weights = FloatTensor([(n_c + n_i)/(2.0 * n_c),(n_c + n_i)/(2.0 * n_i)]).to('cuda' if is_available() else 'cpu')
        self.class_weights = class_weights

dataset_dict = CreateHuggingFaceDictGed().get_dataset_dict()

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

In [4]:
dataset_dict = dataset_dict.remove_columns(['labels'])
dataset_dict = dataset_dict.rename_column('script_scores','labels')

In [5]:
from transformers import AutoModelForSequenceClassification,TrainingArguments,Trainer
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score
import numpy as np
import torch.nn.functional as F

model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base',num_labels=1)
modules = [model.roberta.embeddings, *model.roberta.encoder.layer[:6]] #freeze all layers
for module in modules:
    for param in module.parameters():
        param.requires_grad = False

def compute_metrics(p):
    logits, labels = p.predictions,p.label_ids
    logits = np.rint(((logits)).flatten())
    labels = np.rint(((labels)).flatten())
    rmse = np.sqrt(np.mean((logits-labels)**2))
    pearson = np.corrcoef(logits,labels)[0,1]
    coef, p = spearmanr(logits, labels)
    kappa = cohen_kappa_score(logits, labels)
    return {
            "rmse": rmse,
            "pearson": pearson,
            "spearman" : coef,
            "kappa":kappa
          }

batch_size = 8
label_names = ['labels']
args = TrainingArguments(
        output_dir = '/content',
        save_total_limit = 1,
        evaluation_strategy = "epoch",
        learning_rate = 5e-4,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        remove_unused_columns=False,
        num_train_epochs = 30,
        weight_decay = 0.01,
        save_strategy = 'epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        label_names=label_names
    )


class RegressionTrainer(Trainer):
    def compute_loss(self,model,inputs,return_outputs = False):


        bert_ouput = model(**inputs).logits
        batch_size = inputs['input_ids'].size()[0]

        predictions = bert_ouput.view(batch_size)
        predictions_for_loss = bert_ouput.view(batch_size).float()
            
        labels_for_loss = inputs['labels'].float()

        loss = F.mse_loss(predictions_for_loss,labels_for_loss)

        return (loss,(loss,predictions.int())) if return_outputs else loss

trainer = RegressionTrainer(
    model=model,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['dev'],
    compute_metrics=compute_metrics
    )
trainer.train()

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

Epoch,Training Loss,Validation Loss,Rmse,Pearson,Spearman,Kappa
1,No log,31.488317,224.657264,0.208458,0.199498,0.044857
2,No log,28.858309,214.180806,0.457432,0.469786,-0.014067
3,No log,32.809269,221.113408,0.557075,0.577647,0.076659
4,58.630700,28.400805,203.563196,0.611745,0.617536,0.056797
5,58.630700,25.745996,194.089892,0.643962,0.636864,0.027692
6,58.630700,30.549595,210.027123,0.655866,0.65284,-0.018836
7,58.630700,35.291904,227.167479,0.642638,0.632567,-0.002708
8,18.815400,24.755314,188.746699,0.682266,0.69874,0.029352
9,18.815400,30.587173,206.232019,0.703493,0.714802,-0.018836
10,18.815400,28.721634,202.215576,0.685398,0.691012,-0.020048


***** Running Evaluation *****
  Num examples = 79
  Batch size = 8
Saving model checkpoint to /content/checkpoint-133
Configuration saved in /content/checkpoint-133/config.json
Model weights saved in /content/checkpoint-133/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 79
  Batch size = 8
Saving model checkpoint to /content/checkpoint-266
Configuration saved in /content/checkpoint-266/config.json
Model weights saved in /content/checkpoint-266/pytorch_model.bin
Deleting older checkpoint [/content/checkpoint-133] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 79
  Batch size = 8
Saving model checkpoint to /content/checkpoint-399
Configuration saved in /content/checkpoint-399/config.json
Model weights saved in /content/checkpoint-399/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 79
  Batch size = 8
Saving model checkpoint to /content/checkpoint-532
Configuration saved in /content/checkpoint-532/config.json
Model weight

KeyboardInterrupt: ignored

In [None]:
class LinkGedDatasetToEssayDataser:

    def __init__(self,set_type='train'):
        self.errors_count=0
        self.set_type = 'dev'
        essays = pd.read_json(f'data/fce.{set_type}.json',lines=True)
        essays['text'] = essays.text.str.replace('\n',' ')
        essays['text_no_ws'] = essays.text.str.split().str.join('')
        essays['essay_char_len'] = essays['text_no_ws'].apply(len)
        essays['end_word_ind'] = essays['essay_char_len'].cumsum()
        essays['start_ind'] = essays['end_word_ind'] - essays['essay_char_len']
        essays['ind_combined'] = essays.apply(lambda x: list([x['start_ind'],x['end_word_ind']]),axis=1)
        # essays['text_no_ws_from'] = essays['ind_combined'].apply(lambda x:''.join(df.loc[(df['end_word_ind']>x[0])&(df['end_word_ind']<=x[1])].word.tolist()))
        self.essays = essays
        self.essay_col_index = {col:i+1 for i,col in enumerate(essays)}
        ged = pd.DataFrame(pd.read_csv(f'data/fce-public.{set_type}.original.tsv',sep='  ',names=['word']).word.str.split('\t',1).tolist(),columns = ['word','correct'])
        ged['end_word_ind'] = ged.word.apply(len).cumsum()
        self.ged = ged
        self.all_words_no_ws = ''.join(ged.word.tolist())
        self.find_differences()
        essays_from_ged = [' '.join(self.ged.loc[(self.ged['end_word_ind']>ind[0])& (self.ged['end_word_ind']<=ind[1])]['word'].tolist()) for ind in self.new_indexes]
        tags_from_ged = [self.ged.loc[(self.ged['end_word_ind']>ind[0])& (self.ged['end_word_ind']<=ind[1])]['correct'].tolist() for ind in self.new_indexes]
        self.updated_df = pd.DataFrame({'essays':essays_from_ged,'tags':tags_from_ged})

    def get_updated_df(self):
        return self.updated_df

    def find_differences(self):
        self.new_text = []
        self.new_indexes = []
        self.new_indexes_errors = []
        for i,row in enumerate(self.essays.itertuples()):
            essay_errors_count = 0
            essay_no_ws = row[self.essay_col_index['text_no_ws']]
            char_len = row[self.essay_col_index['essay_char_len']]
            start_ind,end_ind = row[self.essay_col_index['start_ind']]-self.errors_count,row[self.essay_col_index['end_word_ind']]-self.errors_count
            ged_essay = self.all_words_no_ws[start_ind:end_ind]
            self.new_indexes.append([start_ind,end_ind])
            try:
                np.all(np.array(list(essay_no_ws))==np.array(list(ged_essay)))
            except:
                if self.set_type=='dev':
                    essay_no_ws = essay_no_ws.replace("''","")
            if np.all(np.array(list(essay_no_ws))==np.array(list(ged_essay))):
                self.new_text.append(essay_no_ws)
            else:
                
                current_error = min(np.nonzero(np.invert(np.array(list(essay_no_ws))==np.array(list(ged_essay))))[0])
                if np.all(essay_no_ws[current_error+1:char_len] == ged_essay[current_error:char_len-1]):
                    tmp_text = essay_no_ws[current_error+1:char_len]
                    combined_text = essay_no_ws[:current_error]+tmp_text
                    self.new_text.append(combined_text)
                    self.errors_count+=1
                else:
                    correct_text = ged_essay[:current_error]
                    a = True
                    while a == True:
                        tmp_ess = essay_no_ws[current_error+essay_errors_count+1:char_len]
                        tmp_ged = ged_essay[current_error:char_len-essay_errors_count-1]
                        if tmp_ess == tmp_ged:
                            tmp_text = essay_no_ws[current_error+1:char_len]
                            correct_text = correct_text + tmp_ged
                            self.new_text.append(correct_text)
                            self.errors_count+=essay_errors_count
                            a = False
                            break

                        else:
                            char_to_next_error = min(np.nonzero(np.invert(np.array(list(tmp_ess))==np.array(list(tmp_ged))))[0])
                            current_error += char_to_next_error
                            correct_text = correct_text + tmp_ged[:char_to_next_error]
                            essay_errors_count+=1

                

d = LinkGedDatasetToEssayDataser('dev')

  app.launch_new_instance()


In [None]:
class LinkGedDatasetToEssayDataset:

    def __init__(self,set_type='train'):
        self.errors_count=0
        self.set_type = 'dev'
        essays = pd.read_json(f'data/fce.{set_type}.json',lines=True)
        essays['text'] = essays.text.str.replace('\n',' ')
        essays['text_no_ws'] = essays.text.str.split().str.join('')
        essays['essay_char_len'] = essays['text_no_ws'].apply(len)
        essays['end_word_ind'] = essays['essay_char_len'].cumsum()
        essays['start_ind'] = essays['end_word_ind'] - essays['essay_char_len']
        essays['ind_combined'] = essays.apply(lambda x: list([x['start_ind'],x['end_word_ind']]),axis=1)
        # essays['text_no_ws_from'] = essays['ind_combined'].apply(lambda x:''.join(df.loc[(df['end_word_ind']>x[0])&(df['end_word_ind']<=x[1])].word.tolist()))
        self.essays = essays
        self.essay_col_index = {col:i+1 for i,col in enumerate(essays)}
        ged = pd.DataFrame(pd.read_csv(f'data/fce-public.{set_type}.original.tsv',sep='  ',names=['word']).word.str.split('\t',1).tolist(),columns = ['word','correct'])
        ged['end_word_ind'] = ged.word.apply(len).cumsum()
        self.ged = ged
        self.all_words_no_ws = ''.join(ged.word.tolist())
        if set_type=='test':
            essay_indexes_to_keep,matched_essays = zip(*[(i,re.search(re.escape(ess_no_ws),self.all_words_no_ws)) for i,ess_no_ws in enumerate(essays.text_no_ws.tolist()) if ess_no_ws in self.all_words_no_ws ])
            essays_from_ged = [' '.join(self.ged.loc[(self.ged['end_word_ind']>m.start())& (self.ged['end_word_ind']<=m.end())]['word'].tolist()) for m in matched_essays]
            tags_from_ged = [self.ged.loc[(self.ged['end_word_ind']>m.start())& (self.ged['end_word_ind']<=m.end())]['correct'].tolist() for m in matched_essays]
            essays = essays.iloc[list(essay_indexes_to_keep)].reset_index()
        else:
            self.find_differences()
            essays_from_ged = [' '.join(self.ged.loc[(self.ged['end_word_ind']>ind[0])& (self.ged['end_word_ind']<=ind[1])]['word'].tolist()) for ind in self.new_indexes]
            tags_from_ged = [self.ged.loc[(self.ged['end_word_ind']>ind[0])& (self.ged['end_word_ind']<=ind[1])]['correct'].tolist() for ind in self.new_indexes]
        self.updated_df = pd.concat([pd.DataFrame({'essays':essays_from_ged,'tags':tags_from_ged}),essays[['answer-s','script-s']]],axis=1)

    def get_updated_df(self):
        return self.updated_df

    def find_differences(self):
        self.new_text = []
        self.new_indexes = []
        self.new_indexes_errors = []
        for i,row in enumerate(self.essays.itertuples()):
            essay_errors_count = 0
            essay_no_ws = row[self.essay_col_index['text_no_ws']]
            char_len = row[self.essay_col_index['essay_char_len']]
            start_ind,end_ind = row[self.essay_col_index['start_ind']]-self.errors_count,row[self.essay_col_index['end_word_ind']]-self.errors_count
            ged_essay = self.all_words_no_ws[start_ind:end_ind]
            self.new_indexes.append([start_ind,end_ind])
            try:
                np.all(np.array(list(essay_no_ws))==np.array(list(ged_essay)))
            except:
                if self.set_type=='dev':
                    essay_no_ws = essay_no_ws.replace("''","")
            if np.all(np.array(list(essay_no_ws))==np.array(list(ged_essay))):
                self.new_text.append(essay_no_ws)
            else:
                
                current_error = min(np.nonzero(np.invert(np.array(list(essay_no_ws))==np.array(list(ged_essay))))[0])
                if np.all(essay_no_ws[current_error+1:char_len] == ged_essay[current_error:char_len-1]):
                    tmp_text = essay_no_ws[current_error+1:char_len]
                    combined_text = essay_no_ws[:current_error]+tmp_text
                    self.new_text.append(combined_text)
                    self.errors_count+=1
                else:
                    correct_text = ged_essay[:current_error]
                    a = True
                    while a == True:
                        tmp_ess = essay_no_ws[current_error+essay_errors_count+1:char_len]
                        tmp_ged = ged_essay[current_error:char_len-essay_errors_count-1]
                        if tmp_ess == tmp_ged:
                            tmp_text = essay_no_ws[current_error+1:char_len]
                            correct_text = correct_text + tmp_ged
                            self.new_text.append(correct_text)
                            self.errors_count+=essay_errors_count
                            a = False
                            break

                        else:
                            char_to_next_error = min(np.nonzero(np.invert(np.array(list(tmp_ess))==np.array(list(tmp_ged))))[0])
                            current_error += char_to_next_error
                            correct_text = correct_text + tmp_ged[:char_to_next_error]
                            essay_errors_count+=1

train = LinkGedDatasetToEssayDataset('train')                
dev = LinkGedDatasetToEssayDataset('dev')
test = LinkGedDatasetToEssayDataset('test')

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


In [None]:
train.updated_df

Unnamed: 0,essays,tags,answer-s,script-s
0,"Dear Sir or Madam , I am writing in order to e...","[c, c, c, c, c, c, c, c, c, c, c, c, c, c, i, ...",4.3,31
1,"Unfortunately , Pat was n't very good at keepi...","[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...",5.1,31
2,"10 June 2000 Dear Manager , I would like to co...","[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...",3.3,29
3,Fashion of the future People will wear this ki...,"[c, c, c, c, c, c, c, c, c, c, c, c, c, i, c, ...",3.3,29
4,DECEMBER 12TH PRINCIPAL MR . ROBERTSON DEAR SI...,"[c, c, c, c, c, c, c, c, c, c, c, c, i, c, c, ...",2.3,28
...,...,...,...,...
2111,If you ask at twenty womans what is their favo...,"[c, c, c, i, c, i, c, i, i, i, i, c, i, c, c, ...",3.3,29
2112,Dear Mr. Smith ; I am writing this letter to m...,"[c, c, c, i, c, c, c, c, c, c, i, c, c, i, c, ...",3.1,24
2113,TECHNOLOGICAL ADVANTAGES Technology is the thi...,"[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...",3.2,24
2114,"Dear Sir or Madam , I am writing to express my...","[c, c, c, c, c, c, c, c, c, c, c, c, i, c, c, ...",4.3,34


In [None]:
max(train.essays.text.str.len())

2537

In [None]:
d.all_words_no_ws[707:]

'PowerOfComputurs.Howmuchcanacomputeraffectyourlife?Ithinkiteffectsmorethanyouthink.WhenIwasachildIalwayswantedtogetthelyricsofthesongsthatIliked,butitwasnearlyimpossibletohavethem.Ifeltlackofthem.IfirstboughtacomputerasIwasinuniversity.ItwassonecessaryformebecauseIwantedtobeacomputerprogrammer.ThattimeImetthemagicworldofcomputers.Linkingtotheinternetwassopopular.Internetwassomethingamazingforme.Canyouthinkyouhaveeveryinformationyouneedwheneveryouwant?Canyouthinkyoucontactwithanyonewhoisfromanycountrywheneveryouwant?Imeetalotofpeopleoninternetanditreallyinterestme.Ilearnalotabouttheirculturebyspeakingthemoninternet.Iwasn\'tabletoimaginethisbefore.Inadditionthis,todaycompaniescandonothingwithoutacomputer.Everythinginthisworldworkswithcomputersfromnowon.Computerschangedmylifestyle,myopinionabouttheworld.Andtheyeffecteveryone\'slifedirectlyorindirectly.12thJune2001DarMrs.MariaSmith,IamverygladtohelpyouabouttheInternationalStudentConference.IhopethatourfriendRichardBrowndoesn\'thaveanyseri

In [None]:
d.essays.iloc[86:]

Unnamed: 0,text,age,q,script-s,edits,l1,id,answer-s,text_no_ws,essay_char_len,end_word_ind,start_ind,ind_combined
86,"Dear Mrs Smith I have received your letter, w...",21-25,1,26,"[[0, [[61, 97, None, 'CE'], [121, 131, '', 'UN...",tr,TE44*0100*2001*01,3.3,"DearMrsSmithIhavereceivedyourletter,whichwasab...",707,73407,72700,"[72700, 73407]"
87,Power Of Computurs. How much can a computer ...,21-25,3,26,"[[0, [[0, 0, 'The', 'MD'], [75, 82, 'affects',...",tr,TE44*0100*2001*01,3.2,PowerOfComputurs.Howmuchcanacomputeraffectyour...,859,74266,73407,"[73407, 74266]"
88,"Dear Mrs M. Smith, I am writing in order to r...",21-25,1,31,"[[0, [[188, 192, '', 'UA'], [259, 271, 'number...",it,TE45*0100*2001*01,5.1,"DearMrsM.Smith,Iamwritinginordertoreplytoyourl...",854,75120,74266,"[74266, 75120]"
89,Have you ever thought to live without your ca...,21-25,3,31,"[[0, [[22, 22, 'of', 'MT'], [23, 30, 'living',...",it,TE45*0100*2001*01,3.3,Haveyoueverthoughttolivewithoutyourcar?Howcany...,877,75997,75120,"[75120, 75997]"
90,"Dear Mrs Maria Smith, Thank you for your lett...",26-30,1,18,"[[0, [[81, 81, 'a', 'MD'], [146, 152, '', 'UT'...",ko,TE46*0100*2001*01,1.3,"DearMrsMariaSmith,ThankyouforyourletterwhichIr...",756,76753,75997,"[75997, 76753]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,"The building called in French ""Le Centre Pompi...",21-25,2,27,"[[0, [[149, 149, 'any', 'MQ'], [183, 185, 'abo...",fr,TE95*0100*2001*01,4.3,"ThebuildingcalledinFrench""LeCentrePompidou""isa...",580,160331,159751,"[159751, 160331]"
190,Dear Mrs Smith. I am writing in response to y...,21-25,1,30,"[[0, [[146, 148, 'at', 'RT'], [227, 227, 'the'...",ru,TE96*0100*2001*01,5.1,DearMrsSmith.Iamwritinginresponsetoyourletteri...,899,161230,160331,"[160331, 161230]"
191,"As it is widely known, lots of different and i...",21-25,3,30,"[[0, [[3, 5, '', 'UA'], [182, 183, '', 'UP'], ...",ru,TE96*0100*2001*01,2.3T,"Asitiswidelyknown,lotsofdifferentandimportantt...",992,162222,161230,"[161230, 162222]"
192,"Neuchâtel, 12th June 2001 Dear Mrs Smith, I ...",26-30,1,34,"[[0, [[44, 51, 'Thank', 'UA'], [116, 116, 'you...",fr,TE97*0100*2001*01,4.3,"Neuchâtel,12thJune2001DearMrsSmith,Ithankyoufo...",863,163085,162222,"[162222, 163085]"
