## Import libraries & data

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
final_df = pd.read_csv('final_df_20211027.csv')

In [3]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
956262,( I am going to omit why this accident occured...,( I am going to omit why this accident occured...
604001,I work at a Hotel in Japan .,I work at a Hotel in Japan .
709700,I 'm going to talk about it here . hehe,I 'm going to talk about it here . hehe
20014,My wife took part in a mini - car racing game .,My wife took part of a mini - car racing game .
176213,I love to feel the fresh air on my face .,I love to feel the fresh air in my face n .


In [4]:
final_df.shape

(1037561, 2)

### Adding length features

In [5]:
final_df['correct_char_count'] = final_df['correct'].astype('str').apply(lambda x:len(x))
final_df['incorrect_char_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x))

In [6]:
final_df['correct_word_count'] = final_df['correct'].astype('str').apply(lambda x:len(x.split()))
final_df['incorrect_word_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x.split()))

In [7]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
606077,was the oldest girl in her class .,was an oldest girl in her class .,34,33,8,8
314705,I would be so happy if you guys could correct ...,I 'm so happy if you guys correct them when yo...,74,62,18,16
640984,I arrived at Shinjuku at about noon .,I arrived at Shinjuku about at noon .,37,37,8,8
650381,pretty kittens or puppies - for girls ;,pretty kitten or the puppy - for girls ;,39,40,8,9
368491,Yin was quite taken by surprise because lots o...,Yin was quite taken into surprise that lots of...,118,93,22,19


## Preprocessing

### Removing Missing/NA 

In [8]:
pd.DataFrame(final_df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
correct,1
incorrect,1
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [9]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
222211,,,3,3,1,1


In [10]:
final_df = final_df.dropna().reset_index(drop=True)

In [11]:
final_df.shape

(1037560, 6)

In [12]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
623623,And we have to pay 30 . 000 yen for a celebrat...,And we have to pay 30 . 000 yen for a celebrat...,81,81,18,18
46698,"I feel very bored , and I do n't know the reas...","I feel very boring , I do n't know the reason ...",104,99,25,23
58419,I will go to the area .,I will go to the area .,23,23,7,7
800367,But I got to feel that Mali was very close to ...,But I got to feel Mali very close to me hearin...,76,61,17,14
1010628,My 150th English Gogyohka,My 150th English Gogyohka,25,25,4,4


### Keep unique sentence pairs

In [13]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 539201


In [14]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 539201


In [15]:
final_df[final_df['correct']==final_df['incorrect']].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
310129,I went to a department store to buy some ingre...,I went to a department store to buy some ingre...,80,80,16,16
451215,"It was not so good , but cheap .","It was not so good , but cheap .",32,32,9,9
208323,I 'm a university student who is studying Engl...,I 'm a university student who is studying Engl...,69,69,14,14
437266,StudyStudyStudy,StudyStudyStudy,15,15,1,1
672164,19th birthday,19th birthday,13,13,2,2
188392,"Unfortunately , it was closed .","Unfortunately , it was closed .",31,31,6,6
869416,"However , I want to work hard .","However , I want to work hard .",31,31,8,8
377264,I must study English hard ! ! ! ! !,I must study English hard ! ! ! ! !,35,35,10,10
591978,I moved 8 medaka ( Japanese killifish ) out of...,I moved 8 medaka ( Japanese killifish ) out of...,74,74,16,16
23547,"I determined to run tomorrow morning , but whe...","I determined to run tomorrow morning , but whe...",121,121,26,26


In [16]:
final_df = final_df[final_df['correct']!=final_df['incorrect']]

In [17]:
final_df.shape

(498359, 6)

In [18]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
557318,I need only people who have something to do wi...,I need only people who have something to do wi...,77,76,17,17
473885,I got rid of my cold at last .,I got a rid of my cold at last .,30,32,9,10
394981,The noodle restaurant which we tried to go to ...,The noodle restaurant which we tried to go to ...,147,145,31,30
250362,So we are going to meet downtown at noon .,So we are going to meet in downtown at noon .,42,45,10,11
952618,I wonder if I could pass the Eiken . . .,I wonder I could pass the Eiken . . .,40,37,11,10


### Remove Duplicates

In [19]:
print(f'total number of duplicates: {final_df.duplicated().sum()}')

total number of duplicates: 2021


In [20]:
final_df[final_df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
717379,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
1027462,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
802135,: - ),: - (,5,5,3,3
800388,: - ),: - (,5,5,3,3
161743,A : How much did it cost ?,A : How much does is cost ?,26,27,8,8
...,...,...,...,...,...,...
350827,to be continued . . .,to be continue . . .,21,20,6,6
17343,to be continued . . .,to be continue . . .,21,20,6,6
633235,to be continued . . .,to be continue . . .,21,20,6,6
767284,today was a bad day .,today is a bad day .,21,20,6,6


In [21]:
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [22]:
final_df.shape

(496338, 6)

In [23]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
313694,He booked accommodation and a car .,He booked an accomodation and a car .,35,37,7,8
3993,I partly agree with it .,I partly agree it .,24,19,6,5
408749,I sometimes play Go .,I sometimes playing in Go .,21,27,5,6
377102,So I have to buy a new one,So I have to buy bag for school .,26,33,8,9
157179,"After I came back home , I ate lunch and I ate...","After I came back home , I ate lunch and I ate...",100,98,22,22


### Remove Small sentences



In [24]:
final_df[final_df['incorrect_char_count']<2].shape

(5, 6)

In [25]:
final_df = final_df[final_df['incorrect_char_count']>2].reset_index(drop=True)

In [26]:
final_df.shape

(496326, 6)

In [27]:
final_df[final_df['correct_char_count']<2].shape

(27, 6)

In [28]:
final_df[final_df['correct_char_count']<2].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
209276,.,There is Mt .,1,13,1,4
91009,!,Do ours best !,1,14,1,4
10489,.,took some medicine .,1,20,1,4
42876,.,At first .,1,10,1,3
398148,.,them .,1,6,1,2
164087,?,' ? ?,1,5,1,3
30259,.,you .,1,5,1,2
34956,",","Maiko Nakai ,",1,13,1,3
153477,.,For Tech support .,1,18,1,4
251777,.,out .,1,5,1,2


In [29]:
final_df = final_df[final_df['correct_char_count']>2].reset_index(drop=True)

In [30]:
final_df.shape

(496287, 6)

### Clean text

In [31]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [32]:
# https://stackoverflow.com/a/47091490/4084039
def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*\(.*?\)\s', '', text)
    text = re.sub('\s*\[.*?\]\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [33]:
final_df['correct'] = final_df['correct'].progress_apply(clean)
final_df['correct'] = final_df['correct'].progress_apply(expand_contractions)

  0%|          | 0/496287 [00:00<?, ?it/s]

  0%|          | 0/496287 [00:00<?, ?it/s]

In [34]:
final_df['incorrect'] = final_df['incorrect'].progress_apply(clean)
final_df['incorrect'] = final_df['incorrect'].progress_apply(expand_contractions)

  0%|          | 0/496287 [00:00<?, ?it/s]

  0%|          | 0/496287 [00:00<?, ?it/s]

In [35]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
245081,Korean people say that eating dogs is a cultur...,Korean people says eating dog culinary is a cu...,56,60,11,11
96659,I can sleep for as long as I want .,I can sleep whenever I want .,35,29,10,7
252856,"When I started job hunting , I was concerned .","When I started my job hunting , I was concerned .",46,49,10,11
421142,"At first , I was a worried , but I was relieve...","I was a little scary , but I was relieved to f...",95,89,23,21
449893,It was so nice .,It is so nice .,16,15,5,5


In [36]:
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [38]:
final_df.to_csv('final_df_preprocessed_2021111201.csv',index=False)