In [1]:
# Libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import dask.dataframe as dd
import time

from features import (count_awl_words, count_complex_words, count_sentences, 
                      count_syllables, count_words, count_awl_words, 
                      tokenize_essay, lemmatize_essay, remove_stop_words,
                      remove_punctuation, remove_special_words, count_characters,
                      count_dale_chall_difficult_words, count_unique_lemme, get_pos_tags,
                      count_incorrect_words, get_incorrect_words)

In [2]:
# pd.options.display.max_rows = 60
# pd.options.display.max_seq_items = 100

# # pd.reset_option("display.max_rows")
# # pd.reset_option("display.max_seq_items")

# pd.options.display.max_colwidth = None

In [3]:
# Read the tab-separated training file and discard every column that contains
# at least one missing value (dropna with axis=1 works column-wise).
df_train = pd.read_csv(
    "../data/training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1",
).dropna(axis=1)

# Working frame for feature extraction: same rows as df_train, minus the two
# per-rater score columns. drop() returns a new frame, so df_train is untouched.
# (For a quick smoke run, slice df_train with .iloc[:5] before dropping.)
df_test = df_train.drop(columns=["rater1_domain1", "rater2_domain1"])

In [4]:
# Feature-extraction pipeline.
#
# Each step derives one column (or, for the POS step, a group of columns) of
# df_test from a column produced earlier, so the steps must run in the order
# listed below. Every step is timed and reported with the same messages the
# original hand-unrolled cell printed.
#
# NOTE: steps 1 and 2 overwrite "essay" in place, so re-running this cell feeds
# already-cleaned text back into the cleaning helpers.

N_PARTITIONS = 32  # number of dask partitions used by every dask-backed step


def _apply_with_dask(frame, source, func, **apply_kwargs):
    """Apply ``func`` to each value of ``frame[source]`` through dask.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the input column.
    source : str
        Name of the column whose values are passed to ``func``.
    func : callable
        Element-wise function.
    **apply_kwargs
        Forwarded to dask's ``Series.apply`` (e.g. ``meta``).

    Returns
    -------
    pandas.Series
        Concrete result carrying the index of ``frame[source]``. The explicit
        ``.compute()`` means callers never hand a lazy dask collection to
        pandas and rely on implicit coercion to evaluate it.
    """
    lazy_column = dd.from_pandas(frame[source], npartitions=N_PARTITIONS)
    return lazy_column.apply(func, **apply_kwargs).compute()


def _column_step(target, source, func, use_dask=True):
    """Build a step that stores ``func`` applied to ``source`` in column ``target``.

    With ``use_dask=False`` the function is applied with plain pandas instead.
    The returned callable mutates the frame it receives and returns it.
    """
    def run(frame):
        if use_dask:
            frame[target] = _apply_with_dask(frame, source, func)
        else:
            frame[target] = frame[source].apply(func)
        return frame
    return run


def _pos_tag_step(frame):
    """Join one column per POS tag onto ``frame`` and return the new frame.

    ``get_pos_tags`` is applied to "essay_tokens" and its per-essay results are
    expanded into columns with ``pd.json_normalize`` (presumably a mapping of
    tag -> count per essay; verify against ``features.get_pos_tags``).
    """
    tags = _apply_with_dask(
        frame, "essay_tokens", get_pos_tags, meta=pd.Series(dtype="object")
    )
    pos_counts = pd.json_normalize(tags.tolist())
    # json_normalize returns a fresh 0..n-1 index; restore the essay index so the
    # join lines rows up even when the frame was sliced or filtered beforehand.
    pos_counts.index = tags.index
    # Columns left over from a previous run of this cell would make join() raise
    # on overlapping names; replace them instead.
    stale = frame.columns.intersection(pos_counts.columns)
    return frame.drop(columns=stale).join(pos_counts, how="left")


pipeline_steps = [
    _column_step("essay", "essay", remove_special_words),
    _column_step("essay", "essay", remove_punctuation),
    _column_step("essay_tokens", "essay", tokenize_essay),
    _column_step("count_words", "essay_tokens", len),
    _column_step("count_awl_words", "essay_tokens", count_awl_words),
    _column_step("essay_tokens_wo_stopwords", "essay_tokens", remove_stop_words),
    _column_step("lemmatized_essay", "essay", lemmatize_essay),
    _column_step("count_complex_words", "lemmatized_essay", count_complex_words),
    _column_step("count_characters", "essay", count_characters),
    _column_step("dale_chall_words", "essay_tokens", count_dale_chall_difficult_words),
    _column_step("count_unique_lemme", "essay_tokens", count_unique_lemme),
    _pos_tag_step,
    # The last three steps ran on plain pandas in the original cell (the dask
    # frames it built there were never used), so they stay on pandas here.
    _column_step("count_incorrect_words", "essay_tokens", count_incorrect_words, use_dask=False),
    # NOTE(review): "essay" has already been rewritten by remove_punctuation at
    # this point; confirm count_sentences still sees sentence terminators.
    _column_step("count_sentences", "essay", count_sentences, use_dask=False),
    _column_step("count_syllables", "essay_tokens", count_syllables, use_dask=False),
]

# Derived from the step list so the "n/total" progress label cannot drift.
total_steps = len(pipeline_steps)

pipeline_start = time.time()

for step_no, run_step in enumerate(pipeline_steps, start=1):
    start = time.time()
    print(f"step {step_no}/{total_steps}")

    df_test = run_step(df_test)

    end = time.time()
    print(f"step {step_no} completed in ", end - start, " seconds")

pipeline_end = time.time()
print(f"execution of pipeline took {pipeline_end - pipeline_start} seconds")

df_test

step 1/13
step 1 completed in  0.19565224647521973  seconds
step 2/13
step 2 completed in  0.8180291652679443  seconds
step 3/13
step 3 completed in  1.3315224647521973  seconds
step 4/13
step 4 completed in  0.722409725189209  seconds
step 5/13
step 5 completed in  1.0251719951629639  seconds
step 6/13
step 6 completed in  1.0667128562927246  seconds
step 7/13
step 7 completed in  71.39221620559692  seconds
step 8/13
step 8 completed in  64.4795835018158  seconds
step 9/13
step 9 completed in  2.0892388820648193  seconds
step 10/13
step 10 completed in  2.2550771236419678  seconds
step 11/13
step 11 completed in  2.144925832748413  seconds
step 12/13


In [None]:
# jsoned = get_pos_tags(df_test["essay_tokens"][0])

# df = pd.json_normalize(jsoned)
# df

In [None]:
# df = df_test["essay_tokens"].apply(get_pos_tags)
# df = pd.json_normalize(df)
# df

# df_test = df_test.join(df)
# df_test

In [None]:
# df_test["count_incorrect_words"] = df_test["essay_tokens"].apply(count_incorrect_words)
# df_test["get_incorrect_words"] = df_test["essay_tokens"].apply(get_incorrect_words)

# df_test

In [None]:
# Discard the raw text, the essay/set identifier columns and the intermediate
# token / lemma columns; what is left is displayed below.
df_test = df_test.drop(
    [
        "essay",
        "essay_id",
        "essay_set",
        "essay_tokens",
        "essay_tokens_wo_stopwords",
        "lemmatized_essay",
    ],
    axis="columns",
)
df_test

Unnamed: 0,domain1_score,count_words,count_awl_words,count_complex_words,count_characters,dale_chall_words,count_unique_lemme,CC,CD,DT,...,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,count_incorrect_words
0,8,340,35,260,1442,73,162,14,0,20,...,2,15,6,17,10,0,1,0,4,11
1,9,409,47,317,1765,107,186,18,5,34,...,5,20,6,21,6,3,0,0,9,13
2,7,272,49,212,1185,71,142,16,2,27,...,0,9,0,25,5,0,5,0,3,1
3,10,484,85,393,2275,144,226,17,0,45,...,17,5,9,21,11,1,3,0,4,21
4,8,462,48,355,2023,88,196,15,5,54,...,2,6,2,26,16,3,1,0,5,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,35,792,44,594,3118,138,306,54,11,61,...,46,8,11,34,19,3,4,0,8,1
12972,32,514,25,385,1971,93,203,26,4,44,...,42,9,5,19,6,2,0,0,3,9
12973,40,761,66,583,3235,167,341,35,5,86,...,69,24,26,9,12,9,4,0,4,12
12974,40,543,38,399,2181,141,234,22,4,45,...,44,13,15,9,9,1,6,0,7,2


In [None]:
# Display the column labels currently present in df_test (sanity check of the
# feature set before splitting and modelling).
df_test.columns

Index(['domain1_score', 'count_words', 'count_awl_words',
       'count_complex_words', 'count_characters', 'dale_chall_words',
       'count_unique_lemme', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR',
       'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP',
       'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
       'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
       'count_incorrect_words'],
      dtype='object')

In [None]:
# Hold out 20% of the engineered rows for evaluation. A fixed random_state pins
# the shuffle, so the split (and the score reported below) is reproducible
# across kernel restarts.
# NOTE: this rebinds both names. df_train stops referring to the frame loaded
# from the TSV, and df_test becomes the 20% hold-out; re-running the cell would
# split the hold-out again.
df_train, df_test = train_test_split(df_test, test_size=0.2, random_state=42)

In [None]:
# Columns fed to the model, in the order they appear in the feature matrix:
# six lexical counts, the POS-tag columns, then the misspelling count.
feature_columns = [
    "count_words", "count_awl_words", "count_complex_words",
    "count_characters", "dale_chall_words", "count_unique_lemme",
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS", "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    "count_incorrect_words",
]

# Training design matrix and target vector as NumPy arrays.
X_train = df_train[feature_columns].to_numpy()
Y_train = df_train["domain1_score"].to_numpy()

In [None]:
# Columns fed to the model, in the order they appear in the feature matrix:
# six lexical counts, the POS-tag columns, then the misspelling count.
feature_columns = [
    "count_words", "count_awl_words", "count_complex_words",
    "count_characters", "dale_chall_words", "count_unique_lemme",
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS", "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    "count_incorrect_words",
]

# Hold-out design matrix and target vector as NumPy arrays.
X_test = df_test[feature_columns].to_numpy()
Y_test = df_test["domain1_score"].to_numpy()

In [None]:
# Fit a linear-regression baseline on the training split (fit() returns the
# estimator itself), then print its score on the hold-out split.
regr = LinearRegression().fit(X_train, Y_train)
holdout_score = regr.score(X_test, Y_test)
print(holdout_score)

0.7380748541915985
