In [40]:
import pandas as pd
import numpy as np

In [41]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# run only if not have that

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
sw = stopwords.words('english') + list(punctuation)

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [44]:
cv = CountVectorizer(tokenizer=word_tokenize, ngram_range=(1,3))

In [45]:
df_summaries = pd.read_csv("../data/summaries_train.csv", index_col="prompt_id")
df_prompt = pd.read_csv("../data/prompts_train.csv", index_col="prompt_id")

In [46]:
df_prompt.head()

Unnamed: 0_level_0,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [47]:
df_summaries.head()

Unnamed: 0_level_0,student_id,text,content,wording
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
814d6b,000e8c3c7ddb,The third wave was an experimentto see how peo...,0.205683,0.380538
ebad26,0020ae56ffbf,They would rub it up with soda to make the sme...,-0.548304,0.506755
3b9047,004e978e639e,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3b9047,005ab0199905,The highest class was Pharaohs these people we...,-0.210614,-0.471415
814d6b,0070c9e7af47,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [48]:
combine_df = df_summaries.join(df_prompt, how="outer",lsuffix="prompt_id")
combine_df.head()

Unnamed: 0_level_0,student_id,text,content,wording,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
39c16e,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [49]:
def transformText(texts):
   lowered = [text.lower() for text in texts]
   return lowered

In [50]:
def getCorr(in_text, out_text):
    result = []
    for i, o in zip(in_text, out_text):
        out_token = word_tokenize(o)
        in_token = word_tokenize(i)
        oot = len([word for word in out_token if word not in in_token]) # out of topic
        it = len([word for word in out_token if word in in_token]) # in topic
        ut = len([word for word in in_token if word in out_token]) # used topic
        lo = len(out_token) # length out text
        result.append([oot, it, ut, lo])

    return np.array(result)

In [51]:
def getSimilarity(df):
    input_t = cv.transform(df.input.values)
    output_t = cv.transform(df.text.values)
    data = []
    for i,_ in enumerate(input_t):
        data.append(cosine_similarity(output_t[i],input_t[i])[0])
    return np.array(data)

In [52]:
def getDataset(df):
    df['input'] = df.prompt_title + " " + df.prompt_question + " " +df.prompt_text
    df.index = df.student_id
    df.drop(columns=['prompt_title','prompt_question','prompt_text','student_id'], inplace=True)
    df['text'] = transformText(df.text.values)
    df['input'] = transformText(df.input.values)
    df[['oot','it','ut','lo']] = getCorr(df.input.values, df.text.values)
    return df


In [53]:
combine_df = getDataset(combine_df)

In [54]:
cv.fit(combine_df.input)



In [55]:
combine_df['similarity'] = getSimilarity(combine_df)
combine_df.head()

Unnamed: 0_level_0,text,content,wording,input,oot,it,ut,lo,similarity
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,on tragedy summarize at least 3 elements of an...,9,50,226,59,0.316549
0086ef22de8f,the three elements of an ideal tragedy are: h...,-0.970237,-0.417058,on tragedy summarize at least 3 elements of an...,8,22,253,30,0.499286
0094589c7a22,aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,on tragedy summarize at least 3 elements of an...,28,46,283,74,0.407084
00cd5736026a,one element of an ideal tragedy is having a co...,0.088882,-0.59471,on tragedy summarize at least 3 elements of an...,14,47,244,61,0.439741
00d98b8ff756,the 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,on tragedy summarize at least 3 elements of an...,23,40,201,63,0.438908


In [56]:
combine_df[['oot','ut','it','similarity','lo','content','wording']].corr()

Unnamed: 0,oot,ut,it,similarity,lo,content,wording
oot,1.0,0.21893,0.644359,0.489355,0.775695,0.684346,0.652487
ut,0.21893,1.0,0.661555,0.523838,0.599368,0.481641,0.208865
it,0.644359,0.661555,1.0,0.699401,0.982449,0.755106,0.445558
similarity,0.489355,0.523838,0.699401,1.0,0.696563,0.671261,0.474167
lo,0.775695,0.599368,0.982449,0.696563,1.0,0.790097,0.526863
content,0.684346,0.481641,0.755106,0.671261,0.790097,1.0,0.75138
wording,0.652487,0.208865,0.445558,0.474167,0.526863,0.75138,1.0


In [58]:
X = combine_df[['oot','ut','it','similarity','lo']]
y = combine_df[['content','wording']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=combine_df.input)
X_train.shape, X_test.shape, y_train.shape, y_test.shape   

((5732, 5), (1433, 5), (5732, 2), (1433, 2))

In [59]:
numerical_pipeline = Pipeline([
    ("scaler", RobustScaler())
])
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, X_train.columns),
])
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", MultiOutputRegressor(GradientBoostingRegressor(random_state=42)))
])

In [60]:
parameter = {
    'algo__estimator__max_depth' : range(1,20,2),
    'algo__estimator__max_features' : range(1,20,2),
    'algo__estimator__min_samples_leaf' : range(1,20,2)
}
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, n_iter=50)
model.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [61]:
print(model.score(X_train, y_train), model.score(X_test, y_test), model.best_params_, model.best_score_)

0.7228753745062919 0.6823442439217134 {'algo__estimator__min_samples_leaf': 1, 'algo__estimator__max_features': 9, 'algo__estimator__max_depth': 3} 0.6783160058369884


In [62]:
import pickle

In [63]:
model_path = "../model/model.pkl"

In [64]:
pickle.dump(model.best_estimator_, open(model_path, 'wb'))

In [65]:
model = pickle.load(open(model_path, 'rb'))

In [66]:
y_pred = model.predict(X_test)
y_pred

array([[-0.96262197, -0.79647136],
       [ 1.59599608,  0.61967018],
       [ 2.26775384,  0.94151328],
       ...,
       [ 2.51697993,  2.51455478],
       [ 0.5066772 ,  0.77022693],
       [ 1.69036458,  0.12481981]])

In [67]:
from sklearn.metrics import mean_squared_error

In [68]:
print(mean_squared_error(y_test, y_pred))

0.3456740624784789


In [69]:
df_test_summaries = pd.read_csv("../data/summaries_test.csv", index_col="prompt_id")
df_test_prompt = pd.read_csv("../data/prompts_test.csv", index_col="prompt_id")

In [70]:
combine_test_df = df_test_summaries.join(df_test_prompt, how="outer",lsuffix="prompt_id")
combine_test_df.head()

Unnamed: 0_level_0,student_id,text,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abc123,000000ffffff,Example text 1,Summarize...,Example Title 1,Heading\nText...
abc123,222222cccccc,Example text 3,Summarize...,Example Title 1,Heading\nText...
def789,111111eeeeee,Example text 2,Summarize...,Example Title 2,Heading\nText...
def789,333333dddddd,Example text 4,Summarize...,Example Title 2,Heading\nText...


In [71]:
combine_test_df = getDataset(combine_test_df)
combine_test_df.head()

Unnamed: 0_level_0,text,input,oot,it,ut,lo
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000000ffffff,example text 1,example title 1 summarize... heading\ntext...,0,3,3,3
222222cccccc,example text 3,example title 1 summarize... heading\ntext...,1,2,2,3
111111eeeeee,example text 2,example title 2 summarize... heading\ntext...,0,3,3,3
333333dddddd,example text 4,example title 2 summarize... heading\ntext...,1,2,2,3


In [72]:
combine_test_df['similarity'] = getSimilarity(combine_test_df)
combine_test_df.head()

Unnamed: 0_level_0,text,input,oot,it,ut,lo,similarity
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000000ffffff,example text 1,example title 1 summarize... heading\ntext...,0,3,3,3,0.707107
222222cccccc,example text 3,example title 1 summarize... heading\ntext...,1,2,2,3,0.5
111111eeeeee,example text 2,example title 2 summarize... heading\ntext...,0,3,3,3,0.707107
333333dddddd,example text 4,example title 2 summarize... heading\ntext...,1,2,2,3,0.707107


In [73]:
y_pred = model.predict(combine_test_df[['oot','ut','it','similarity','lo']])
y_pred

array([[-1.75094255, -0.70150278],
       [-1.37444353, -1.41328988],
       [-1.75094255, -0.70150278],
       [-1.36502467, -0.89419612]])

In [74]:
save_csv = pd.DataFrame(y_pred, columns=['content','wording'], index=combine_test_df.index)
save_csv.head()

Unnamed: 0_level_0,content,wording
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000000ffffff,-1.750943,-0.701503
222222cccccc,-1.374444,-1.41329
111111eeeeee,-1.750943,-0.701503
333333dddddd,-1.365025,-0.894196


In [75]:
save_csv.to_csv('../submission/submission.csv')