In [4]:
import pandas as pd
import numpy as np

In [5]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# Run the lines above once if the NLTK 'punkt' and 'stopwords' resources are not already downloaded.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
# Tokens to ignore when vectorizing: NLTK's English stop words followed by
# every character of `string.punctuation`.
sw = [*stopwords.words('english'), *punctuation]

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [8]:
# Read the training summaries and the training prompts; both frames use the
# `prompt_id` column as their index.
df_summaries, df_prompt = (
    pd.read_csv(csv_path, index_col="prompt_id")
    for csv_path in ("../data/summaries_train.csv", "../data/prompts_train.csv")
)

In [9]:
# Attach each prompt's columns to its summaries by joining the two frames on
# their shared `prompt_id` index (outer join: unmatched ids on either side are kept).
combine_df = df_summaries.join(
    df_prompt,
    lsuffix="prompt_id",
    how="outer",
)
combine_df.head()

Unnamed: 0_level_0,student_id,text,content,wording,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
39c16e,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
39c16e,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [10]:
def transformText(texts):
    """Return a new list holding the lower-cased version of each string in `texts`."""
    lowered = []
    for text in texts:
        lowered.append(text.lower())
    return lowered

In [11]:
def getCorr(in_text, out_text):
    """Compute token-overlap features between each prompt and its summary.

    Parameters
    ----------
    in_text : iterable of str
        Prompt ("input") texts.
    out_text : iterable of str
        Summary ("output") texts, paired positionally with `in_text`.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_pairs, 6) whose columns are, in order:
        oot - summary tokens that do not occur in the prompt,
        it  - summary tokens that occur in the prompt,
        ut  - prompt tokens that occur in the summary,
        lo  - number of summary tokens,
        li  - number of prompt tokens,
        rt  - it / lo, or 0.0 when the summary has no tokens.
    """
    result = []
    # The same prompt text is paired with many summaries; tokenize each
    # distinct prompt once and keep a set for O(1) membership tests.
    prompt_cache = {}
    for i, o in zip(in_text, out_text):
        if i not in prompt_cache:
            prompt_tokens = word_tokenize(i)
            prompt_cache[i] = (prompt_tokens, set(prompt_tokens))
        in_token, in_vocab = prompt_cache[i]

        out_token = word_tokenize(o)
        out_vocab = set(out_token)

        lo = len(out_token)  # length out text
        li = len(in_token)  # length in text
        it = sum(1 for word in out_token if word in in_vocab)  # in topic
        oot = lo - it  # out of topic: every summary token is either in or out
        ut = sum(1 for word in in_token if word in out_vocab)  # used topic
        # Guard against summaries that tokenize to nothing (empty/whitespace).
        rt = it / lo if lo else 0.0
        result.append([oot, it, ut, lo, li, rt])

    # reshape keeps the (n, 6) layout even when there are no rows at all.
    return np.array(result, dtype=float).reshape(-1, 6)

In [12]:
def getSimilarity(df):
    """Cosine similarity between each row's summary (`text`) and prompt (`input`).

    Both columns are vectorized with the module-level vectorizer `cv`, which
    must already be fitted, and the similarity is computed one row at a time.

    Returns a numpy array holding, for every row of `df`, the first row of the
    `cosine_similarity` result for that summary/prompt pair.
    """
    prompt_vectors = cv.transform(df.input.values)
    summary_vectors = cv.transform(df.text.values)
    scores = [
        cosine_similarity(summary_vectors[row], prompt_vectors[row])[0]
        for row, _ in enumerate(prompt_vectors)
    ]
    return np.array(scores)

In [13]:
def getDataset(df):
    """Build the modelling frame from a joined summaries/prompts frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `student_id`, `text`, `prompt_title`,
        `prompt_question` and `prompt_text`. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by `student_id` in which:
        - `input` is the lower-cased "title question text" of the prompt,
        - `text` is the lower-cased summary,
        - `oot`, `it`, `ut`, `lo`, `li`, `rt` are the columns returned by `getCorr`,
        - the prompt columns and `student_id` column have been dropped.
        Any other columns of `df` are carried over unchanged.
    """
    # Work on a copy so the caller's frame is not altered and the function can
    # be called again on the same input.
    out = df.copy()
    out['input'] = out.prompt_title + " " + out.prompt_question + " " + out.prompt_text
    out.index = out.student_id
    out = out.drop(columns=['prompt_title','prompt_question','prompt_text','student_id'])
    out['text'] = transformText(out.text.values)
    out['input'] = transformText(out.input.values)
    out[['oot','it','ut','lo','li','rt']] = getCorr(out.input.values, out.text.values)
    return out


In [14]:
# Replace the joined frame with the modelling frame built by getDataset
# (indexed by student_id, prompt columns merged into `input`, overlap features added).
# NOTE: `combine_df` is rebound here, so re-running this cell without re-running
# the join cell fails because the prompt columns are no longer present.
combine_df = getDataset(combine_df)

In [15]:
# Count vectorizer over 1- to 3-grams that tokenizes with NLTK's word_tokenize
# and uses the `sw` list as its stop words.
cv = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=sw,
    ngram_range=(1, 3),
)

In [16]:
# Learn the n-gram vocabulary from the prompt texts. Every row of `combine_df`
# carries a copy of its prompt in `input`, so fitting on the distinct values
# builds the same vocabulary without re-tokenizing identical documents.
cv.fit(combine_df.input.unique())



In [17]:
# Add the summary-vs-prompt cosine similarity (computed with the fitted `cv`)
# as a feature column, then preview the frame.
combine_df['similarity'] = getSimilarity(combine_df)
combine_df.head()

Unnamed: 0_level_0,text,content,wording,input,oot,it,ut,lo,li,rt,similarity
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,on tragedy summarize at least 3 elements of an...,9.0,50.0,226.0,59.0,716.0,0.847458,0.242284
0086ef22de8f,the three elements of an ideal tragedy are: h...,-0.970237,-0.417058,on tragedy summarize at least 3 elements of an...,8.0,22.0,253.0,30.0,716.0,0.733333,0.184412
0094589c7a22,aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,on tragedy summarize at least 3 elements of an...,28.0,46.0,283.0,74.0,716.0,0.621622,0.198812
00cd5736026a,one element of an ideal tragedy is having a co...,0.088882,-0.59471,on tragedy summarize at least 3 elements of an...,14.0,47.0,244.0,61.0,716.0,0.770492,0.299302
00d98b8ff756,the 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,on tragedy summarize at least 3 elements of an...,23.0,40.0,201.0,63.0,716.0,0.634921,0.160035


In [18]:
# Feature matrix and the two regression targets.
X = combine_df[['oot','ut','it','similarity','lo','li','rt']]
y = combine_df[['content','wording']]
# Hold out 20% of the rows, stratified on the prompt text in `input`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=combine_df["input"],
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5732, 7), (1433, 7), (5732, 2), (1433, 2))

In [19]:
# Preprocessing: standardize every column of the feature matrix.
numerical_pipeline = Pipeline([
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, X_train.columns),
])
# Full model: preprocessing followed by a random forest regressor. The forest
# is seeded so that repeated runs build the same trees; 42 is the same seed
# value used for the train/test split.
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestRegressor(random_state=42))
])

In [20]:
# Search space for the random forest step ("algo") of the pipeline.
# max_features is limited to the number of feature columns: a split cannot
# consider more features than exist, so larger candidates only waste search
# iterations (and, depending on the scikit-learn version, may fail to fit).
parameter = {
    'algo__max_depth' : range(1,50,2),
    'algo__max_features' : range(1, X_train.shape[1] + 1),
    'algo__min_samples_leaf' : range(1,50,2)
}
# 3-fold randomized search on all cores; random_state makes the sampled
# candidate settings repeatable between runs.
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [21]:
# Printed in order: score on the training split, score on the held-out split,
# best hyper-parameters, best cross-validation score.
print(
    model.score(X_train, y_train),
    model.score(X_test, y_test),
    model.best_params_,
    model.best_score_,
)

0.759826576682522 0.6954064631353535 {'algo__min_samples_leaf': 15, 'algo__max_features': 5, 'algo__max_depth': 23} 0.7014541456866811


In [22]:
import pickle

In [24]:
# Where the pickled model is written to and read back from (relative path).
model_path = "../model/model.pkl"

In [26]:
# Persist the best pipeline found by the search. The context manager closes
# the file handle even if pickling raises.
with open(model_path, 'wb') as model_file:
    pickle.dump(model.best_estimator_, model_file)

In [27]:
# Read the pickled estimator back and rebind `model` to it; the context
# manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [28]:
# Predict the two targets (content, wording) for the held-out split and show them.
y_pred = model.predict(X_test)
y_pred

array([[-0.86515784, -0.82672144],
       [ 1.51007204,  0.83205668],
       [ 2.21231914,  0.81687144],
       ...,
       [ 2.70530055,  2.51324033],
       [ 0.14463631,  0.45239632],
       [ 1.3992292 ,  0.04279952]])

In [29]:
from sklearn.metrics import mean_squared_error

In [30]:
# Mean squared error on the held-out split; with scikit-learn's default
# multi-output handling this is a single value averaged over both target columns.
print(mean_squared_error(y_test, y_pred))

0.3315917031574748


In [31]:
# Read the test summaries and the test prompts; both frames use the
# `prompt_id` column as their index.
df_test_summaries, df_test_prompt = (
    pd.read_csv(csv_path, index_col="prompt_id")
    for csv_path in ("../data/summaries_test.csv", "../data/prompts_test.csv")
)

In [32]:
# Attach each test prompt's columns to its summaries by joining on the shared
# `prompt_id` index (outer join: unmatched ids on either side are kept).
combine_test_df = df_test_summaries.join(
    df_test_prompt,
    lsuffix="prompt_id",
    how="outer",
)
combine_test_df.head()

Unnamed: 0_level_0,student_id,text,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abc123,000000ffffff,Example text 1,Summarize...,Example Title 1,Heading\nText...
abc123,222222cccccc,Example text 3,Summarize...,Example Title 1,Heading\nText...
def789,111111eeeeee,Example text 2,Summarize...,Example Title 2,Heading\nText...
def789,333333dddddd,Example text 4,Summarize...,Example Title 2,Heading\nText...


In [33]:
# Build the test modelling frame with the same getDataset steps used for the
# training data, then preview it.
combine_test_df = getDataset(combine_test_df)
combine_test_df.head()

Unnamed: 0_level_0,text,input,oot,it,ut,lo,li,rt
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
000000ffffff,example text 1,example title 1 summarize... heading\ntext...,0.0,3.0,3.0,3.0,8.0,1.0
222222cccccc,example text 3,example title 1 summarize... heading\ntext...,1.0,2.0,2.0,3.0,8.0,0.666667
111111eeeeee,example text 2,example title 2 summarize... heading\ntext...,0.0,3.0,3.0,3.0,8.0,1.0
333333dddddd,example text 4,example title 2 summarize... heading\ntext...,1.0,2.0,2.0,3.0,8.0,0.666667


In [34]:
# Similarity feature for the test rows. getSimilarity reads the global `cv`,
# so this uses the vocabulary that was fitted on the training prompts.
# NOTE(review): test prompts may use words outside that vocabulary — confirm
# that ignoring them is intended.
combine_test_df['similarity'] = getSimilarity(combine_test_df)
combine_test_df.head()

Unnamed: 0_level_0,text,input,oot,it,ut,lo,li,rt,similarity
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
000000ffffff,example text 1,example title 1 summarize... heading\ntext...,0.0,3.0,3.0,3.0,8.0,1.0,0.707107
222222cccccc,example text 3,example title 1 summarize... heading\ntext...,1.0,2.0,2.0,3.0,8.0,0.666667,0.5
111111eeeeee,example text 2,example title 2 summarize... heading\ntext...,0.0,3.0,3.0,3.0,8.0,1.0,0.707107
333333dddddd,example text 4,example title 2 summarize... heading\ntext...,1.0,2.0,2.0,3.0,8.0,0.666667,0.707107


In [35]:
# Predict on the test rows using the same feature columns, in the same order,
# as the training matrix `X`.
y_pred = model.predict(combine_test_df[['oot','ut','it','similarity','lo','li','rt']])
y_pred

array([[-1.27491205, -1.13601005],
       [-1.19827974, -1.02250336],
       [-1.27491205, -1.13601005],
       [-1.19827974, -1.02250336]])

In [36]:
# Tabulate the predictions: one row per index entry of the test frame, with
# the two predicted scores as columns.
save_csv = pd.DataFrame(
    data=y_pred,
    index=combine_test_df.index,
    columns=['content','wording'],
)
save_csv.head()

Unnamed: 0_level_0,content,wording
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000000ffffff,-1.274912,-1.13601
222222cccccc,-1.19828,-1.022503
111111eeeeee,-1.274912,-1.13601
333333dddddd,-1.19828,-1.022503


In [37]:
# Write the predictions to the submission file; the frame's index is written
# as the first column (pandas' default for to_csv).
save_csv.to_csv('../submission//submission.csv')