In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, BigBirdTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm import tqdm
import textstat
import torch
import numpy as np
import catboost

In [2]:
prompts_train = pd.read_csv('./data/prompts_train.csv')
summaries_train = pd.read_csv('./data/summaries_train.csv')
prompts_test = pd.read_csv('./data/prompts_test.csv')
summaries_test = pd.read_csv('./data/summaries_test.csv')

In [3]:
summaries_train['word_counts'] = summaries_train['text'].apply(lambda x : len(set(x.lower().split(' '))))
summaries_train['smog_index'] = summaries_train['text'].apply(lambda x : textstat.smog_index(x))
prompts_train['prompt_text'] = prompts_train['prompt_text'].str.replace('\r','')
prompts_train['prompt_text'] = prompts_train['prompt_text'].str.replace('\n','')

In [4]:
summaries_test['word_counts'] = summaries_test['text'].apply(lambda x : len(set(x.lower().split(' '))))
summaries_test['smog_index'] = summaries_test['text'].apply(lambda x : textstat.smog_index(x))
prompts_test['prompt_text'] = prompts_test['prompt_text'].str.replace('\r','')
prompts_test['prompt_text'] = prompts_test['prompt_text'].str.replace('\n','')

In [None]:
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
embedding_model = AutoModel.from_pretrained('allenai/longformer-base-4096')
embedding_model.cuda()
embedding_model.eval()

In [6]:
def prompt_embedding(embedding_model, prompt):
    encoded_prompt = tokenizer(prompt, max_length=2048, return_tensors = 'pt')
    encoded_prompt = {i: v.to("cuda") for i, v in encoded_prompt.items()}
    with torch.no_grad():
        prompt_output = embedding_model(**encoded_prompt)
    embedding = prompt_output.pooler_output
    return embedding

In [7]:
embeddings = []
for prompt in prompts_train['prompt_text']:
    embedding = prompt_embedding(embedding_model,prompt)
    embeddings += embedding.detach().cpu().tolist()
prompts_train['embeded_prompt_text'] = embeddings

embeddings = []
for prompt in prompts_test['prompt_text']:
    embedding = prompt_embedding(embedding_model,prompt)
    embeddings += embedding.detach().cpu().tolist()
prompts_test['embeded_prompt_text'] = embeddings

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
def filter_(text,prompt):
    count = 0
    for num in text:
        if num in prompt:
            count += 1
    return count

In [9]:
encoded_prompts = []
for prompt_text in prompts_train['prompt_text']:
    encoded_prompts.append(tokenizer.encode(prompt_text))
prompts_train['encoded_prompt'] = encoded_prompts

counts_duplicate = []
own_unique_words = []
for i in range(len(summaries_train)):
    student_text = summaries_train.iloc[i]['text']
    prompt_id = summaries_train.iloc[i]['prompt_id']
    encoded_prompt = prompts_train[prompts_train['prompt_id'] == prompt_id]['encoded_prompt'].tolist()[0]
    encoded_text = tokenizer.encode(student_text)
    count_duplicate = len(set(encoded_text) & set(encoded_prompt))
    counts_duplicate.append(count_duplicate)
    own_unique_words.append(len(set(encoded_text) - set(encoded_prompt)))
summaries_train['counts_duplicate'] = counts_duplicate
summaries_train['own_unique_words'] = own_unique_words

In [10]:
encoded_prompts = []
for prompt_text in prompts_test['prompt_text']:
    encoded_prompts.append(tokenizer.encode(prompt_text))
prompts_test['encoded_prompt'] = encoded_prompts

counts_duplicate = []
own_unique_words = []
for i in range(len(summaries_test)):
    student_text = summaries_test.iloc[i]['text']
    prompt_id = summaries_test.iloc[i]['prompt_id']
    encoded_prompt = prompts_test[prompts_test['prompt_id'] == prompt_id]['encoded_prompt'].tolist()[0]
    encoded_text = tokenizer.encode(student_text)
    count_duplicate = len(set(encoded_text) & set(encoded_prompt))
    counts_duplicate.append(count_duplicate)
    own_unique_words.append(len(set(encoded_text) - set(encoded_prompt)))
summaries_test['counts_duplicate'] = counts_duplicate
summaries_test['own_unique_words'] = own_unique_words

In [11]:
cos_sim = []
for i in tqdm(range(len(summaries_train))):
    text_1 = summaries_train['text'][i]
    prompt_id = summaries_train['prompt_id'][i]
    prompt_output = prompts_train[prompts_train['prompt_id'] == prompt_id]['embeded_prompt_text']
    input_text = tokenizer(text_1,max_length=1024,return_tensors='pt')
    input_text = {i: v.to("cuda") for i, v in input_text.items()}
    with torch.no_grad():
        text_output = embedding_model(**input_text)
    text_output = text_output.pooler_output
    cos_sim.append(cosine_similarity(text_output[0].detach().cpu().numpy().reshape(1,-1),np.array(prompt_output.values[0]).reshape(1,-1)))
scaler =  RobustScaler()  #MinMaxScaler
summaries_train['cos_sim'] = np.array(cos_sim).flatten()
summaries_train[['word_counts','smog_index','cos_sim','counts_duplicate', 'own_unique_words']] = scaler.fit_transform(summaries_train[['word_counts','smog_index','cos_sim','counts_duplicate', 'own_unique_words']])

100%|██████████| 7165/7165 [04:16<00:00, 27.94it/s]


In [12]:
cos_sim_test = []
for i in tqdm(range(len(summaries_test))):
    text_1 = summaries_test['text'][i]
    prompt_id = summaries_test['prompt_id'][i]
    prompt_output = prompts_test[prompts_test['prompt_id'] == prompt_id]['embeded_prompt_text']
    input_text = tokenizer(text_1,max_length=1024,return_tensors='pt')
    input_text = {i: v.to("cuda") for i, v in input_text.items()}
    with torch.no_grad():
        text_output = embedding_model(**input_text)
    text_output = text_output.pooler_output
    cos_sim_test.append(cosine_similarity(text_output[0].detach().cpu().numpy().reshape(1,-1),np.array(prompt_output.values[0]).reshape(1,-1)))
scaler = RobustScaler()
summaries_test['cos_sim'] = np.array(cos_sim_test).flatten()
summaries_test[['word_counts','smog_index','cos_sim','counts_duplicate', 'own_unique_words']] = scaler.fit_transform(summaries_test[['word_counts','smog_index','cos_sim','counts_duplicate', 'own_unique_words']])

100%|██████████| 4/4 [00:00<00:00, 26.50it/s]


In [63]:
X = summaries_train[['word_counts','smog_index','counts_duplicate','cos_sim']]  # ,'own_unique_words'
y_content = summaries_train[['content']]
y_wording = summaries_train[['wording']]
test_data = summaries_test[['word_counts','smog_index','counts_duplicate','cos_sim']] # 'cos_sim','own_unique_words'

In [None]:
import matplotlib.pyplot as plt
plt.scatter(X['own_unique_words'], y_wording)
plt.show()

In [56]:
import scipy.stats as stats
stats.pearsonr(X['own_unique_words'], y_wording)

PearsonRResult(statistic=array([0.6529413299422794], dtype=object), pvalue=0.0)

In [65]:
train_x,valid_x,train_y,valid_y = train_test_split(X,y_content,test_size=0.1,random_state=42)

In [66]:
model = catboost.CatBoostRegressor(random_state=42, max_depth=5,learning_rate=0.05,objective = 'RMSE',verbose = 0)
model.fit(train_x,train_y,eval_set=(valid_x,valid_y))
print(model.best_score_)
pred_content = model.predict(test_data)

{'learn': {'RMSE': 0.4041888363776972}, 'validation': {'RMSE': 0.4663679936477498}}


In [67]:
train_x,valid_x,train_y,valid_y = train_test_split(X,y_wording,test_size=0.1,random_state=42)

In [68]:
model = catboost.CatBoostRegressor(random_state=42, max_depth=5,learning_rate=0.05,objective = 'RMSE',verbose = 0)
model.fit(train_x,train_y,eval_set=(valid_x,valid_y))
print(model.best_score_)
pred_wording = model.predict(test_data)

{'learn': {'RMSE': 0.5436268667686762}, 'validation': {'RMSE': 0.6161636410760295}}


In [35]:
submission = pd.read_csv('./data/sample_submission.csv')

In [49]:
submission['content'] = pred_content
submission['wording'] = pred_wording

In [50]:
submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.056918,-0.518662
1,111111eeeeee,-1.129924,-0.675647
2,222222cccccc,-1.229725,-0.836996
3,333333dddddd,-1.155395,-0.693272
