In [1]:
!pip install /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
!pip install /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl

Processing /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.9.3
Processing /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.0
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [2]:
import re
from sklearn import metrics
from sklearn.model_selection import train_test_split
import textstat
import catboost
import lightgbm
import spacy
import nltk
from spellchecker import SpellChecker

import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
from sklearn.model_selection import StratifiedKFold
import time
import gc
import random
import warnings
warnings.filterwarnings("ignore")
torch.backends.cudnn.benchmark=False
torch.backends.cudnn.deterministic=True

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Directory that receives the files written by this notebook.
OUTPUT_DIR = './'
# exist_ok=True makes the creation idempotent and removes the window between a
# separate existence check and the mkdir call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
class cfg:
    """Namespace of constants shared by the cells of this notebook.

    A ``tokenizer`` attribute is attached to this class later, once the
    tokenizer has been loaded.
    """

    # Backbone checkpoint: size suffix, short name and local directory.
    select = 'base'
    only_model_name = f'deberta-v3-{select}'
    model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'

    # Reproducibility.
    seed = 42

    # Tokenisation and batching, read by the dataset classes and data loaders.
    max_len = 512
    batch_size = 8

    # Dropout values written into the model config before the model is built.
    hidden_dropout_prob = 0.005
    attention_probs_dropout_prob = 0.005

    # Not read by any cell visible in this notebook; presumably left over from
    # the training run - TODO confirm before relying on them.
    fold = 4
    num_epoch = 5
    accum_iter = 4
    lr = 5e-5

def seed_everything(seed: int):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Value applied to Python's ``random``, NumPy and PyTorch (CPU and
            all CUDA devices).
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, so it
    # has to stay off for reproducible results.
    torch.backends.cudnn.benchmark = False
    
    
# Seed all random number generators with the value configured in cfg.
seed_everything(seed=cfg.seed)

In [5]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of targets, one column per target.
        y_preds: 2-D array-like of predictions with the same shape.

    Returns:
        Tuple ``(mcrmse_score, scores)``: the mean of the per-column RMSE
        values and the list of per-column RMSE values in column order.
    """
    # RMSE is computed with NumPy rather than sklearn's mean_squared_error,
    # which this notebook never imports.
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    scores = []
    for i in range(y_trues.shape[1]):
        error = y_trues[:, i] - y_preds[:, i]
        scores.append(float(np.sqrt(np.mean(error ** 2))))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def score_loss(y_trues, y_preds):
    """Return the MCRMSE results as a labelled dictionary.

    The overall score is stored under 'mcrmse_score'; the RMSE of column 0 is
    labelled 'Content_score' and the RMSE of column 1 'Wording_score'.
    """
    overall, per_column = MCRMSE(y_trues, y_preds)
    report = {'mcrmse_score' : overall}
    report['Content_score'] = per_column[0]
    report['Wording_score'] = per_column[1]
    return report

In [6]:
# Load the tokenizer from the checkpoint directory named by cfg.model_name.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
# Stored on cfg because the dataset classes read it as cfg.tokenizer.
cfg.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Training tables: the prompts and the student summaries written for them
# (the two are joined on 'prompt_id' further down).
prompts_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
summaries_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')

# Test tables, read from the same competition directory.
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summaries_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [8]:
# SMOG readability score of every summary, taken from the text as written
# (this runs before the lower-casing below).
for summaries in (summaries_train, summaries_test):
    summaries['smog_index'] = summaries['text'].apply(lambda raw : textstat.smog_index(raw))

# Lower-case the prompt passages and the summaries.
for frame, column in (
    (prompts_train, 'prompt_text'),
    (prompts_test, 'prompt_text'),
    (summaries_train, 'text'),
    (summaries_test, 'text'),
):
    frame[column] = frame[column].apply(lambda value : value.lower())

In [9]:
# Number of distinct space-separated tokens in each summary.
summaries_train['word_counts'] = summaries_train['text'].apply(lambda x : len(set(x.split(' '))))
summaries_test['word_counts'] = summaries_test['text'].apply(lambda x : len(set(x.split(' '))))

# Strip carriage returns and newlines from the prompt passages. The test
# prompts receive exactly the same clean-up as the training prompts so that
# everything derived from 'prompt_text' is computed identically on both.
# NOTE: the characters are removed, not replaced by a space, so the text on
# either side of a line break ends up joined.
prompts_train['prompt_text'] = (
    prompts_train['prompt_text']
    .str.replace('\r', '', regex=False)
    .str.replace('\n', '', regex=False)
)
prompts_test['prompt_text'] = (
    prompts_test['prompt_text']
    .str.replace('\r', '', regex=False)
    .str.replace('\n', '', regex=False)
)

In [10]:
# Two punctuation-based features per summary:
#   sentence_counts - number of distinct fragments obtained by splitting on '.'
#   syntax_count    - total number of ',', '-', ';' and ':' characters
for summaries in (summaries_train, summaries_test):
    summaries['sentence_counts'] = summaries['text'].apply(
        lambda body : len(set(body.split('.')))
    )
    summaries["syntax_count"] = summaries['text'].apply(
        lambda body: sum(body.count(mark) for mark in (",", "-", ";", ":"))
    )

In [11]:
# Module-level list; the two functions below build their own local lists and
# never read this one.
text_no_stopwords = []
# English stop words from NLTK plus the punctuation tokens that should be
# dropped together with them.
stop_words = set(nltk.corpus.stopwords.words('english')) | {
    '.', ',', '”', '“', ';', ':', '?', "''", "``",
}
def prompt_no_stopwords(df):
    """Tokenize every prompt passage and drop the tokens listed in `stop_words`.

    Adds a 'prompt_no_stop_words' column (one token list per row) to `df`
    in place and returns the same frame.
    """
    df['prompt_no_stop_words'] = [
        [tok for tok in nltk.word_tokenize(passage) if tok not in stop_words]
        for passage in df['prompt_text']
    ]
    return df
def summaries_no_stopwords(df):
    """Tokenize every summary and drop the tokens listed in `stop_words`.

    Adds a 'summaries_no_stop_words' column (one token list per row) to `df`
    in place and returns the same frame.
    """
    df['summaries_no_stop_words'] = [
        [tok for tok in nltk.word_tokenize(summary) if tok not in stop_words]
        for summary in df['text']
    ]
    return df

In [12]:
# Attach the stop-word-free token lists to all four frames. Each function
# mutates the frame it receives and returns that same frame.
prompts_train = prompt_no_stopwords(prompts_train)
prompts_test = prompt_no_stopwords(prompts_test)
summaries_train = summaries_no_stopwords(summaries_train)
summaries_test = summaries_no_stopwords(summaries_test)

In [13]:
# Despite the column name, this counts the distinct tokens that remain AFTER
# stop words were removed (it is computed from 'summaries_no_stop_words').
for summaries in (summaries_train, summaries_test):
    summaries['word_counts_with_stopword'] = summaries['summaries_no_stop_words'].apply(
        lambda tokens : len(set(tokens))
    )

In [14]:
# Shared spell checker instance used by summaries_spell_check.
spell = SpellChecker()
def summaries_spell_check(df):
    """Collect the tokens of every summary that the spell checker does not know.

    Each summary is tokenized with NLTK; of the tokens reported by
    ``spell.unknown`` only the purely alphabetic ones are kept. The resulting
    lists are stored in a new 'mis_tokens' column of `df` (modified in place),
    and the same frame is returned.
    """
    df['mis_tokens'] = [
        [word for word in spell.unknown(nltk.word_tokenize(summary)) if word.isalpha()]
        for summary in df['text']
    ]
    return df
# Add the 'mis_tokens' column to both summary frames.
summaries_train = summaries_spell_check(summaries_train)
summaries_test = summaries_spell_check(summaries_test)

In [15]:
# Number of distinct unknown (presumably misspelled) tokens per summary.
for summaries in (summaries_train, summaries_test):
    summaries['counts_mis_token'] = summaries['mis_tokens'].apply(
        lambda unknown : len(set(unknown))
    )

In [16]:
# Token vocabulary of every training prompt, built once instead of filtering
# prompts_train again for every summary row.
train_prompt_vocab = {}
for known_id, prompt_tokens in zip(prompts_train['prompt_id'], prompts_train['prompt_no_stop_words']):
    # setdefault keeps the first row should a prompt_id ever be repeated.
    train_prompt_vocab.setdefault(known_id, set(prompt_tokens))

counts_duplicate = []   # distinct summary tokens that also occur in the prompt
own_unique_words = []   # distinct summary tokens that do not occur in the prompt

for prompt_id, student_text in zip(summaries_train['prompt_id'], summaries_train['summaries_no_stop_words']):
    student_vocab = set(student_text)
    # Raises KeyError when a summary refers to a prompt that is not in prompts_train.
    splited_prompt = train_prompt_vocab[prompt_id]
    counts_duplicate.append(len(student_vocab & splited_prompt))
    own_unique_words.append(len(student_vocab - splited_prompt))
summaries_train['counts_duplicate'] = counts_duplicate
summaries_train['own_unique_words'] = own_unique_words

In [17]:
# Token vocabulary of every test prompt, built once instead of filtering
# prompts_test again for every summary row.
test_prompt_vocab = {}
for known_id, prompt_tokens in zip(prompts_test['prompt_id'], prompts_test['prompt_no_stop_words']):
    # setdefault keeps the first row should a prompt_id ever be repeated.
    test_prompt_vocab.setdefault(known_id, set(prompt_tokens))

counts_duplicate = []   # distinct summary tokens that also occur in the prompt
own_unique_words = []   # distinct summary tokens that do not occur in the prompt

for prompt_id, student_text in zip(summaries_test['prompt_id'], summaries_test['summaries_no_stop_words']):
    student_vocab = set(student_text)
    # Raises KeyError when a summary refers to a prompt that is not in prompts_test.
    splited_prompt = test_prompt_vocab[prompt_id]
    counts_duplicate.append(len(student_vocab & splited_prompt))
    own_unique_words.append(len(student_vocab - splited_prompt))
summaries_test['counts_duplicate'] = counts_duplicate
summaries_test['own_unique_words'] = own_unique_words

In [18]:
# Attach the prompt columns to every summary row (left join keeps all summaries).
train = summaries_train.merge(prompts_train, on="prompt_id", how='left')
test = summaries_test.merge(prompts_test, on="prompt_id", how='left')

# One fold per prompt: prompts are numbered in order of first appearance and
# every training row gets the number of its prompt as its fold.
prompt_dict = {
    prompt : fold_no
    for fold_no, prompt in enumerate(train['prompt_id'].unique())
}
train['fold'] = -1
train['fold'] = train['prompt_id'].apply(lambda prompt : prompt_dict[prompt])

In [19]:
class CustomDataset(torch.utils.data.Dataset):
    """Labelled dataset used for training and validation.

    Every item is a tuple ``(features, target, student_id)``:

    * ``features`` - dict with 'input_ids' and 'attention_mask' long tensors,
      padded/truncated to ``cfg.max_len`` tokens;
    * ``target`` - float tensor holding the row's 'content' and 'wording' values;
    * ``student_id`` - the row's 'student_id' value.

    Args:
        df: Frame providing the columns 'prompt_text', 'prompt_question',
            'prompt_title', 'text', 'content', 'wording' and 'student_id'.
    """

    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        # Prompt passage and title are kept as attributes but are not part of
        # the text that is fed to the model.
        self.fp = df['prompt_text'].values
        self.title = df['prompt_title'].values
        self.pq = df['prompt_question'].values
        self.text = df['text'].values
        self.targets = df[['content','wording']].values
        self.id = df['student_id'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Model input: prompt question, separator token, student summary.
        full_text = self.pq[index] + self.tokenizer.sep_token + self.text[index]

        inputs = self.tokenizer.encode_plus(
            full_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )

        features = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        target = torch.tensor(self.targets[index], dtype=torch.float)
        return features, target, self.id[index]

def collate(inputs):
    """Cut every tensor of a batch down to the longest attended sequence.

    The number of columns kept is the largest row sum of
    ``inputs["attention_mask"]``. The dict is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [20]:
class TestDataset(torch.utils.data.Dataset):
    """Unlabelled dataset used for inference.

    Every item is a dict with 'input_ids' and 'attention_mask' long tensors,
    padded/truncated to ``cfg.max_len`` tokens.

    Args:
        df: Frame providing the columns 'prompt_text', 'prompt_question',
            'prompt_title' and 'text'.
    """

    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        # Prompt passage and title are kept as attributes but are not part of
        # the text that is fed to the model.
        self.fp = df['prompt_text'].values
        self.title = df['prompt_title'].values
        self.pq = df['prompt_question'].values
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Model input: prompt question, separator token, student summary.
        full_text = self.pq[index] + self.tokenizer.sep_token + self.text[index]

        inputs = self.tokenizer.encode_plus(
            full_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

In [21]:
def validation(model, val_loader):
    """Evaluate `model` on a labelled loader.

    Args:
        model: Model that accepts the batch dict plus ``labels`` and returns an
            object exposing ``.logits`` and ``.loss``.
        val_loader: Iterable yielding ``(batch, labels, student_ids)`` tuples.

    Returns:
        Tuple ``(_val_loss, scores, pred_list_re, label_list_re, id_list)``:
        the mean batch loss, the dict returned by ``compute_mcrmse``, the
        predictions and labels as arrays with two columns, and the collected
        student ids.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    val_loss = []
    pred_list = []
    label_list = []
    id_list = []
    with torch.no_grad():
        for batch, labels, s_id in tqdm(val_loader):
            batch = {key: value.to(device) for key, value in batch.items()}
            labels = labels.to(device)

            output = model(**batch, labels=labels)

            val_loss.append(output.loss.item())
            pred_list += output.logits.detach().cpu().tolist()
            label_list += labels.detach().cpu().tolist()
            id_list += s_id

    _val_loss = np.mean(val_loss)
    pred_list_re = np.array(pred_list).reshape(-1, 2)
    label_list_re = np.array(label_list).reshape(-1, 2)
    scores = compute_mcrmse(pred_list_re, label_list_re)

    return _val_loss, scores, pred_list_re, label_list_re, id_list

In [22]:
def predict(model, test_loader):
    """Run `model` over an unlabelled loader and collect its outputs.

    Args:
        model: Model that accepts the batch dict and returns an object exposing
            ``.logits``.
        test_loader: Iterable yielding dicts of tensors.

    Returns:
        Tuple ``(pred_list_re, pred_list)``: the predictions as an array with
        two columns, and the same predictions as a nested Python list.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    pred_list = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = {key: value.to(device) for key, value in batch.items()}
            output = model(**batch)
            pred_list += output.logits.detach().cpu().tolist()
    pred_list_re = np.array(pred_list).reshape(-1, 2)

    return pred_list_re, pred_list

In [23]:
def data_fold(df, n_fold):
    """Build the training and validation loaders for one fold.

    Rows whose 'fold' value equals `n_fold` form the validation split, all
    other rows the training split. Only the training loader shuffles.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    held_out = df[df['fold'] == n_fold]
    remaining = df[df['fold'] != n_fold]

    def _loader(frame, shuffle):
        return torch.utils.data.DataLoader(
            CustomDataset(frame),
            batch_size=cfg.batch_size,
            num_workers=2,
            shuffle=shuffle,
            pin_memory=True,
        )

    return _loader(remaining, True), _loader(held_out, False)

In [24]:
def compute_metrics(eval_pred):
    """Root mean squared error for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Tuple ``(predictions, labels)`` of equally shaped array-likes.

    Returns:
        ``{"rmse": value}``. For 2-D inputs the RMSE is taken per column and
        the columns are averaged with equal weight.
    """
    predictions, labels = eval_pred
    # Computed with NumPy rather than sklearn's mean_squared_error, which this
    # notebook never imports.
    squared_error = (np.asarray(labels, dtype=float) - np.asarray(predictions, dtype=float)) ** 2
    rmse = float(np.mean(np.sqrt(np.mean(squared_error, axis=0))))
    return {"rmse": rmse}

def compute_mcrmse(preds, labels):
    """Column-wise RMSE of `preds` against `labels`, plus the mean over columns.

    Returns a dict in which the RMSE of column 0 is stored under
    'content_rmse', the RMSE of column 1 under 'wording_rmse' and the mean of
    all column RMSE values under 'mcrmse'.
    """
    squared_error = (preds - labels) ** 2
    per_column = np.sqrt(np.mean(squared_error, axis=0))

    result = {"content_rmse": per_column[0]}
    result["wording_rmse"] = per_column[1]
    result["mcrmse"] = np.mean(per_column)
    return result

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE.

    Args:
        content_true, content_pred: Targets and predictions for 'content'.
        wording_true, wording_pred: Targets and predictions for 'wording'.

    Returns:
        ``(content_rmse + wording_rmse) / 2`` as a float.
    """
    def _rmse(y_true, y_pred):
        # Computed with NumPy rather than sklearn's mean_squared_error, which
        # this notebook never imports.
        error = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return float(np.sqrt(np.mean(error ** 2)))

    content_score = _rmse(content_true, content_pred)
    wording_score = _rmse(wording_true, wording_pred)

    return (content_score + wording_score)/2

In [25]:
# total_pred = []
# total_id = []
# for n_fold in range(0,4):
#     print('******** fold' , n_fold , '********')
#     model_config = AutoConfig.from_pretrained(cfg.model_name)
#     model_config.update({
#             "hidden_dropout_prob": cfg.hidden_dropout_prob,
#             "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
#             "num_labels": 2,
#             "problem_type": "regression"
#         })
#     model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name,config = model_config)
#     model.load_state_dict(torch.load(f"/kaggle/input/deberta-v3-base-fold/deberta-v3-base-Fold_{n_fold}.pth", map_location=torch.device('cpu')))
#     model.cuda()
#     train_loader , valid_loader = data_fold(train, n_fold)
#     _val_loss, scores, valid_pred, valid_label, id_list = validation(model, valid_loader)
#     total_pred += valid_pred.tolist()
#     total_id += id_list
#     del model, model_config

# train_pred_df = pd.DataFrame(data = zip(total_id, np.array(total_pred)[:,0].tolist(), np.array(total_pred)[:,1].tolist()) ,columns = ['student_id', 'content_pred', 'wording_pred'])
# train = train.merge(train_pred_df, how = 'left', on = 'student_id')

In [26]:
# Inference loader over the merged test frame. shuffle=False keeps the batches
# in row order, which the positional accumulation of predictions relies on.
test_dataset = TestDataset(test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = cfg.batch_size, num_workers=0, shuffle=False, pin_memory = True)

In [27]:
# Running SUM of the per-fold predictions, one entry per row of `test`.
# The division by the number of folds happens in later cells.
total_content_pred = np.zeros(shape = (len(test)))
total_wording_pred = np.zeros(shape = (len(test)))
# One checkpoint per fold. The literal 4 equals cfg.fold, which is presumably
# the number of folds - verify before changing either value.
for n_fold in range(0,4):
    print('******** fold' , n_fold , '********')
    # Base config with the notebook's dropout values and a 2-output regression head.
    model_config = AutoConfig.from_pretrained(cfg.model_name)
    model_config.update({
            "hidden_dropout_prob": cfg.hidden_dropout_prob,
            "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression"
        })
    # Build the model from the base checkpoint, then overwrite its weights with
    # the state dict saved for this fold (loaded on CPU first, moved to GPU after).
    model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name,config = model_config)
    model.load_state_dict(torch.load(f"/kaggle/input/deberta-v3-base-fold/deberta-v3-base-Fold_{n_fold}.pth", map_location=torch.device('cpu')))
    model.cuda()
    pred_list_re,pred_list = predict(model, test_loader)
    # Column 0 is accumulated as the content prediction, column 1 as wording.
    total_content_pred += pred_list_re[:, 0]
    total_wording_pred += pred_list_re[:, 1]

    # Drop the references before the next fold's model is created.
    del model, model_config

******** fold 0 ********


Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifer.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassific

  0%|          | 0/1 [00:00<?, ?it/s]

******** fold 1 ********


Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifer.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassific

  0%|          | 0/1 [00:00<?, ?it/s]

******** fold 2 ********


Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifer.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassific

  0%|          | 0/1 [00:00<?, ?it/s]

******** fold 3 ********


Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifer.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifer.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassific

  0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Turn the summed fold predictions into a mean; 4 is the number of iterations
# of the fold loop that filled total_content_pred / total_wording_pred.
test['content_pred'] = total_content_pred/4
test['wording_pred'] = total_wording_pred/4

In [29]:
# from itertools import combinations
# all_columns = ['smog_index', 'word_counts', 'sentence_counts', 'syntax_count', 'word_counts_with_stopword', 'counts_mis_token', 'counts_duplicate', 'content_pred', 'wording_pred']
# column_list = []
# for n in range(4, len(all_columns) + 1):
#     for column in combinations(all_columns, n):
#         column_list.append(list(column))

In [30]:
# # lgbm
# # content
# best_score = 1
# best_column = []
# for column in tqdm(column_list):
#     X = train[column]
#     y = train['content']
#     test_x = test[column]
#     train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state=42, shuffle= False)
    
#     model = lightgbm.LGBMRegressor(random_state=42, max_depth=5,learning_rate=0.05)
#     model.fit(train_x,train_y, eval_set = (valid_x, valid_y), eval_metric = 'rmse', verbose=-1)

#     score = model.best_score_['valid_0']['rmse']
#     if best_score > score:
#         best_score = score
#         best_column = column
#         lgbm_content_pred = model.predict(test_x)
# print(best_score, best_column)

In [31]:
# # lgbm
# # wording
# best_score = 1
# best_column = []
# for column in tqdm(column_list):
#     X = train[column]
#     y = train['wording']
#     test_x = test[column]

#     train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state=42, shuffle= False)
    
#     model = lightgbm.LGBMRegressor(random_state=42, max_depth=5,learning_rate=0.05)
#     model.fit(train_x,train_y, eval_set = (valid_x, valid_y), eval_metric = 'rmse', verbose=-1)

#     score = model.best_score_['valid_0']['rmse']
#     if best_score > score:
#         best_score = score
#         best_column = column
#         lgbm_wording_pred = model.predict(test_x)
# print(best_score, best_column)

In [32]:
# # Catboost
# # content
# best_score = 1
# best_column = []
# for column in tqdm(column_list):
#     X = train[column]
#     y = train['content']
#     test_x = test[column]
#     train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state=42, shuffle= False)
    
#     model = catboost.CatBoostRegressor(random_state=42, max_depth=5,learning_rate=0.05,objective = 'RMSE',verbose=False)
#     model.fit(train_x,train_y,eval_set = (valid_x,valid_y))
#     score = model.best_score_['validation']['RMSE']
#     if best_score > score:
#         best_score = score
#         best_column = column
#         cat_content_pred = model.predict(test_x)
# print(best_score, best_column)

In [33]:
# # Catboost
# # wording
# best_score = 1
# best_column = []
# for column in tqdm(column_list):
#     X = train[column]
#     y = train['wording']
#     test_x = test[column]
#     train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state=42, shuffle= False)
    
#     model = catboost.CatBoostRegressor(random_state=42, max_depth=5,learning_rate=0.05,objective = 'RMSE',verbose=False)
#     model.fit(train_x,train_y,eval_set = (valid_x,valid_y))
#     score = model.best_score_['validation']['RMSE']
#     if best_score > score:
#         best_score = score
#         best_column = column
#         cat_wording_pred = model.predict(test_x)
# print(best_score, best_column)

In [34]:
# Fill the sample submission with the fold-averaged predictions (sum / 4 folds).
# NOTE(review): values are assigned by position, so this assumes the rows of
# sample_submission.csv are in the same order as the rows of `test` - confirm.
sub = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
sub['content'] = total_content_pred/4
sub['wording'] = total_wording_pred/4

In [35]:
# Write the submission to the working directory, without the index column.
sub.to_csv('submission.csv',index=False)