## 

# Load model

In [1]:
import torch
import transformers
import numpy
from transformers import AutoTokenizer, AutoModel
device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained('../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer')
model = AutoModel.from_pretrained('../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer')


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def get_embedding(sentences):
    #text = TOKENIZER(summaries, padding='max_length', truncation=True, return_tensors="pt")
    #vector = text['input_ids'][0]
    
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, max pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings

# Load data

In [3]:
import pandas as pd

prompts_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
summaries_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
combined_train_df = pd.merge(summaries_df, prompts_df, how = 'left', on = 'prompt_id')

In [4]:
combined_train_df.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'prompt_question', 'prompt_title', 'prompt_text'],
      dtype='object')

# Calculate metrics

In [5]:
groups = combined_train_df.groupby(['prompt_id'])
prompt_ids = combined_train_df['prompt_id'].unique()
prompt_texts = combined_train_df['prompt_text'].unique()
prompt_embeds = []

for i in range(len(prompt_texts)):
    prompt_embeds.append(get_embedding(prompt_texts[i]))

In [6]:
prompt_embeds_dict = dict(zip(prompt_ids, prompt_embeds))

In [7]:
prompt_embeds_dict.keys()

dict_keys(['814d6b', 'ebad26', '3b9047', '39c16e'])

### Let's calculate cosine similarity for each pair (prompt_text, text)

In [8]:
import sklearn.metrics

cosine_simillarity_scores = []

for idx in range(len(combined_train_df)):
    print(idx)
    prompt_id = combined_train_df.iloc[idx]['prompt_id']
    prompt_embed = prompt_embeds_dict[prompt_id]
    summary = combined_train_df.iloc[idx]['text']
    summary_embed = get_embedding(summary)
    
    metric = sklearn.metrics.pairwise.cosine_similarity(prompt_embed, Y=summary_embed, dense_output=True)
    cosine_simillarity_scores.append(metric)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

### Let's calculate lexicon diversity

In [9]:
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words('english'))

def calculate_lexical_diversity_metric(text):
  # remove puctuation and lower case
  preprocessed_text = text.lower().translate(str.maketrans('', '', string.punctuation))
  
  # remove stop words
  text_words_set = set(preprocessed_text.split())
  valuable_text_words_set = text_words_set.difference(stop_words)

  return len(valuable_text_words_set)

In [10]:
diversity_scores = []

for idx in range(len(combined_train_df)):
    summary = combined_train_df.iloc[idx]['text']
    metric = calculate_lexical_diversity_metric(summary)
    diversity_scores.append(metric)

### Let's build the datasets

In [11]:
combined_train_df['lexicon_diversity'] = diversity_scores
combined_train_df['cosine_similarity'] = cosine_simillarity_scores

In [12]:
tabular_data_content = pd.DataFrame(columns=['cosine_similarity', 'content'], data=combined_train_df[['cosine_similarity', 'content']])
tabular_data_wording = pd.DataFrame(columns=['lexicon_diversity', 'wording'], data=combined_train_df[['lexicon_diversity', 'wording']])

#### Let's build the model for wording

In [13]:
from sklearn.preprocessing import StandardScaler

def scale_data(data):
  scaler = StandardScaler()
  scaled_data = scaler.fit_transform(data)
  return data

In [14]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

def train_predict(dataset, target_column_name):
    # split data    
    X = dataset.drop(columns=[target_column_name])
    y = dataset[target_column_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    # scale data
    X_train_scaled = scale_data(X_train)
    X_test_scaled = scale_data(X_test)
    
     
    # init catboost
    model = CatBoostRegressor(loss_function='RMSE')
    # fit model
    model.fit(X_train_scaled, y_train)
    # Get predictions
    preds = model.predict(X_test_scaled)
    
    return y_test, preds, model

### Let's train the model

In [15]:
tabular_data_content.columns

Index(['cosine_similarity', 'content'], dtype='object')

In [16]:
y_test_content, preds_content, model_content = train_predict(tabular_data_content, 'content')
y_test_wording, preds_wording, model_wording = train_predict(tabular_data_wording, 'wording')

Learning rate set to 0.052458
0:	learn: 1.0167988	total: 56.1ms	remaining: 56.1s
1:	learn: 0.9983674	total: 57.3ms	remaining: 28.6s
2:	learn: 0.9810752	total: 58.3ms	remaining: 19.4s
3:	learn: 0.9653566	total: 59.4ms	remaining: 14.8s
4:	learn: 0.9508508	total: 60.4ms	remaining: 12s
5:	learn: 0.9378584	total: 61.4ms	remaining: 10.2s
6:	learn: 0.9252759	total: 62.4ms	remaining: 8.86s
7:	learn: 0.9145735	total: 63.8ms	remaining: 7.91s
8:	learn: 0.9044414	total: 64.8ms	remaining: 7.13s
9:	learn: 0.8946751	total: 66ms	remaining: 6.53s
10:	learn: 0.8858985	total: 67.3ms	remaining: 6.05s
11:	learn: 0.8785200	total: 68.7ms	remaining: 5.65s
12:	learn: 0.8713216	total: 69.8ms	remaining: 5.3s
13:	learn: 0.8647030	total: 71.1ms	remaining: 5.01s
14:	learn: 0.8585319	total: 72.5ms	remaining: 4.76s
15:	learn: 0.8529533	total: 73.9ms	remaining: 4.54s
16:	learn: 0.8481250	total: 75ms	remaining: 4.33s
17:	learn: 0.8435267	total: 75.9ms	remaining: 4.14s
18:	learn: 0.8394968	total: 76.9ms	remaining: 3.97s

### Let's evaluate the result

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error

def MCRMSE(y_trues_content, y_trues_wording, y_preds_content, y_preds_wording):   
    
    content_score = mean_squared_error(y_trues_content, y_preds_content, squared=False) # RMSE
    wording_score = mean_squared_error(y_trues_wording, y_preds_wording, squared=False) # RMSE
        
    mcrmse_score = np.mean([content_score, wording_score])
    return mcrmse_score, [content_score, wording_score]


def score_loss(y_trues_content, y_trues_wording, y_preds_content, y_preds_wording):
    mcrmse_score, scores = MCRMSE(y_trues_content, y_trues_wording, y_preds_content, y_preds_wording)
    return {
        'mcrmse_score' : mcrmse_score,
        'Content_score' : scores[0],
        'Wording_score' : scores[1]
    }

In [18]:
score_loss(y_test_content, y_test_wording, preds_content, preds_wording)

{'mcrmse_score': 0.844863312100602,
 'Content_score': 0.8241878400503505,
 'Wording_score': 0.8655387841508535}

### Let's do the same for the test data

In [45]:
prompts_test_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")
summaries_test_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")
combined_test_df = pd.merge(summaries_test_df, prompts_test_df, how = 'left', on = 'prompt_id')

In [46]:
def build_test_datasets(df):
    # embed prompts
    groups = df.groupby(['prompt_id'])
    prompt_ids = df['prompt_id'].unique()
    prompt_texts = df['prompt_text']
    prompt_embeds = []

    for i in range(len(prompt_texts)):
        prompt_embeds.append(get_embedding(prompt_texts[i]))
        
    prompt_embeds_dict = dict(zip(prompt_ids, prompt_embeds))
    
    # calculate cosine similarity metrics
    cosine_simillarity_scores = []

    for idx in range(len(df)):
        print(idx)
        prompt_id = df.iloc[idx]['prompt_id']
        prompt_embed = prompt_embeds_dict[prompt_id]
        summary = df.iloc[idx]['text']
        summary_embed = get_embedding(summary)

        metric = sklearn.metrics.pairwise.cosine_similarity(prompt_embed, Y=summary_embed, dense_output=True)
        cosine_simillarity_scores.append(metric)
        
    # calculate lexicon diversity metrics
    diversity_scores = []

    for idx in range(len(df)):
        summary = df.iloc[idx]['text']
        metric = calculate_lexical_diversity_metric(summary)
        diversity_scores.append(metric)
        
    df['lexicon_diversity'] = diversity_scores
    df['cosine_similarity'] = cosine_simillarity_scores
    
    tabular_testdata_content = df[['cosine_similarity']]
    tabular_testdata_wording = df[['lexicon_diversity']]
    
    return tabular_testdata_content, tabular_testdata_wording

In [47]:
tabular_testdata_content, tabular_testdata_wording = build_test_datasets(combined_test_df)

0
1
2
3


In [48]:
from catboost import CatBoostRegressor

def evaluate(dataset, model):
    # split data    
    X = dataset
    
    # scale data
    X_scaled = scale_data(X)    
    
    # make prediction
    preds = model.predict(X_scaled)
    
    return preds

In [49]:
test_preds_content = evaluate(tabular_testdata_content, model_content)
test_preds_wording = evaluate(tabular_testdata_wording, model_wording)

### Let's save submission

In [50]:
submission = pd.DataFrame(columns=['student_id', 'content', 'wording'], data=[])
submission['student_id'] = combined_test_df['student_id']
submission['content'] = test_preds_content
submission['wording'] = test_preds_wording
submission.to_csv("submission.csv", index=False)

In [51]:
submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-0.679123,0.205013
1,111111eeeeee,-0.397507,0.205013
2,222222cccccc,-0.492421,0.205013
3,333333dddddd,-0.679123,0.205013
