In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv


In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split, KFold
import nltk

In [3]:
class Config:
    vocab_size = 3000
    embed_size = int(vocab_size ** 0.5)
    batch_size = 32
    epochs = 30
    use_k_fold = True
    target_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    dataset_path = "../input/feedback-prize-english-language-learning"
config = Config()

In [4]:
train = pd.read_csv(f"{config.dataset_path}/train.csv")
test = pd.read_csv(f"{config.dataset_path}/test.csv")

In [5]:
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [6]:
train["text"] = train["full_text"].apply(lambda sentence: " ". join(nltk.word_tokenize(sentence.lower())))
test["text"] = test["full_text"].apply(lambda sentence: " ". join(nltk.word_tokenize(sentence.lower())))
vectorizor = keras.layers.TextVectorization(
    max_tokens=config.vocab_size, 
    output_mode="tf-idf", 
    ngrams=3
)
vectorizor.adapt(list(train["text"]) + list(test["text"]))

In [7]:
def get_model():
    model = keras.Sequential([
        keras.Input(shape=(), dtype="string"),
        vectorizor,
        keras.layers.Dense(32, kernel_initializer='he_uniform', activation='sigmoid'),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(len(config.target_columns))
    ])
    rmse = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    model.compile(loss="mse", optimizer="adam", metrics=[rmse])
    return model

In [8]:
model = get_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 3000)             1         
 torization)                                                     
                                                                 
 dense (Dense)               (None, 32)                96032     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 198       
                                                                 
Total params: 96,231
Trainable params: 96,230
Non-trainable params: 1
_________________________________________________________________


In [9]:
keras.backend.clear_session()
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
rmses = []
for i, (train_indices, valid_indices) in enumerate(kfold.split(train)):
    x_train = train.iloc[train_indices]["text"]
    y_train = train.iloc[train_indices][config.target_columns]
    x_val = train.iloc[valid_indices]["text"]
    y_val = train.iloc[valid_indices][config.target_columns]
    model_path = f"model_{i}.tf"
    model = get_model()
    rmse = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    checkpoint = keras.callbacks.ModelCheckpoint(model_path, monitor="val_rmse", mode="min", save_best_only=True, save_weights_only=True)
    early_stop = keras.callbacks.EarlyStopping(monitor="val_rmse", mode="min", patience=5)
    model.compile(loss="mse", optimizer="adam", metrics=['accuracy',rmse])
    history = model.fit(
        x_train, y_train, 
        batch_size=config.batch_size, 
        epochs=config.epochs,
        validation_data=(x_val, y_val),
        callbacks=[checkpoint, early_stop]
    )
    model.load_weights(model_path)
    result = model.evaluate(x_val, y_val)
    print("Loss:", result[0], "RMSE:", result[1])
    rmses.append(result[1])
    models.append(model) 
    if not config.use_k_fold:
        break
print(f"Mean RMSE:{np.mean(rmses)}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Loss: 0.327590674161911 RMSE: 0.19667944312095642
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Loss: 0.3184068500995636 RMSE: 0.30051150918006897
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Loss: 0.3245217502117157 RMSE: 0.24808184802532196
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10

In [10]:
preds = []
for model in models:
    preds.append(model.predict(test["text"]))
pred = np.mean(preds, axis=0)
submission = pd.DataFrame({
    "text_id": test["text_id"]
})
for i in range(len(config.target_columns)):
    column = config.target_columns[i]
    submission[column] = pred[:, i]
pred = np.mean(preds, axis=0)
submission.to_csv("submission.csv", index=False)



In [20]:
preds = []
for model in models:
    preds.append(model.predict(train["full_text"]))
pred = np.mean(preds, axis=0)
submission = pd.DataFrame({
    "text_id": train["text_id"]
})
for i in range(len(config.target_columns)):
    column = config.target_columns[i]
    submission[column] = pred[:, i]
submission



Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,3.299237,3.144557,3.307819,3.196914,3.223613,3.123274
1,0022683E9EA5,2.732351,2.547126,2.743189,2.500007,2.399894,2.413464
2,00299B378633,2.856598,2.775280,3.012170,2.837618,2.754196,2.802880
3,003885A45F42,3.276055,3.207409,3.325592,3.306266,3.302924,3.209436
4,0049B1DF5CCC,2.918462,2.857034,3.127614,2.958375,2.805789,2.775101
...,...,...,...,...,...,...,...
3906,FFD29828A873,3.138339,3.079617,3.276016,3.163969,3.089429,3.028326
3907,FFD9A83B0849,3.522869,3.429275,3.518063,3.520644,3.471041,3.503082
3908,FFDC4011AC9C,3.029586,2.949958,3.233208,3.104231,3.103633,2.933509
3909,FFE16D704B16,3.515909,3.379911,3.512368,3.486501,3.438157,3.464025


In [35]:
import numpy as np
from sklearn.metrics import mean_squared_error

y_true=list(train['cohesion'])
y_pred=list(submission['cohesion'])
mean_squared_error(y_true,y_pred)



0.3884705593445148