In [None]:
import numpy as np

import pandas as pd
from pandas import DataFrame

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import re
from sklearn.feature_extraction.text import TfidfVectorizer

import transformers
import torch
from transformers import BertTokenizer

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold

import lightgbm as lgb

from fastprogress.fastprogress import  progress_bar

In [None]:
!ls ../input/textstat-pre/dist
!ls ../input/pyphen-gz

In [None]:
!cp ../input/textstat-pre/dist/textstat-0.7.1.tar . 
!cp ../input/pyphen-gz/pyphen-0.11.0.tar .
!tar -xvf textstat-0.7.1.tar
!tar -xvf pyphen-0.11.0.tar

In [None]:
cd pyphen-0.11.0

In [None]:
!python setup.py build
!python setup.py install

In [None]:
cd ../textstat-0.7.1

In [None]:
!python setup.py build
!python setup.py install

In [None]:
# !pip install pyphen --no-index --find-links=file:///kaggle/input/pyphen/ 

In [None]:
import textstat

In [None]:
cd ..

In [None]:
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

train_df.head()

In [None]:
train_df.standard_error.hist()

In [None]:
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

test_df.head()

# Cleaning Texts Function

In [None]:
def preprocess(data):
    """Normalise every excerpt into a space-joined string of lemmatised tokens.

    Each excerpt has every non-letter character replaced by a space, is
    lower-cased, tokenised with ``nltk.word_tokenize``, stripped of English
    stopwords and finally lemmatised with ``nltk.WordNetLemmatizer``.

    Parameters
    ----------
    data : mapping or DataFrame
        Object whose ``'excerpt'`` entry is an iterable of strings.

    Returns
    -------
    list of str
        One cleaned string per excerpt, in input order.
    """
    # Both helpers are loop-invariant: build them once instead of re-creating
    # the stopword set for every token and the lemmatizer for every excerpt.
    english_stopwords = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    excerpt_processed = []
    for e in progress_bar(data['excerpt']):
        # keep ASCII letters only, then lower-case
        e = re.sub("[^a-zA-Z]", " ", e).lower()

        tokens = nltk.word_tokenize(e)

        # drop stopwords and lemmatise what is left in a single pass
        tokens = [lemma.lemmatize(word) for word in tokens if word not in english_stopwords]

        excerpt_processed.append(" ".join(tokens))

    return excerpt_processed

# Vectorize By BERT Function

In [None]:
class BertSequenceVectorizer:
    """Turn excerpts into fixed-size vectors using a pretrained BERT model.

    Each excerpt is tokenised, truncated or padded to ``max_len`` token ids
    and passed through the model; the last-layer hidden state of the first
    token is used as the excerpt's vector.
    """

    def __init__(self):
        """Load the tokenizer and model from the local checkpoint directory."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = '../input/huggingface-bert/bert-large-cased'
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Fixed number of token ids fed to the model per excerpt.
        self.max_len = 128

    def vectorize(self, data):
        """Return one embedding per excerpt.

        Parameters
        ----------
        data : mapping or DataFrame
            Object whose ``'excerpt'`` entry is an iterable of strings.

        Returns
        -------
        list of numpy.ndarray
            For every excerpt, the last hidden state at sequence position 0.
        """
        excerpt_bert = []

        # Pure inference: no autograd graph is needed, so disable gradient
        # tracking to avoid holding activations in memory for every excerpt.
        with torch.no_grad():
            for sentence in progress_bar(data['excerpt']):
                # keep ASCII letters only (case is preserved)
                sentence = re.sub("[^a-zA-Z]", " ", sentence)

                inp = self.tokenizer.encode(sentence)
                len_inp = len(inp)

                if len_inp >= self.max_len:
                    # Truncate, but keep the final id produced by ``encode``
                    # (presumably the closing [SEP] token for BERT tokenizers)
                    # so the model still sees a terminated sequence.
                    inputs = inp[:self.max_len - 1] + inp[-1:]
                    masks = [1] * self.max_len
                else:
                    # Right-pad with id 0; padded positions are masked out.
                    n_pad = self.max_len - len_inp
                    inputs = inp + [0] * n_pad
                    masks = [1] * len_inp + [0] * n_pad

                inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
                masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

                bert_out = self.bert_model(inputs_tensor, masks_tensor)
                seq_out = bert_out['last_hidden_state']

                # ``.cpu()`` is a no-op for tensors already on the CPU, so one
                # code path serves both devices.
                excerpt_bert.append(seq_out[0][0].detach().cpu().numpy())

        return excerpt_bert

In [None]:
def count_words_in_sentences(data):
    """Count whitespace-separated words in every preprocessed excerpt.

    Parameters
    ----------
    data : mapping or DataFrame
        Object whose ``'excerpt_preprocessed'`` entry is an iterable of strings.

    Returns
    -------
    list of int
        Number of tokens obtained by ``str.split()`` for each excerpt, in order.
    """
    return [len(text.split()) for text in progress_bar(data['excerpt_preprocessed'])]

In [None]:
# Add the cleaned text as a new column on both splits (train first, then test).
for split_df in (train_df, test_df):
    split_df['excerpt_preprocessed'] = preprocess(split_df)

In [None]:
train_df.head()

# **Добавляем фичи на основе метрик и статистик**

In [None]:
def text_2_statistics(data):
    """Compute a fixed set of ``textstat`` metrics for every raw excerpt.

    Parameters
    ----------
    data : mapping or DataFrame
        Object whose ``'excerpt'`` entry is an iterable of strings.

    Returns
    -------
    dict
        Maps each metric name to a list holding that metric's value for every
        excerpt, in input order. Keys appear in the order listed below.
    """
    # Names double as the textstat function names and as the output keys.
    metric_names = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
        'fernandez_huerta',
        'szigriszt_pazos',
        'gutierrez_polini',
        'crawford',
    )
    per_metric_values = {metric: [] for metric in metric_names}

    for passage in progress_bar(data['excerpt']):
        for metric in metric_names:
            scorer = getattr(textstat, metric)
            if metric == 'text_standard':
                # request the numeric grade rather than the default output
                score = scorer(passage, float_output=True)
            else:
                score = scorer(passage)
            per_metric_values[metric].append(score)

    return per_metric_values

In [None]:
# Compute the readability metrics for the training split and attach each one
# as a column of the same name.
statistics_dict = text_2_statistics(train_df)
for feature_name in statistics_dict:
    train_df[feature_name] = statistics_dict[feature_name]

# Same for the test split.
test_statistics_dict = text_2_statistics(test_df)
for feature_name in test_statistics_dict:
    test_df[feature_name] = test_statistics_dict[feature_name]
    

In [None]:
train_df.head()

# **Добавляем фичи из TF-IDF**

In [None]:
# TF-IDF features over the preprocessed excerpts, limited to 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the training excerpts only.
train_bags = vectorizer.fit_transform(train_df['excerpt_preprocessed'].values).toarray()

# Use transform (not fit_transform) on the test excerpts: re-fitting would build
# a different vocabulary and different IDF weights, so the test columns would
# no longer mean the same thing as the columns the model is trained on.
test_bags = vectorizer.transform(test_df['excerpt_preprocessed'].values).toarray()

# Newer scikit-learn releases expose get_feature_names_out(); fall back to the
# older get_feature_names() when it is not available.
_feature_names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
tfidf_columns = list(_feature_names_getter())

train_bag_of_words_df = pd.DataFrame(train_bags, columns=tfidf_columns)
test_bag_of_words_df = pd.DataFrame(test_bags, columns=tfidf_columns)

In [None]:
# Store the token count of the cleaned text on both splits (train first, then test).
for frame in (train_df, test_df):
    frame['excerpt_word_counts_by_preprocessed'] = count_words_in_sentences(frame)

In [None]:
train_df.head()

In [None]:
# import textstat

# test_data = (
#     "Playing games has always been thought to be important to "
#     "the development of well-balanced and creative children; "
#     "however, what part, if any, they should play in the lives "
#     "of adults has never been researched that deeply. I believe "
#     "that playing games is every bit as important for adults "
#     "as for children. Not only is taking time out to play games "
#     "with our children and other adults valuable to building "
#     "interpersonal relationships but is also a wonderful way "
#     "to release built up tension."
# )

# textstat.flesch_reading_ease(test_data)
# textstat.smog_index(test_data)
# textstat.flesch_kincaid_grade(test_data)
# textstat.coleman_liau_index(test_data)
# textstat.automated_readability_index(test_data)
# textstat.dale_chall_readability_score(test_data)
# textstat.difficult_words(test_data)
# textstat.linsear_write_formula(test_data)
# textstat.gunning_fog(test_data)
# textstat.text_standard(test_data)
# textstat.fernandez_huerta(test_data)
# textstat.szigriszt_pazos(test_data)
# textstat.gutierrez_polini(test_data)
# textstat.crawford(test_data)

# **Добавляем фичи из BERT'а**

In [None]:
# One vectorizer instance (tokenizer + model) is shared by both splits.
BSV = BertSequenceVectorizer()

# Each row gets its embedding vector stored in the 'excerpt_bert' column.
for target_df in (train_df, test_df):
    target_df['excerpt_bert'] = BSV.vectorize(target_df)

In [None]:
statistics_dict.keys()  # Names of the generated features

In [None]:
# train_df

In [None]:
# Scalar features placed between the BERT embedding columns and the TF-IDF
# columns, in this order.
STAT_FEATURE_COLUMNS = [
    'excerpt_word_counts_by_preprocessed',
    'flesch_reading_ease',
    'smog_index',
    'flesch_kincaid_grade',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'difficult_words',
    'linsear_write_formula',
    'gunning_fog',
    'text_standard',
    'fernandez_huerta',
    'szigriszt_pazos',
    'gutierrez_polini',
    'crawford',
]


def _build_feature_matrix(df, bag_of_words_df):
    """Concatenate BERT embedding, scalar and TF-IDF features column-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding an ``'excerpt_bert'`` column (one vector per row) and
        every column named in ``STAT_FEATURE_COLUMNS``.
    bag_of_words_df : pandas.DataFrame
        TF-IDF features for the same rows.

    Returns
    -------
    pandas.DataFrame
        Embedding columns (labelled 0..k-1), then the scalar features, then
        the TF-IDF columns.
    """
    # One row per excerpt, one column per embedding dimension.
    bert_df = pd.DataFrame(df['excerpt_bert'].tolist())
    return pd.concat([bert_df, df[STAT_FEATURE_COLUMNS], bag_of_words_df], axis=1)


X_train = _build_feature_matrix(train_df, train_bag_of_words_df)
X_test = _build_feature_matrix(test_df, test_bag_of_words_df)

train_columns = X_train.columns.tolist()
test_columns = X_test.columns.tolist()

# The test matrix must have exactly the training columns in exactly the
# training order, otherwise features are misaligned at prediction time.
# Columns that exist only in the test matrix are dropped (the model never saw
# them); columns missing from it can only be TF-IDF terms and are filled with
# 0.0, i.e. "term absent".
X_test = X_test.reindex(columns=train_columns, fill_value=0.0)

In [None]:
X_train.shape

In [None]:
y_train = train_df[['target']]

In [None]:
# 5-fold shuffled split; the fixed random_state keeps the fold assignment
# identical from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=71)

# Materialise the (train indices, validation indices) pairs into a list so the
# folds can be iterated over later.
cv = list(kf.split(X_train, y_train))

# Light GBM

In [None]:
# LightGBM settings shared by every fold.
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'seed': 42,
    'learning_rate': 0.01,
    "n_jobs": -1,
    "verbose": -1
}

# Test predictions averaged over the folds, and the validation RMSE of each fold.
pred = np.zeros(X_test.shape[0])
rmses = []

for tr_idx, val_idx in progress_bar(cv):
    x_tr, x_va = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    train_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=train_set)

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # appear to be unavailable in recent LightGBM releases (callbacks are used
    # instead) - confirm against the installed version before upgrading.
    model = lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=100,
                      valid_sets=[train_set, val_set], verbose_eval=-1)

    # Out-of-fold score for this split.
    y_pred = model.predict(x_va)
    rmse = np.sqrt(mse(y_va, y_pred))
    rmses.append(rmse)

    # Each fold contributes an equal share of the final prediction. Divide by
    # the actual number of folds rather than a hard-coded 5 so the average
    # stays correct if the number of splits is changed.
    tmp_pred = model.predict(X_test)
    pred += tmp_pred / len(cv)

print("\n", "Mean Fold RMSE:", np.mean(rmses))

In [None]:
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [1026]	training's rmse: 0.103562	valid_1's rmse: 0.655438
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [991]	training's rmse: 0.109042	valid_1's rmse: 0.693471
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [1457]	training's rmse: 0.0512436	valid_1's rmse: 0.619726
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [2751]	training's rmse: 0.00703979	valid_1's rmse: 0.639461
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [2209]	training's rmse: 0.0156252	valid_1's rmse: 0.689038

#  Mean Fold RMSE: 0.6594268368632737

# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [2587]	training's rmse: 0.01338	valid_1's rmse: 0.645887
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [1863]	training's rmse: 0.0276367	valid_1's rmse: 0.653036
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [750]	training's rmse: 0.157828	valid_1's rmse: 0.619037
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [1610]	training's rmse: 0.0376105	valid_1's rmse: 0.626817
# Training until validation scores don't improve for 100 rounds
# Early stopping, best iteration is:
# [1131]	training's rmse: 0.0805108	valid_1's rmse: 0.645462

#  Mean Fold RMSE: 0.6380477330328842

In [None]:
# Submission file: one row per test excerpt with its averaged prediction.
predictions = pd.DataFrame({'id': test_df['id'], 'target': pred})
predictions.to_csv("submission.csv", index=False)

predictions