In [1]:
import random, os, warnings, math
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from transformers import TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer


def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

seed = 0
seed_everything(seed)
sns.set(style='whitegrid')
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 150)

In [2]:
# TPU or GPU detection
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

REPLICAS: 1


In [3]:
data = pd.read_excel("CLEAR_corpus_final.xlsx")
data.drop(['URL', 'License'], axis=1, inplace=True)
data

Unnamed: 0,ID,Author,Title,Anthology,Pub Year,Categ,Sub Cat,Lexile Band,Location,MPAA Max,...,BT_easiness,s.e.,Flesch-Reading-Ease,Flesch-Kincaid-Grade-Level,Automated Readability Index,SMOG Readability,New Dale-Chall Readability Formula,CAREC,CAREC_M,CML2RI
0,400,Carolyn Wells,Patty's Suitors,,1914.0,Lit,,900,mid,G,...,-0.340259,0.464009,81.70,5.95,7.37,8.0,6.55,0.12102,0.11952,12.097815
1,401,Carolyn Wells,Two Little Women on a Holiday,,1917.0,Lit,,700,mid,PG,...,-0.315372,0.480805,80.26,4.86,4.16,7.0,6.25,0.04921,0.04921,22.550179
2,402,Carolyn Wells,Patty Blossom,,1917.0,Lit,,900,mid,PG,...,-0.580118,0.476676,79.04,6.03,5.81,9.0,7.31,0.10172,0.09724,18.125279
3,403,CHARLES KINGSLEY,THE WATER-BABIES\nA Fairy Tale for a Land-Baby,,1863.0,Lit,,1300,mid,PG-13,...,-1.785965,0.526599,44.77,20.51,24.87,12.0,8.56,0.07491,0.08856,10.959460
4,404,Charles Kingsley,HOW THE ARGONAUTS WERE DRIVEN INTO THE UNKNOWN SEA,The Heroes\n or Greek Fairy Tales for my Children,1889.0,Lit,,1300,mid,PG,...,-1.054013,0.450007,68.07,12.06,15.47,8.0,7.00,0.06356,0.08798,3.195960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,8027,wikijunior,Bugs/Monarch butterfly,"""Wikijunior\n",2019.0,Info,Science,410L-600L,start,G,...,0.423388,0.511439,87.37,3.59,4.37,7.0,6.71,0.13576,0.12908,19.271483
4720,8028,wikijunior,Bugs/Walking Stick,"""Wikijunior\n",2020.0,Info,Science,610L-800L,start,G,...,-0.614142,0.475506,85.42,4.02,4.32,7.0,7.62,0.08258,0.05378,15.814468
4721,8029,wikijunior,Bugs/Black Widow,Wikijunior\n,2020.0,Info,Science,610L-800L,start,G,...,0.310336,0.508939,81.36,4.60,5.32,8.0,6.92,0.10992,0.08300,22.731214
4722,8030,wikijunior,Solids,Wikijunior\n,2014.0,Info,Science,610L-800L,start,G,...,-0.215279,0.514128,75.83,5.89,5.84,9.0,7.74,0.18951,0.19583,16.386932


In [4]:
train_size = 3750
train = data.sample(n=train_size, random_state=42)  
test = data.drop(train.index)

train.shape, test.shape

# veriler bu şekilde ayrıldı.

((3750, 26), (974, 26))

In [5]:
train

Unnamed: 0,ID,Author,Title,Anthology,Pub Year,Categ,Sub Cat,Lexile Band,Location,MPAA Max,...,BT_easiness,s.e.,Flesch-Reading-Ease,Flesch-Kincaid-Grade-Level,Automated Readability Index,SMOG Readability,New Dale-Chall Readability Formula,CAREC,CAREC_M,CML2RI
227,1204,G. P. Putnam's Sons?,The Two Melons,Tales of Wonder Every Child Should Know,2006.0,Lit,,1100,start,G,...,-0.052742,0.494226,76.81,8.29,8.55,8.0,6.66,0.05850,0.07412,14.482766
3049,5356,?,"ARTISTS' HOMES--No. 14--""BENT'S BROOK.""",SCIENTIFIC AMERICAN SUPPLEMENT NO. 299,1881.0,Info,,1300,start,G,...,-2.978524,0.511225,62.22,9.67,10.55,11.0,8.20,0.29478,0.25835,7.489067
4638,7441,"Daniel J. Mayor, Kathryn B. Cook, Thomas R. Anderson, Anna Belcher, Holly Jenkins, Pennie Lindeque, Geraint A. & Tarling, David Pond","Marine Copepods, The Wildebeest of the Ocean",,2020.0,Info,,1300,mid,G,...,-2.459246,0.502968,46.35,11.74,12.11,14.0,9.94,0.23996,0.20763,7.853715
2498,4670,Robert W. Chambers,The Messenger,Famous Modern Ghost Stories by Dorothy Scarborough et al.,1897.0,Lit,,1100,mid,G,...,-0.909047,0.499854,72.72,7.51,9.33,9.0,7.73,0.22526,0.21978,0.913673
1117,2701,simple wiki,Solvent,,2019.0,Info,Science,1100,start,G,...,-1.758207,0.497145,47.58,10.48,9.65,12.0,10.41,0.35205,0.33440,10.862071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3780,6305,Max Farrand,The Fathers of the Constitution\nVolume 13 in The Chronicles Of America Series,,1921.0,Info,,1300,mid,G,...,-1.203636,0.491719,41.96,15.01,17.26,14.0,9.08,0.28407,0.29377,5.162577
4358,7081,Johann Rudolph Wyss,THE SWISS FAMILY ROBINSON,Journeys Through Bookland V3.,1922.0,Lit,,900,mid,G,...,-0.732739,0.484800,84.73,5.33,5.97,8.0,6.54,0.16266,0.21166,15.065742
711,2260,simple wiki,Network_card,,2020.0,Info,Technology,1100,start,G,...,-2.126193,0.469691,53.18,9.15,8.49,12.0,11.30,0.37086,0.35922,18.286348
3205,5537,Emily Carter,GENTLE JESSIE AND THE WASP,"The Nursery, January 1877, Volume XXI, No. 1\n A Monthly Magazine for Youngest Readers",1877.0,Lit,,500,mid,G,...,1.316493,0.611269,92.93,3.92,2.35,6.0,5.77,0.01318,0.04296,29.652552


In [6]:
test

Unnamed: 0,ID,Author,Title,Anthology,Pub Year,Categ,Sub Cat,Lexile Band,Location,MPAA Max,...,BT_easiness,s.e.,Flesch-Reading-Ease,Flesch-Kincaid-Grade-Level,Automated Readability Index,SMOG Readability,New Dale-Chall Readability Formula,CAREC,CAREC_M,CML2RI
3,403,CHARLES KINGSLEY,THE WATER-BABIES\nA Fairy Tale for a Land-Baby,,1863.0,Lit,,1300,mid,PG-13,...,-1.785965,0.526599,44.77,20.51,24.87,12.0,8.56,0.07491,0.08856,10.959460
4,404,Charles Kingsley,HOW THE ARGONAUTS WERE DRIVEN INTO THE UNKNOWN SEA,The Heroes\n or Greek Fairy Tales for my Children,1889.0,Lit,,1300,mid,PG,...,-1.054013,0.450007,68.07,12.06,15.47,8.0,7.00,0.06356,0.08798,3.195960
5,405,Charles Madison Curry\n Erle Elsworth Clippinger,The Three Little Bears,Children's Literature\n A Textbook of Sources for Teachers and Teacher-Training Classes,1920.0,Lit,,300,mid,G,...,0.247197,0.510845,80.94,9.47,10.76,5.0,1.71,0.35370,0.36885,28.990105
9,409,Claude A. Labelle,The Ranger Boys and the Border Smugglers,,1922.0,Lit,,900,mid,PG,...,-0.371641,0.463710,79.22,6.26,7.33,9.0,6.96,0.07015,0.07326,16.497078
10,411,Cornelius Mathews,THE CELESTIAL SISTERS,The Indian Fairy Book\n From the Original Legends,1869.0,Lit,,1100,mid,PG,...,-1.238432,0.465900,71.88,9.20,10.58,9.0,6.88,0.07756,0.07593,15.388692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4707,8015,wikijunior,What are the parts of the eyes?,Wikijunior\n,2018.0,Info,Science,610L-800L,mid,G,...,0.120458,0.468218,79.96,5.38,4.44,7.0,6.29,0.26204,0.27899,21.788994
4709,8017,Jessica Fries-Gaither,Colors in the Night Sky: The Aurora,Beyond Penguins and Polar Bears\n,2008.0,Info,Science,410L-600L,start,G,...,1.049145,0.534813,86.44,3.30,3.23,7.0,5.48,0.15339,0.14626,28.602876
4710,8018,original text by Stephen Whitt\nadapted by Jessica Fries-Gaither,Life on the Ice (Cube),Beyond Penguins and Polar Bears\n,2008.0,Info,Science,410L - 600L,start,G,...,-0.204705,0.468447,95.08,1.71,1.58,5.0,6.02,0.11266,0.08106,27.878316
4716,8024,wikijunior,Introduction to The Elements,Wikijunior\n,2013.0,Info,Science,610L-800,start,G,...,0.650829,0.544809,75.71,5.80,5.20,9.0,6.78,0.22018,0.21327,26.844970


In [8]:
from transformers import AutoTokenizer, TFAutoModel

# Token kullanarak kimlik doğrulama
token = "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"

# Model ve tokenizer'ı yükleme
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
base_model = TFAutoModel.from_pretrained(model_name, use_auth_token=token)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [9]:
from transformers import AutoTokenizer, TFAutoModel
import timm

# Model ve tokenizer'ı yükleme
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = TFAutoModel.from_pretrained(model_name)

# Modeli kullanma örneği
text = "Merhaba, bu bir örnek metindir."
tokens = tokenizer(text, return_tensors="tf")
outputs = base_model(**tokens)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [10]:
BATCH_SIZE = 8 * REPLICAS
LEARNING_RATE = 1e-5 * REPLICAS
EPOCHS = 35
ES_PATIENCE = 7
PATIENCE = 2
N_FOLDS = 5
SEQ_LEN = 256 #300

In [11]:
# Datasets utility functions
def custom_standardization(text):
    text = text.lower() # if encoder is uncased
    text = text.strip()
    return text


def sample_target(features, target):
    mean, stddev = target
    sampled_target = tf.random.normal([], mean=tf.cast(mean, dtype=tf.float32), 
                                      stddev=tf.cast(stddev, dtype=tf.float32), dtype=tf.float32)
    
    return (features, sampled_target)
    

def get_dataset(pandas_df, tokenizer, labeled=True, ordered=False, repeated=False, 
                is_sampled=False, batch_size=32, seq_len=128):
    """
        Return a Tensorflow dataset ready for training or inference.
    """
    text = [custom_standardization(text) for text in pandas_df['excerpt']]
    
    # Tokenize inputs
    tokenized_inputs = tokenizer(text, max_length=seq_len, truncation=True, 
                                 padding='max_length', return_tensors='tf')
    
    if labeled:
        dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': tokenized_inputs['input_ids'], 
                                                      'attention_mask': tokenized_inputs['attention_mask']}, 
                                                      (pandas_df['target'], pandas_df['standard_error'])))
        if is_sampled:
            dataset = dataset.map(sample_target, num_parallel_calls=tf.data.AUTOTUNE)
    else:
        dataset = tf.data.Dataset.from_tensor_slices({'input_ids': tokenized_inputs['input_ids'], 
                                                      'attention_mask': tokenized_inputs['attention_mask']})
        
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    
    return dataset


def plot_metrics(history):
    metric_list = list(history.keys())
    size = len(metric_list)//2
    fig, axes = plt.subplots(size, 1, sharex='col', figsize=(20, size * 5))
    axes = axes.flatten()
    
    for index in range(len(metric_list)//2):
        metric_name = metric_list[index]
        val_metric_name = metric_list[index+size]
        axes[index].plot(history[metric_name], label='Train %s' % metric_name)
        axes[index].plot(history[val_metric_name], label='Validation %s' % metric_name)
        axes[index].legend(loc='best', fontsize=16)
        axes[index].set_title(metric_name)

    plt.xlabel('Epochs', fontsize=16)
    sns.despine()
    plt.show()

In [13]:
display(train.sort_values(by=['target']).head())

KeyError: 'target'

In [14]:
train.columns.tolist()

['ID',
 'Author',
 'Title',
 'Anthology',
 'Pub Year',
 'Categ',
 'Sub Cat',
 'Lexile Band',
 'Location',
 'MPAA Max',
 'MPAA #Max',
 'MPAA# Avg',
 'Excerpt',
 'Google WC',
 'Sentence Count',
 'Paragraphs',
 'BT_easiness',
 's.e.',
 'Flesch-Reading-Ease',
 'Flesch-Kincaid-Grade-Level',
 'Automated Readability Index',
 'SMOG Readability',
 'New Dale-Chall Readability Formula',
 'CAREC',
 'CAREC_M',
 'CML2RI']

In [15]:
train

Unnamed: 0,ID,Author,Title,Anthology,Pub Year,Categ,Sub Cat,Lexile Band,Location,MPAA Max,...,BT_easiness,s.e.,Flesch-Reading-Ease,Flesch-Kincaid-Grade-Level,Automated Readability Index,SMOG Readability,New Dale-Chall Readability Formula,CAREC,CAREC_M,CML2RI
227,1204,G. P. Putnam's Sons?,The Two Melons,Tales of Wonder Every Child Should Know,2006.0,Lit,,1100,start,G,...,-0.052742,0.494226,76.81,8.29,8.55,8.0,6.66,0.05850,0.07412,14.482766
3049,5356,?,"ARTISTS' HOMES--No. 14--""BENT'S BROOK.""",SCIENTIFIC AMERICAN SUPPLEMENT NO. 299,1881.0,Info,,1300,start,G,...,-2.978524,0.511225,62.22,9.67,10.55,11.0,8.20,0.29478,0.25835,7.489067
4638,7441,"Daniel J. Mayor, Kathryn B. Cook, Thomas R. Anderson, Anna Belcher, Holly Jenkins, Pennie Lindeque, Geraint A. & Tarling, David Pond","Marine Copepods, The Wildebeest of the Ocean",,2020.0,Info,,1300,mid,G,...,-2.459246,0.502968,46.35,11.74,12.11,14.0,9.94,0.23996,0.20763,7.853715
2498,4670,Robert W. Chambers,The Messenger,Famous Modern Ghost Stories by Dorothy Scarborough et al.,1897.0,Lit,,1100,mid,G,...,-0.909047,0.499854,72.72,7.51,9.33,9.0,7.73,0.22526,0.21978,0.913673
1117,2701,simple wiki,Solvent,,2019.0,Info,Science,1100,start,G,...,-1.758207,0.497145,47.58,10.48,9.65,12.0,10.41,0.35205,0.33440,10.862071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3780,6305,Max Farrand,The Fathers of the Constitution\nVolume 13 in The Chronicles Of America Series,,1921.0,Info,,1300,mid,G,...,-1.203636,0.491719,41.96,15.01,17.26,14.0,9.08,0.28407,0.29377,5.162577
4358,7081,Johann Rudolph Wyss,THE SWISS FAMILY ROBINSON,Journeys Through Bookland V3.,1922.0,Lit,,900,mid,G,...,-0.732739,0.484800,84.73,5.33,5.97,8.0,6.54,0.16266,0.21166,15.065742
711,2260,simple wiki,Network_card,,2020.0,Info,Technology,1100,start,G,...,-2.126193,0.469691,53.18,9.15,8.49,12.0,11.30,0.37086,0.35922,18.286348
3205,5537,Emily Carter,GENTLE JESSIE AND THE WASP,"The Nursery, January 1877, Volume XXI, No. 1\n A Monthly Magazine for Youngest Readers",1877.0,Lit,,500,mid,G,...,1.316493,0.611269,92.93,3.92,2.35,6.0,5.77,0.01318,0.04296,29.652552


In [16]:
data

Unnamed: 0,ID,Author,Title,Anthology,Pub Year,Categ,Sub Cat,Lexile Band,Location,MPAA Max,...,BT_easiness,s.e.,Flesch-Reading-Ease,Flesch-Kincaid-Grade-Level,Automated Readability Index,SMOG Readability,New Dale-Chall Readability Formula,CAREC,CAREC_M,CML2RI
0,400,Carolyn Wells,Patty's Suitors,,1914.0,Lit,,900,mid,G,...,-0.340259,0.464009,81.70,5.95,7.37,8.0,6.55,0.12102,0.11952,12.097815
1,401,Carolyn Wells,Two Little Women on a Holiday,,1917.0,Lit,,700,mid,PG,...,-0.315372,0.480805,80.26,4.86,4.16,7.0,6.25,0.04921,0.04921,22.550179
2,402,Carolyn Wells,Patty Blossom,,1917.0,Lit,,900,mid,PG,...,-0.580118,0.476676,79.04,6.03,5.81,9.0,7.31,0.10172,0.09724,18.125279
3,403,CHARLES KINGSLEY,THE WATER-BABIES\nA Fairy Tale for a Land-Baby,,1863.0,Lit,,1300,mid,PG-13,...,-1.785965,0.526599,44.77,20.51,24.87,12.0,8.56,0.07491,0.08856,10.959460
4,404,Charles Kingsley,HOW THE ARGONAUTS WERE DRIVEN INTO THE UNKNOWN SEA,The Heroes\n or Greek Fairy Tales for my Children,1889.0,Lit,,1300,mid,PG,...,-1.054013,0.450007,68.07,12.06,15.47,8.0,7.00,0.06356,0.08798,3.195960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,8027,wikijunior,Bugs/Monarch butterfly,"""Wikijunior\n",2019.0,Info,Science,410L-600L,start,G,...,0.423388,0.511439,87.37,3.59,4.37,7.0,6.71,0.13576,0.12908,19.271483
4720,8028,wikijunior,Bugs/Walking Stick,"""Wikijunior\n",2020.0,Info,Science,610L-800L,start,G,...,-0.614142,0.475506,85.42,4.02,4.32,7.0,7.62,0.08258,0.05378,15.814468
4721,8029,wikijunior,Bugs/Black Widow,Wikijunior\n,2020.0,Info,Science,610L-800L,start,G,...,0.310336,0.508939,81.36,4.60,5.32,8.0,6.92,0.10992,0.08300,22.731214
4722,8030,wikijunior,Solids,Wikijunior\n,2014.0,Info,Science,610L-800L,start,G,...,-0.215279,0.514128,75.83,5.89,5.84,9.0,7.74,0.18951,0.19583,16.386932
