In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import transformers
from transformers import *
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from transformers import TFRobertaForSequenceClassification, RobertaConfig, RobertaTokenizer
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordTokenizer 


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
Loading custom CUDA kernels...
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
# Download NLTK resources
nltk.download('punkt')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nl

In [3]:
df = pd.read_csv('/kaggle/input/cumulative-reviews-and-reviews/cumulative.csv')
# Drop rows whose rating is the literal '|' before casting the column to float.
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df['rating'] != '|'].copy()
df['rating'] = df['rating'].astype(float).round()
df['review'] = df['review'].str.lower()
# '&' is meant literally, so turn regex interpretation off explicitly.
df['review'] = df['review'].str.replace('&', 'and', regex=False)

In [None]:
def remove_urls(review):
    """Return `review` with every `http(s)://...` or `www. ...` run deleted."""
    return re.compile(r'https?://\S+|www\.\S+').sub('', review)

# Strip URLs from every review (the helper is passed directly; no wrapper lambda needed).
df['review'] = df['review'].apply(remove_urls)

def remove_non_alphanumeric(review):
    """Replace each character that is not an ASCII letter, a digit or '.' with a space."""
    disallowed = r'[^a-zA-Z0-9.]'
    return re.sub(disallowed, ' ', review)

# Blank out every character other than letters, digits and '.' in each review.
df['review'] = df['review'].map(remove_non_alphanumeric)

tree_tokenizer = TreebankWordTokenizer()
def get_tree_tokens(x):
    """Tokenize `x` with the shared Treebank tokenizer and join the tokens with single spaces."""
    return ' '.join(tree_tokenizer.tokenize(x))

# Re-space every review through the Treebank tokenizer helper.
df['review'] = df['review'].apply(get_tree_tokens)

print(df.head())

In [None]:
def lemmatize_and_remove_stopwords(review):
    """Tokenize `review`, drop tokens found in `stop_words`, lemmatize the rest.

    Returns the surviving lemmas joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(review):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Remove stop words and lemmatize the remaining tokens of every review.
df['review'] = df['review'].map(lemmatize_and_remove_stopwords)

print(df.head())

In [None]:
def to_tokens(input_review, tokenizer, max_length=90):
    """Encode one review into fixed-length tokenizer output.

    Args:
        input_review: the review text to encode.
        tokenizer: object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        max_length: number of tokens every encoded review is padded or cut to.

    Returns:
        Whatever `tokenizer.encode_plus` returns for the review.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    # `truncation=True` cuts reviews longer than `max_length`; without it long
    # reviews produce longer sequences than the padded ones and the per-review
    # lists can no longer be stacked into a rectangular array.
    output = tokenizer.encode_plus(
        input_review,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return output

def select_field(features, field):
    """Collect `feature[field]` for each item of `features`, in iteration order."""
    values = []
    for feature in features:
        values.append(feature[field])
    return values

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
def preprocess_data(tokenizer, train_df, test_df):
    """Tokenize the train/test reviews and one-hot encode the training ratings.

    Args:
        tokenizer: tokenizer forwarded to `to_tokens`.
        train_df: frame with a 'review' text column and a 'rating' label column.
        test_df: frame with a 'review' text column.

    Returns:
        `(train_X, train_y, test_X)`; each `X` is the list
        `[input_ids, attention_mask]` of arrays and `train_y` is the one-hot
        encoding of the training ratings.
    """
    # The reviews are encoded as they are. This function used to run them
    # through `clean_tweet` first, but no such function is defined in this
    # notebook (NameError); the text is already cleaned on `df` before the split.
    train_encoded = train_df['review'].apply(lambda x: to_tokens(x, tokenizer))
    test_encoded = test_df['review'].apply(lambda x: to_tokens(x, tokenizer))

    # Pull token ids and attention masks out of the per-review encodings.
    input_ids_train = np.array(select_field(train_encoded, 'input_ids'))
    attention_masks_train = np.array(select_field(train_encoded, 'attention_mask'))

    input_ids_test = np.array(select_field(test_encoded, 'input_ids'))
    attention_masks_test = np.array(select_field(test_encoded, 'attention_mask'))

    # Pair ids with masks in the order the model expects them.
    train_X = [input_ids_train, attention_masks_train]
    test_X = [input_ids_test, attention_masks_test]

    # Labels come from the 'rating' column, not from the review text.
    # NOTE(review): the rating value is used directly as the class index, so the
    # ratings are presumably integers 0..4 to match a 5-label head — confirm
    # against the data (a 1..5 scale would need shifting by one).
    train_y = tf.keras.utils.to_categorical(train_df['rating'].values.reshape(-1, 1))

    return train_X, train_y, test_X

In [None]:
def load_pretrained_model(task='sentiment_5_class', learning_rate=3e-5, epsilon=1e-8,
                          model_class='roberta', model_name='roberta-large',
                          lower_case=False, num_labels=5):
    """Build a compiled RoBERTa sequence classifier together with its tokenizer.

    Args:
        task: value stored as `finetuning_task` in the model config.
        learning_rate: Adam learning rate.
        epsilon: Adam epsilon.
        model_class: architecture family; only 'roberta' is supported. Accepted
            so the existing call sites that pass it keep working.
        model_name: pretrained checkpoint name passed to `from_pretrained`.
        lower_case: forwarded to the tokenizer as `lower_case`.
        num_labels: number of output classes of the classification head.

    Returns:
        `(config, model, tokenizer)`.

    Raises:
        ValueError: if `model_class` is anything other than 'roberta'.
    """
    if model_class != 'roberta':
        raise ValueError(f"Unsupported model_class {model_class!r}; only 'roberta' is available.")

    config = RobertaConfig.from_pretrained(model_name, num_labels=num_labels, finetuning_task=task)

    # Hand the config to the model: it was previously built but never used, so
    # the requested `num_labels` never reached the classification head.
    model = TFRobertaForSequenceClassification.from_pretrained(model_name, config=config)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=1.0)
    # The loss is configured with from_logits=True, i.e. the model output is
    # treated as raw logits.
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    # Only categorical accuracy is tracked. The per-class Precision/Recall list
    # that used to be built here was never passed to compile(), reused the same
    # metric names, and referenced class_id=5, which does not exist for five
    # classes indexed 0..4.
    metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    tokenizer = RobertaTokenizer.from_pretrained(model_name, lower_case=lower_case)

    return config, model, tokenizer

In [None]:
# Inverse-frequency class weights over the whole training split.
# `train_idx` is only assigned inside the K-fold loop further down, so indexing
# with it here raised NameError when the notebook is run top to bottom.
sentiment_counts = train_df.rating.value_counts()
total_samples = sentiment_counts.sum()

# NOTE(review): looks up ratings 0..4 — confirm the rating column uses that range.
class_weights = {i: total_samples / sentiment_counts[i] for i in range(5)}
# Normalise so the weights sum to 1.
weight_total = sum(class_weights.values())
class_weights = {key: val / weight_total for key, val in class_weights.items()}


In [None]:
# # load model, process data for model
# _, _, tokenizer = load_pretrained_model(model_class='roberta', model_name='roberta-base', learning_rate=2e-5, lower_case=False)
# train_X, train_y, test_X = preprocess_data(tokenizer=tokenizer, train_df=train_df, test_df=test_df)
# ​
# ​
# kf = KFold(n_splits=6)
# test_preds = []
# i = 0
# for train_idx, test_idx in kf.split(train_X[0]):
#     i+=1
#     if i not in [1, 5]: #only do 2 folds to save time
#         continue
#     train_split_X = [train_X[i][train_idx] for i in range(len(train_X))]
#     test_split_X = [train_X[i][test_idx] for i in range(len(train_X))]
# ​
#     train_split_y = train_y[train_idx]
#     test_split_y = train_y[test_idx]
#     #create class weights to account for inbalance
#     sentiment_counts = train_df.iloc[train_idx, :].review.value_counts()
#     total_samples = sentiment_counts.sum()
# ​
#     class_weights = {i: total_samples / sentiment_counts[i] for i in range(5)}
#     class_weights = {key: val / sum(class_weights.values()) for key, val in class_weights.items()}
# ​
#     K.clear_session()
#     config, model, tokenizer = load_pretrained_model(model_class='roberta', model_name='roberta-base', learning_rate=2e-5, lower_case=False)
# ​
#     # fit, test model
#     model.fit(train_split_X, train_split_y, batch_size=64, epochs=3, class_weight=class_weight, validation_data=(test_split_X, test_split_y))
# ​
#     val_preds = model.predict(test_split_X, batch_size=32, verbose=1)
#     val_preds = np.argmax(val_preds, axis=1).flatten()
#     print(metrics.accuracy_score(train_df.iloc[test_idx, :].review.values, val_preds))
# ​
#     preds1 = model.predict(test_X, batch_size=32, verbose=1)
#     test_preds.append(preds1)
    

    # Load your tokenizer and preprocess your data here

# Tokenize once up front. No active cell assigns `train_X`, `train_y` or
# `test_X` before this point, so they are built here; the loop only slices them.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lower_case=False)
train_X, train_y, test_X = preprocess_data(tokenizer=tokenizer, train_df=train_df, test_df=test_df)

kf = KFold(n_splits=6)
test_preds = []


def _as_logits(prediction):
    """Return the logits array from a `model.predict` result.

    Depending on the transformers version, `predict` may hand back a plain
    array, a tuple whose first item is the logits, or a dict-like output with a
    'logits' entry; all three are reduced to the array here.
    """
    if isinstance(prediction, dict):
        return prediction['logits']
    if isinstance(prediction, (tuple, list)):
        return prediction[0]
    return prediction


for fold_idx, (train_idx, test_idx) in enumerate(kf.split(train_X[0])):
    if fold_idx not in [1, 5]:  # only do 2 folds to save time
        continue

    train_split_X = [features[train_idx] for features in train_X]
    test_split_X = [features[test_idx] for features in train_X]

    train_split_y = train_y[train_idx]
    test_split_y = train_y[test_idx]

    # Inverse-frequency class weights, normalised to sum to 1. Counts are taken
    # from the one-hot labels the model is actually trained on (the previous
    # code counted values of the 'review' text column). Classes absent from
    # this fold get weight 0 instead of a division by zero.
    class_counts = train_split_y.sum(axis=0)
    inverse_freq = np.divide(
        class_counts.sum(),
        class_counts,
        out=np.zeros(class_counts.shape, dtype=float),
        where=class_counts > 0,
    )
    class_weights = {idx: float(w) for idx, w in enumerate(inverse_freq / inverse_freq.sum())}

    K.clear_session()
    config, model, tokenizer = load_pretrained_model(model_class='roberta', model_name='roberta-base', learning_rate=2e-5, lower_case=False)

    # Fit, test model
    model.fit(train_split_X, train_split_y, batch_size=64, epochs=3, class_weight=class_weights, validation_data=(test_split_X, test_split_y))

    val_logits = _as_logits(model.predict(test_split_X, batch_size=32, verbose=1))
    val_preds = np.argmax(val_logits, axis=1).flatten()
    # Ground truth is the class index encoded in the one-hot validation labels,
    # not the review text.
    val_true = np.argmax(test_split_y, axis=1)
    accuracy = metrics.accuracy_score(val_true, val_preds)
    print(f"Fold {fold_idx + 1} Validation Accuracy: {accuracy:.4f}")

    preds1 = _as_logits(model.predict(test_X, batch_size=32, verbose=1))
    test_preds.append(preds1)
