In [None]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


In [None]:
def random_colours(number_of_colors):
    '''
    Simple function for random colours generation.
    Input:
        number_of_colors - integer value indicating the number of colours which are going to be generated.
    Output:
        Color in the following format: ['#E86DA4'] .
    '''
    colors = []
    for i in range(number_of_colors):
        colors.append("#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]))
    return colors

In [None]:
train = pd.read_csv(r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train2.csv")
test = pd.read_csv(r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train3.csv")
ss = pd.read_csv(r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train4.csv")

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.info()

In [None]:
train.dropna(inplace=True)

In [None]:
test.info()

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
temp = train.groupby('entailment_judgment').count()['sentence_A'].reset_index().sort_values(by='sentence_A',ascending=False)
temp.style.background_gradient(cmap='Purples')

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x='entailment_judgment',data=train)

In [None]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
results_jaccard=[]

for ind,row in train.iterrows():
    sentence1 = row.sentence_A
    sentence2 = row.sentence_B

    jaccard_score = jaccard(sentence1,sentence2)
    results_jaccard.append([sentence1,sentence2,jaccard_score])

In [None]:
jaccard = pd.DataFrame(results_jaccard,columns=["row.sentence_A","row.sentence_B","jaccard_score"])
train = train.merge(jaccard, how="outer", left_index=True, right_index=True)

In [None]:
train['Num_words_ST'] = train['row.sentence_B'].apply(lambda x:len(str(x).split())) #Number Of words in Selected Text
train['Num_word_text'] = train['row.sentence_A'].apply(lambda x:len(str(x).split())) #Number Of words in main text
train['difference_in_words'] = train['Num_word_text'] - train['Num_words_ST'] #Difference in Number of words text and Selected Text

In [None]:
train.head()

In [None]:
hist_data = [train['Num_words_ST'],train['Num_word_text']]

group_labels = ['row.sentence_B', 'row.sentence_A']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels,show_curve=False)
fig.update_layout(title_text='Distribution of Number Of words')
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [None]:
plt.figure(figsize=(12,6))
p1=sns.kdeplot(train['Num_words_ST'], shade=True, color="r").set_title('Kernel Distribution of Number Of words')
p1=sns.kdeplot(train['Num_word_text'], shade=True, color="b")

In [None]:
plt.figure(figsize=(12,6))
p1=sns.kdeplot(train[train['entailment_judgment']=='ENTAILMENT']['difference_in_words'], shade=True, color="b").set_title('Kernel Distribution of Difference in Number Of words')
p2=sns.kdeplot(train[train['entailment_judgment']=='CONTRADICTION']['difference_in_words'], shade=True, color="r")
p3=sns.kdeplot(train[train['entailment_judgment']=='NEUTRAL']['difference_in_words'], shade=True, color="g")

In [None]:
k = train[train['Num_word_text']<=20]

In [None]:
k.groupby('entailment_judgment').mean()['jaccard_score']

In [None]:
k[k['entailment_judgment']=='ENTAILMENT']

In [None]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [None]:
train['sentence_A'] = train['sentence_A'].apply(lambda x:clean_text(x))
train['sentence_B'] = train['sentence_B'].apply(lambda x:clean_text(x))

In [None]:
train.head()

In [None]:
train['temp_list'] = train['sentence_B'].apply(lambda x:str(x).split())
top = Counter([item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in sentence_B', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
def remove_stopword(x):
    return [y for y in x if y not in stopwords.words('english')]
train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

In [None]:
top = Counter([item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')

In [None]:
fig = px.treemap(temp, path=['Common_words'], values='count',title='Tree of Most Common Words')
fig.show()

In [None]:
train['temp_list1'] = train['sentence_A'].apply(lambda x:str(x).split()) #List of words in every row for text
train['temp_list1'] = train['temp_list1'].apply(lambda x:remove_stopword(x)) #Removing Stopwords

In [None]:
top = Counter([item for sublist in train['temp_list1'] for item in sublist])
temp = pd.DataFrame(top.most_common(25))
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
Positive_sent = train[train['entailment_judgment']=='ENTAILMENT']
Negative_sent = train[train['entailment_judgment']=='CONTRADICTION']
Neutral_sent = train[train['entailment_judgment']=='NEUTRAL']

In [None]:
#MosT common positive words
top = Counter([item for sublist in Positive_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(30))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')

In [None]:
fig = px.bar(temp_positive, x="count", y="Common_words", title='Most Commmon Positive Words', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
#MosT common negative words
top = Counter([item for sublist in Negative_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(20))
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')

In [None]:
fig = px.treemap(temp_negative, path=['Common_words'], values='count',title='Tree Of Most Common Contradicting Words')
fig.show()

In [None]:
#MosT common Neutral words
top = Counter([item for sublist in Neutral_sent['temp_list'] for item in sublist])
temp_neutral = pd.DataFrame(top.most_common(20))
temp_neutral = temp_neutral.loc[1:,:]
temp_neutral.columns = ['Common_words','count']
temp_neutral.style.background_gradient(cmap='Reds')

In [None]:

fig = px.bar(temp_neutral, x="count", y="Common_words", title='Most Commmon Neutral Words', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
fig = px.treemap(temp_neutral, path=['Common_words'], values='count',title='Tree Of Most Common Neutral Words')
fig.show()

In [None]:
raw_text = [word for word_list in train['temp_list1'] for word in word_list]

In [None]:
def words_unique(entailment_judgment,numwords,raw_words):
    '''
    Input:
        segment - Segment category (ex. 'Neutral');
        numwords - how many specific words do you want to see in the final result; 
        raw_words - list  for item in train_data[train_data.segments == segments]['temp_list1']:
    Output: 
        dataframe giving information about the name of the specific ingredient and how many times it occurs in the chosen cuisine (in descending order based on their counts)..

    '''
    allother = []
    for item in train[train.entailment_judgment != entailment_judgment]['temp_list1']:
        for word in item:
            allother .append(word)
    allother  = list(set(allother ))
    
    specificnonly = [x for x in raw_text if x not in allother]
    
    mycounter = Counter()
    
    for item in train[train.entailment_judgment == entailment_judgment]['temp_list1']:
        for word in item:
            mycounter[word] += 1
    keep = list(specificnonly)
    
    for word in list(mycounter):
        if word not in keep:
            del mycounter[word]
    
    Unique_words = pd.DataFrame(mycounter.most_common(numwords), columns = ['words','count'])
    
    return Unique_words

In [None]:
Unique_Positive= words_unique('ENTAILMENT', 20, raw_text)
print("The top 20 unique words in Entailment Sentences are:")
Unique_Positive.style.background_gradient(cmap='Greens')

In [None]:
fig = px.treemap(Unique_Positive, path=['words'], values='count',title='Tree Of Unique Positive Words')
fig.show()

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(Unique_Positive['count'], labels=Unique_Positive.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('DoNut Plot Of Unique Entailment Words')
plt.show()

In [None]:
Unique_Negative= words_unique('CONTRADICTION', 10, raw_text)
print("The top 10 unique words in Contradicting Words are:")
Unique_Negative.style.background_gradient(cmap='Reds')

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.rcParams['text.color'] = 'black'
plt.pie(Unique_Negative['count'], labels=Unique_Negative.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('DoNut Plot Of Unique Negative Words')
plt.show()

In [None]:
Unique_Neutral= words_unique('NEUTRAL', 10, raw_text)
print("The top 10 unique words in Neutral Tweets are:")
Unique_Neutral.style.background_gradient(cmap='Oranges')

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(Unique_Neutral['count'], labels=Unique_Neutral.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('DoNut Plot Of Unique Neutral Words')
plt.show()

In [None]:
df_train = pd.read_csv((r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train2.csv"))
df_test = pd.read_csv((r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train3.csv"))
df_submission = pd.read_csv((r"C:/Users/me1awq/PhD/docsim/datasets/open_source/SICK_train4.csv"))

In [None]:
df_train['Num_words_text'] = df_train['sentence_A'].apply(lambda x:len(str(x).split())) #Number Of words in main Text in train set

In [None]:
df_train = df_train[df_train['Num_words_text']>=3]

In [None]:
def save_model(output_dir, nlp, new_model_name):
    ''' This Function Saves model to 
    given output directory'''
    
    output_dir = r'C:/Users/me1awq/PhD/docsim/train_results/'
    if output_dir is not None:        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [None]:
from spacy.training import Example
import random
from tqdm import tqdm

def train(train_data, output_dir, n_iter=20, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.add_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        # prepare the training data in the required format for spacy
        examples = []
        for text, annots in train_data:
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
        
        # train the model
        for itn in tqdm(range(n_iter)):
            random.shuffle(examples)
            losses = {}
            batches = spacy.util.minibatch(examples, size=spacy.util.compounding(4.0, 500.0, 1.001))
            for batch in batches:
                nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
            print("Losses", losses)
    
    save_model(output_dir, nlp, 'st_ner')



In [None]:
def get_model_out_path(entailment_judgment):
    '''
    Returns Model output path
    '''
    model_out_path = None
    if entailment_judgment == 'ENTAILMENT':
        model_out_path = r'C:/Users/me1awq/PhD/docsim/train_results/pos'
    elif entailment_judgment == 'CONTRADICTION':
        model_out_path = r'C:/Users/me1awq/PhD/docsim/train_results/neg'
    return model_out_path

In [None]:
def get_training_data(entailment_judgment):
    '''
    Returns Trainong data in the format needed to train spacy NER
    '''
    train_data = []
    for index, row in df_train.iterrows():
        if row.entailment_judgment == entailment_judgment:
            sentence_B = row.sentence_B
            sentence_A = row.sentence_A
            start = sentence_A.find(sentence_B)
            end = start + len(sentence_B)
            train_data.append((sentence_A, {"entities": [[start, end, 'sentence_B']]}))
    return train_data

In [None]:
sentiment = 'ENTAILMENT'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
# For DEmo Purposes I have taken 3 iterations you can train the model as you want
train(train_data, model_path, n_iter=3, model=None)

In [None]:
sentiment = 'CONTRADICTION'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

train(train_data, model_path, n_iter=3, model=None)

In [None]:
def predict_entities(sentence_A, model):
    doc = model(sentence_A)
    ent_array = []
    for ent in doc.ents:
        start = sentence_A.find(ent.sentence_A)
        end = start + len(ent.sentence_A)
        new_int = [start, end, ent.label_]
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
    sentence_B = text[ent_array[0][0]: ent_array[0][1]] if len(ent_array) > 0 else text
    return sentence_B

In [None]:
selected_texts = []
MODELS_BASE_PATH = r'C:/Users/me1awq/PhD/docsim/train_results/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'neg')
        
    for index, row in df_test.iterrows():
        sentence_A = row.sentence_A
        output_str = ""
        if row.sentiment == 'NEUTRAL' or len(text.split()) <= 2:
            selected_texts.append(sentence_A)
        elif row.sentiment == 'ENTAILMENT':
            selected_texts.append(predict_entities(sentence_A, model_pos))
        else:
            selected_texts.append(predict_entities(sentence_A, model_neg))
        
df_test['selected_text'] = selected_texts

# Split the dataset

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
full_df = pd.read_csv(r"C:\Users\Omer\Desktop\sick_d2.csv", on_bad_lines='skip')

In [None]:
full_df

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into two parts, with 70% of the data for training and 30% for testing
train_data, test_data = train_test_split(full_df, test_size=0.3, random_state=42)

# Split the training data into two parts, with 60% of the data for training and 40% for validation
train_data, validation_data = train_test_split(train_data, test_size=0.4, random_state=42)


In [None]:
# After running this code, you should have four variables: train_data, 
#validation_data, and test_data, which contain the corresponding splits 
#of your main dataset. You can use these variables to train, validate, 
#and test your machine learning model.

In [None]:
max_features = 10000
maxlen = 300

In [None]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data)
tokenized_train = tokenizer.texts_to_sequences(train_data)
x_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [None]:
tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

# Training model

In [None]:
#Defining Neural Network
model = Sequential()
#Non-trainable embeddidng layer
model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False))
#LSTM 
model.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=keras.optimizers.Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(x_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test) , epochs = epochs , callbacks = [learning_rate_reduction])

# ANALYSIS AFTER TRAINING OF MODEL

In [None]:
print("Accuracy of the model on Training Data is - " , model.evaluate(x_train,y_train)[1]*100 , "%")
print("Accuracy of the model on Testing Data is - " , model.evaluate(X_test,y_test)[1]*100 , "%")

In [None]:
epochs = [i for i in range(10)]
fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']
fig.set_size_inches(20,10)

ax[0].plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , val_acc , 'ro-' , label = 'Testing Accuracy')
ax[0].set_title('Training & Testing Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")

ax[1].plot(epochs , train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epochs , val_loss , 'ro-' , label = 'Testing Loss')
ax[1].set_title('Training & Testing Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
plt.show()

In [None]:
pred = model.predict_classes(X_test)
pred[:5]

In [None]:
print(classification_report(y_test, pred, target_names = ['Fake','Not Fake']))

In [None]:
cm = confusion_matrix(y_test,pred)
cm

In [None]:
cm = pd.DataFrame(cm , index = ['Fake','Original'] , columns = ['Fake','Original'])

In [None]:
plt.figure(figsize = (10,10))
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Fake','Original'] , yticklabels = ['Fake','Original'])
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Chat GPT code

In [4]:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer
import torch

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the sick training data
train_texts = [text for text, label in train_data]
train_labels = [label for text, label in train_data]

# Tokenize the texts
train_input_ids = []
train_attention_masks = []
for text in train_texts:
    encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
    train_input_ids.append(encoded_dict['input_ids'])
    train_attention_masks.append(encoded_dict['attention_mask'])

train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_masks = torch.cat(train_attention_masks, dim=0)
train_labels = torch.tensor(train_labels)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
model.train()

for epoch in range(3):
    for step in range(len(train_data)):
        input_ids = train_input_ids[step].unsqueeze(0)
        attention_mask = train_attention_masks[step].unsqueeze(0)
        labels = train_labels[step].unsqueeze(0)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

# Load the sick validation data
val_texts = [text for text, label in val_data]
val_labels = [label for text, label in val_data]

# Tokenize the texts
val_input_ids = []
val_attention_masks = []
for text in val_texts:
    encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
    val_input_ids.append(encoded_dict['input_ids'])
    val_attention_masks.append(encoded_dict['attention_mask'])

val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)
val_labels = torch.tensor(val_labels)

# Evaluate the model on the validation data
model.eval()

with torch.no_grad():
    outputs = model(val_input_ids, attention_mask=val_attention_masks)
    logits = outputs[0]
    preds = torch.argmax(logits, dim=1)
    acc = torch.sum(preds == val_labels) / len(val_data)

print("Validation accuracy: ", acc.item())


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: name 'train_data' is not defined

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the training and validation data
train_texts = ["sick training text 1", "sick training text 2", ...]
train_labels = [0, 1, ...]
val_texts = ["sick validation text 1", "sick validation text 2", ...]
val_labels = [0, 1, ...]

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert the tokenized data into PyTorch tensors
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), torch.tensor(val_encodings['attention_mask']), torch.tensor(val_labels))

# Define the data loader for training and validation
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Validate the model
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        val_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1).flatten()
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())
    val_f1 = f1_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss/len(train_dataloader)}, Validation Loss: {val_loss/len(val_dataloader)}, Validation F1 Score: {val_f1}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: name 'train' is not defined

In [17]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1) # Change num_labels to 1 for regression
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the CSV file
data = pd.read_csv(r"C:\Users\Omer\Desktop\sick_d2.csv")

# Split the data into training and validation sets
train, val = train_test_split(data, test_size=0.2, random_state=42)

# Extract sentence_A and sentence_B from the training set
train_texts_A = train['sentence_A'].tolist()
train_texts_B = train['sentence_B'].tolist()
train_scores = train['relatedness_score'].tolist()

# Extract sentence_A and sentence_B from the validation set
val_texts_A = val['sentence_A'].tolist()
val_texts_B = val['sentence_B'].tolist()
val_scores = val['relatedness_score'].tolist()

# Tokenize the texts
train_encodings = tokenizer(train_texts_A, train_texts_B, truncation=True, padding=True)
val_encodings = tokenizer(val_texts_A, val_texts_B, truncation=True, padding=True)

# Convert the tokenized data into PyTorch tensors
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_scores))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), torch.tensor(val_encodings['attention_mask']), torch.tensor(val_scores))

# Define the data loader for training and validation
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.MSELoss()

# Train the model
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Validate the model
    model.eval()    val_loss = 0
    val_preds = []
    val_labels = []
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels.view(-1,1))
        val_loss += loss.item()
        preds = outputs.logits.view(-1).cpu().numpy()
        val_preds.extend(preds)
        val_labels.extend(labels.cpu().numpy())
    val_f1 = f1_score(val_labels, [1 if p>=0.5 else 0 for p in val_preds]) # You can change this to any other evaluation metric that's appropriate for regression tasks
    print("Epoch {} train loss: {:.4f} validation loss: {:.4f} validation F1: {:.4f}".format(epoch+1, train_loss/len(train_dataloader), val_loss/len(val_dataloader), val_f1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

ValueError: Classification metrics can't handle a mix of continuous and binary targets