### Original test data

In [45]:
import pandas as pd
from datasets import load_dataset

# Language code of the XNLI configuration to evaluate; reused below to
# build the file name of the matching typo data set.
language = 'en'

# Load the XNLI test split for that language and turn it into a DataFrame.
dataset = load_dataset('xnli', language=language, split='test')
df_test = pd.DataFrame(dataset)
df_test

Found cached dataset xnli (/home/s6amalia/.cache/huggingface/datasets/xnli/default-language=en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


Unnamed: 0,premise,hypothesis,label
0,"Well, I wasn't even thinking about that, but I...",I havent spoken to him again.,2
1,"Well, I wasn't even thinking about that, but I...",I was so upset that I just started talking to ...,0
2,"Well, I wasn't even thinking about that, but I...",We had a great talk.,1
3,"And I thought that was a privilege, and it's s...",I was not aware that I was not the only person...,1
4,"And I thought that was a privilege, and it's s...",I was under the impression that I was the only...,0
...,...,...,...
5005,Davidson should not adopt the pronunciation of...,Davidson shouldn't talk in a way where bone an...,0
5006,Davidson should not adopt the pronunciation of...,It would be better if Davidson rhymed the word...,2
5007,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel at $25 is a fair price.",1
5008,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel for $25 is 4,000 words pe...",2


### Test data with typos

In [46]:
# Directory holding the generated typo data sets.
DATA_DIR = r"/home/s6amalia/thesis/"

# File name is assembled from the language code selected above.
typo_path = DATA_DIR + 'test_' + language + '_typos_0.05.tsv'

# Tab-separated file, hence sep='\t'.
df_typo = pd.read_csv(typo_path, sep='\t')
df_typo

Unnamed: 0,premise,hypothesis,label
0,"'Well, i wasn't event thinking about that, but...",'I havent spoke to him again.',2
1,"'Well, I wasn't even thinking about that, but ...",'I was so upset that I just started talkng to ...,0
2,"'Well, I wasn't even thinking Pabout that, but...",'We had an great talk.',1
3,"'And I thought that was a privilege, and it's ...",'i was not aware that I was not the only perso...,1
4,"'And I thought that was a privilege, and it's ...",'I was Under the impression that I is the only...,0
...,...,...,...
5005,'Davidson would not adopt the pronunciation of...,'Davidson shouldn't talk in a way where bone A...,0
5006,'Davidson should not adopt the pronunciation o...,'It would be better if Davison rhymed the word...,2
5007,"'The average novel of 200,000 words for $25 Wo...","'A 200,000 word novel at $25 is of fair price.'",1
5008,"'The average Novel of 200,000 words for $25 wo...","'A 200,000 word novel for $25 id 4,000 words p...",2


### Find original test data corresponding to the typo dataset

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Match typo hypotheses to original hypotheses by TF-IDF cosine similarity.
# The vectorizer is fitted on the typo hypotheses and then applied to the
# original test hypotheses.
vectorizer = TfidfVectorizer()
tfidf_matrix1 = vectorizer.fit_transform(df_typo['hypothesis'])
tfidf_matrix2 = vectorizer.transform(df_test['hypothesis'])

# Calculate cosine similarity (rows: typo sentences, columns: original sentences)
cosine_sim = cosine_similarity(tfidf_matrix1, tfidf_matrix2)

# Threshold for considering texts as similar; change it to find the best number of data
threshold = 0.81

# Find similar rows: every (typo row, test row) pair above the threshold.
# nonzero() walks the matrix in row-major order, i.e. the same order a nested
# `for i ... for j ...` loop would visit it, but without a Python-level scan of
# every cell.
typo_idx, test_idx = (cosine_sim > threshold).nonzero()
similar_rows = list(zip(typo_idx.tolist(), test_idx.tolist()))
columns = df_test.columns

# Pick the matched original rows in one positional selection.
# NOTE: a typo row contributes one entry per test row above the threshold, so
# the count printed below can differ from the number of distinct typo rows.
df_test_1 = df_test.iloc[test_idx]

print('Found',len(df_test_1),'of', len(df_typo))

Found 682 of 687


### Number of typos in each sentence

In [12]:
import difflib
def typo_count(s1,s2):
    """Count the word-level differences between two sentences.

    Both strings are split on whitespace and the two word sequences are
    aligned. Every word that was replaced, inserted or deleted counts as one
    change, so the result grows with the number of altered words. A
    line-by-line diff, by contrast, reports the same value for any two
    different single-line sentences.

    Args:
        s1: Reference sentence.
        s2: Sentence to compare against the reference.

    Returns:
        Number of changed words; 0 when both sentences have the same words.
    """
    words1 = s1.split()
    words2 = s2.split()

    # autojunk=False keeps frequent words from being treated as junk in long inputs.
    matcher = difflib.SequenceMatcher(None, words1, words2, autojunk=False)

    num_changes = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag != 'equal':
            # A replaced span of unequal length counts as the longer side.
            num_changes += max(i2 - i1, j2 - j1)
    return num_changes

In [16]:
# Typo count for every premise: the original sentence is compared with the
# typo version after dropping the typo string's first and last character.
typos = [
    typo_count(df_test['premise'].iloc[row], df_typo['premise'].iloc[row][1:-1])
    for row in range(len(df_test))
]

In [17]:
# Spot-check a single premise pair.
sample_idx = 4000
typo_count(
    df_test['premise'].iloc[sample_idx],
    df_typo['premise'].iloc[sample_idx][1:-1],
)

2

### Predict labels and Accuracy

In [18]:
from sklearn.metrics import accuracy_score
import torch

In [47]:
from transformers import AutoTokenizer,XLMRobertaForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
## loading the fine-tuned model
model = XLMRobertaForSequenceClassification.from_pretrained("/home/s6amalia/xlmroberta-xnli.pt")
# The prediction code sends its inputs to `device`; the model has to be on the
# same device or the forward pass fails on a CUDA machine.
model = model.to(device)

In [48]:
def predict_labels(df):
    """Predict a class index for every premise/hypothesis pair in ``df``.

    Each row is tokenized as a sentence pair and scored with the notebook-level
    ``model``; the index of the largest logit is the prediction. The accuracy
    against ``df['label']`` (percent, rounded to two decimals) is printed as a
    side effect.

    Args:
        df: DataFrame with ``premise``, ``hypothesis`` and ``label`` columns.

    Returns:
        List with one predicted class index per row, in row order.
    """
    # Inputs are moved to `device` below, so make sure the model is there too;
    # otherwise the forward pass fails with a device mismatch on CUDA machines.
    model.to(device)

    all_pred = []
    total = len(df)
    # Inference only: no gradients needed, which saves memory and time.
    with torch.no_grad():
        for i in range(total):
            premise = df['premise'].iloc[i]
            hypothesis = df['hypothesis'].iloc[i]
            encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
            # Forward everything the tokenizer produced (e.g. the attention
            # mask), not only the input ids.
            output = model(**encoded)
            # argmax over the logits equals argmax over their softmax.
            all_pred.append(int(output["logits"][0].argmax(-1)))
            print(int(i*100/total),'%', end='\r')
    # accuracy_score expects (y_true, y_pred).
    print(round(accuracy_score(df['label'], all_pred)*100, 2))
    return all_pred

In [49]:
# Baseline: predict labels for the unmodified test set; the call prints the accuracy in percent.
all_pred = predict_labels(df_test)

43.43


In [50]:
# Same evaluation on the typo data set; this overwrites `all_pred` from the baseline run.
# NOTE(review): the typo sentences appear to be wrapped in quote characters (the typo-count
# cell strips them with [1:-1]) but are passed to the model unstripped here — confirm intended.
all_pred = predict_labels(df_typo)

42.75
