In [1]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words held in a set: preprocess_data checks membership once per
# token, and a set lookup avoids scanning the full word list every time.
stop_words = set(stopwords.words('english'))

# Shared WordNet lemmatizer instance reused by preprocess_data.
lemmatizer = WordNetLemmatizer()



def clean_text(text):
    '''
    Normalise a raw value into a lowercase, whitespace-separated string.

    Steps, applied in this order:
      1. lowercase
      2. remove text in square brackets
      3. remove URLs (http://, https:// or www. prefixed)
      4. remove HTML-style tags
      5. replace every punctuation character with a space
      6. replace newlines with a space
      7. remove words containing numbers
      8. collapse runs of whitespace and strip both ends

    Parameters:
        text: value to clean. Non-string values are converted with str().
            None and float NaN are treated as missing and yield ''.

    Returns:
        The cleaned string (possibly empty).
    '''
    # Missing cells reach .apply() as None or NaN; str() would otherwise turn
    # them into the bogus tokens "none" / "nan". NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Remove text in square brackets ('.' does not cross newlines here)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Replace punctuation with a space so that joined tokens (e.g. "a-b") split apart
    text = re.sub(rf'[{re.escape(string.punctuation)}]', ' ', text)

    # Remove new lines
    text = re.sub(r'\n', ' ', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess_data(text):
    """
    Clean a piece of text, drop stop words, and lemmatize what remains.

    The input is first passed through clean_text, then split on whitespace.
    Tokens found in stop_words are discarded and each surviving token is
    lemmatized with the shared lemmatizer. The result is joined back into a
    single space-separated string.
    """
    tokens = clean_text(text).split()

    # Filter stop words first, then lemmatize only the tokens that were kept.
    kept_tokens = [token for token in tokens if token not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in kept_tokens]

    return ' '.join(lemmas)

# Example usage: run the full pipeline on one sentence containing a URL and an
# email address. Only the URL is removed outright; the email address is merely
# split on its punctuation by clean_text, so its pieces remain in the output.
example_text = "This is an example text with a URL: http://example.com and an email: example@example.com."
cleaned_text = preprocess_data(example_text)
print(cleaned_text)


example text url email example example com


In [2]:
import pandas as pd
# Paths to the final test data.
# NOTE(review): machine-specific absolute directory; consider making it configurable.
final_data_dir = 'C:/Users/marko/OneDrive/바탕 화면/semeval/Task 6 - windows/final_test_data'
final_ag_path = f'{final_data_dir}/test.model-agnostic.json'
# Previously this was a copy of the model-agnostic path, so every "aw" frame
# silently duplicated the "ag" data.
# NOTE(review): file name assumed to mirror the agnostic one; confirm on disk.
final_aw_path = f'{final_data_dir}/test.model-aware.json'

# Text columns to preprocess; 'task' is excluded because it is used for splitting.
columns = ['hyp', 'src', 'tgt']


def _load_final_split(path):
    """Read one final-test JSON file, binarise `label`, and preprocess the text columns.

    Parameters:
        path: location of the JSON file to read.

    Returns:
        DataFrame with `label` mapped to 1 ("Hallucination") / 0
        ("Not Hallucination") and each column in `columns` run through
        preprocess_data. Labels outside the mapping become NaN.
    """
    df = pd.read_json(path, encoding_errors='backslashreplace')
    df["label"] = df["label"].map({"Hallucination": 1, "Not Hallucination": 0})
    for col in columns:
        df[col] = df[col].apply(preprocess_data)
    return df


final_ag_df = _load_final_split(final_ag_path)
final_aw_df = _load_final_split(final_aw_path)

# Split each frame by the value of its 'task' column.
final_ag_pg = final_ag_df[final_ag_df['task'] == 'PG']
final_ag_dm = final_ag_df[final_ag_df['task'] == 'DM']
final_ag_mt = final_ag_df[final_ag_df['task'] == 'MT']

final_aw_pg = final_aw_df[final_aw_df['task'] == 'PG']
final_aw_dm = final_aw_df[final_aw_df['task'] == 'DM']
final_aw_mt = final_aw_df[final_aw_df['task'] == 'MT']

final_ag_df

Unnamed: 0,id,src,tgt,hyp,task,labels,label,p(Hallucination)
0,1,ты удивишься если я скажу что на самом деле ме...,would surprised told name actually tom,gonna surprised say real name tom,MT,"[Not Hallucination, Not Hallucination, Not Hal...",0,0.0
1,2,еды будет полно,plenty food,food full,MT,"[Hallucination, Not Hallucination, Hallucinati...",1,0.8
2,3,думаете том будет меня ждать,think tom wait,think tom gonna wait,MT,"[Not Hallucination, Not Hallucination, Not Hal...",0,0.2
3,6,два брата довольно разные,two brother pretty different,lot friend,MT,"[Hallucination, Hallucination, Hallucination, ...",1,1.0
4,7,infradiaphragmatic intra suprasellar craniopha...,medicine diaphragm,anatomy relating diaphragm,DM,"[Hallucination, Hallucination, Hallucination, ...",1,0.8
...,...,...,...,...,...,...,...,...
1495,2992,я никогда не говорил мэри что чувствую,never told mary felt,never told mary feel,MT,"[Hallucination, Not Hallucination, Not Halluci...",0,0.4
1496,2993,beat rat tailed kyoodle run steer eric laid ha...,mutt dog mixed breed little value noisy dog,slang mustang,DM,"[Hallucination, Hallucination, Hallucination, ...",1,1.0
1497,2994,ты знаешь почему они прекратили говорить,know stopped talking,know stopped talking,MT,"[Hallucination, Not Hallucination, Not Halluci...",0,0.4
1498,2996,anyone back,anyone confirm,anyone corroborate,PG,"[Not Hallucination, Not Hallucination, Not Hal...",0,0.0


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model_name = "stabilityai/stablelm-zephyr-3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare your dataset: the first two rows of the DM task subset
data = final_ag_dm[:2]

# Create prompts and get predictions.
# Iterating a DataFrame directly yields its column labels (plain strings), so
# item['hyp'] would raise TypeError; iterate per-row dicts instead.
for item in data.to_dict('records'):
    prompt = f"hyp: {item['hyp']} src: {item['src']} tgt: {item['tgt']} Does this hallucinate? Answer with 'hallucinating' or 'not hallucinating'."
    inputs = tokenizer(prompt, return_tensors="pt")
    # max_length counts the prompt tokens too, which can leave no room for an
    # answer; max_new_tokens bounds only the generated continuation.
    outputs = model.generate(**inputs, max_new_tokens=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated: {generated_text}\n")


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


: 