In [None]:
# Install the third-party packages this notebook depends on.
%pip install pandas
%pip install nltk
%pip install scikit-learn

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from joblib import load
import re
import pandas as pd
import nltk


# Download the NLTK resources this notebook relies on.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)


# Matches any single character that is neither a word character nor whitespace.
PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
# English stopwords, held in a set so membership tests are constant time.
STOPWORDS = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize raw text into a space-joined string of content tokens.

    The text is lower-cased, every character matched by
    ``PUNCTUATION_PATTERN`` is deleted, the result is split with
    ``word_tokenize`` and tokens found in ``STOPWORDS`` are dropped.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which
            is what pandas produces for an empty CSV cell) are treated as
            empty text; any other non-string value is converted with
            ``str`` first.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when nothing survives.
    """
    # A missing cell has no ``.lower()``; NaN is the only float value that
    # does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so the words on
    # either side of a hyphen or apostrophe are fused together.
    text = PUNCTUATION_PATTERN.sub('', text)
    tokens = word_tokenize(text)
    # Stopword removal
    tokens = [word for word in tokens if word not in STOPWORDS]
    return ' '.join(tokens)

def write_predictions(input_csv_path, tfidf, model, output_csv_path='predictions.csv'):
    """Predict a label for each row of a CSV file and save the labels.

    Args:
        input_csv_path: Path of the CSV to read; it must contain the
            columns ``Claim`` and ``Evidence``.
        tfidf: Object whose ``transform`` turns the combined text into a
            matrix exposing ``toarray()``.
        model: Object whose ``predict`` accepts that dense array.
        output_csv_path: Destination CSV; it receives a single
            ``prediction`` column and no index column.

    Returns:
        None. The predictions are only written to ``output_csv_path``.
    """
    frame = pd.read_csv(input_csv_path)

    # Clean both text columns with the same preprocessing routine.
    for column in ('Claim', 'Evidence'):
        frame[column] = frame[column].apply(preprocess_text)

    # The vectorizer is given claim and evidence as one string per row.
    frame['Combined_Text'] = frame['Claim'] + " " + frame['Evidence']

    features = tfidf.transform(frame['Combined_Text']).toarray()
    labels = model.predict(features)

    output = pd.DataFrame(labels, columns=['prediction'])
    output.to_csv(output_csv_path, index=False)

# Restore the saved pipeline; item 0 is used as the TF-IDF vectorizer and
# item 1 as the classifier.
# NOTE(review): joblib's load unpickles the file, so only load artifacts
# that come from a trusted source.
loaded_pipeline = load("best_model_pipeline.joblib")
best_tfidf, best_model = loaded_pipeline[0], loaded_pipeline[1]

# Output goes to the function's default path, 'predictions.csv'.
write_predictions(
    'data/test.csv',
    tfidf=best_tfidf,
    model=best_model,
)