# Importing libraries

In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk

# Fetch the NLTK resources used below: the tokenizer models and the stopword list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)



# Data Loading and Text Preprocessing

In [None]:
# Load the training and development CSVs into DataFrames.
train_df, dev_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/dev.csv')
)

_ENGLISH_STOPWORDS = None  # filled in lazily on the first preprocess_text call


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a piece of text before vectorization.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenized with ``word_tokenize``, and
    filtered against NLTK's English stopword list.

    Args:
        text: The raw text. Missing values (``None`` / NaN, e.g. empty CSV
            cells read by pandas) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        # Previously a NaN cell crashed on ``.lower()``; map it to empty text.
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    # The original called stopwords.words('english') once per token and
    # searched the returned list linearly; a cached frozenset gives the same
    # filtering with constant-time membership tests.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Clean the text columns of both splits in place (train first, then dev).
for frame in (train_df, dev_df):
    for column in ('Claim', 'Evidence'):
        frame[column] = frame[column].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature Extraction

In [None]:
# Join each claim with its evidence into a single text field per row.
for frame in (train_df, dev_df):
    frame['Combined_Text'] = frame['Claim'] + " " + frame['Evidence']

# Fit the TF-IDF vocabulary (1- to 4-grams, capped at 5000 features) on the
# training text only, then reuse it to encode the dev text.
tfidf = TfidfVectorizer(ngram_range=(1, 4), max_features=5000)
X_train = tfidf.fit_transform(train_df['Combined_Text']).toarray()
X_dev = tfidf.transform(dev_df['Combined_Text']).toarray()

# Target labels as NumPy arrays.
y_train, y_dev = train_df['label'].values, dev_df['label'].values

# Training Model

In [None]:
# Train a Logistic Regression model
# Fit a logistic-regression classifier with default hyperparameters.
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the development split.
y_dev_pred = model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
dev_report = classification_report(y_dev, y_dev_pred)

# Report accuracy and the per-class precision/recall/F1 table.
print("Accuracy:", dev_accuracy)
print("Classification Report:\n", dev_report)

Accuracy: 0.7996962537968275
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87      4286
           1       0.71      0.47      0.56      1640

    accuracy                           0.80      5926
   macro avg       0.76      0.70      0.72      5926
weighted avg       0.79      0.80      0.79      5926



# Write Predictions

In [None]:
def write_predictions(input_csv_path, output_csv_path='predictions.csv'):
    """
    Predict labels for a CSV of claim/evidence pairs and write them to a file.

    Reads a CSV file with "Claim" and "Evidence" columns, preprocesses both
    columns with ``preprocess_text``, joins them with a single space, encodes
    the result with the already-fitted module-level ``tfidf`` vectorizer, and
    predicts with the already-trained module-level ``model``.

    The output CSV has a single ``prediction`` column and no index column,
    with one row per input row in the same order.

    Args:
        input_csv_path (str): Path to the input CSV file
        output_csv_path (str): Path to save the predictions (default: 'predictions.csv')

    Returns:
        None. The predictions are only written to ``output_csv_path``.
    """
    # Read the input CSV file
    test_df = pd.read_csv(input_csv_path)

    # Preprocess the text
    claims = test_df['Claim'].apply(preprocess_text)
    evidence = test_df['Evidence'].apply(preprocess_text)

    # Combine the text the same way the training data was combined
    combined_text = claims + " " + evidence

    # Transform using the trained TF-IDF vectorizer. The result is kept
    # sparse: the model's predict() accepts sparse matrices, so converting to
    # a dense array would only allocate n_rows x n_features floats for nothing.
    X_test = tfidf.transform(combined_text)

    # Make predictions
    predictions = model.predict(X_test)

    # Save predictions to file
    pd.DataFrame(predictions, columns=['prediction']).to_csv(output_csv_path, index=False)

# Predict for data/test.csv; output goes to the default 'predictions.csv'.
write_predictions(input_csv_path='data/test.csv')