# Importing files

In [39]:
import pandas as pd

# Read the training and development splits from the local `training/` folder.
train_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in ('training/train.csv', 'training/dev.csv')
)

# Text Preprocessing

In [40]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used below: the two tokenizer packages and the
# stopword lists.
for resource_name in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource_name)

# Cache for the stopword set so it is built once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise one raw text value into a space-joined string of tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, drop English stopwords, and join the remaining tokens
    with single spaces.

    Args:
        text: The raw string to clean. ``None`` and float NaN (the marker
            pandas uses for a missing CSV cell) are treated as empty text.

    Returns:
        The cleaned text as a single string; ``''`` when nothing remains or
        the input was missing.
    """
    # Missing cells would otherwise fail on `.lower()`; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)

    # Stopword removal. The set is fetched once per call (and loaded from NLTK
    # once overall) instead of calling `stopwords.words(...)` for every token,
    # which rebuilt the list each time and scanned it linearly.
    stop_set = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_set)

# Clean both text columns of both splits, overwriting the raw text in place.
for frame in (train_df, dev_df):
    for text_column in ('Claim', 'Evidence'):
        frame[text_column] = frame[text_column].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature Extraction

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate claim and evidence into a single text field per row.
for frame in (train_df, dev_df):
    frame['Combined_Text'] = frame['Claim'] + " " + frame['Evidence']

# TF-IDF over 1- to 4-grams, keeping at most 5000 features. The vocabulary
# and IDF weights are fitted on the training split only and then reused for
# the dev split.
# NOTE(review): `.toarray()` turns the vectorizer output into dense arrays;
# if memory becomes a problem, check whether downstream code can take the
# sparse matrices directly.
tfidf = TfidfVectorizer(ngram_range=(1, 4), max_features=5000)
X_train = tfidf.fit_transform(train_df['Combined_Text']).toarray()
X_dev = tfidf.transform(dev_df['Combined_Text']).toarray()

# Target labels as arrays.
y_train, y_dev = train_df['label'].values, dev_df['label'].values

# Training Model

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic regression with default settings on the TF-IDF features.
model = LogisticRegression().fit(X_train, y_train)

# Predict the dev split and compute the metrics before printing them.
y_dev_pred = model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
dev_report = classification_report(y_dev, y_dev_pred)

print("Accuracy:", dev_accuracy)
print("Classification Report:\n", dev_report)

Accuracy: 0.7996962537968275
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87      4286
           1       0.71      0.47      0.56      1640

    accuracy                           0.80      5926
   macro avg       0.76      0.70      0.72      5926
weighted avg       0.79      0.80      0.79      5926



# Notes
* Solution A (this notebook's TF-IDF + logistic regression baseline), results on the dev set:

  ```
  Accuracy: 0.7996962537968275
  Classification Report:
                 precision    recall  f1-score   support

             0       0.82      0.93      0.87      4286
             1       0.71      0.47      0.56      1640

      accuracy                           0.80      5926
     macro avg       0.76      0.70      0.72      5926
  weighted avg       0.79      0.80      0.79      5926
  ```
* Try word-embeddings (from pre-trained models)
* Try solution B