# Importing files

In [17]:
import pandas as pd

# Read the training and development splits from the local 'training' folder.
# The generator keeps its loop variable out of the notebook namespace.
train_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in ('training/train.csv', 'training/dev.csv')
)

# Text Preprocessing

In [18]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used below: the tokenizer models and the stopword
# lists. Order matches the captured log output.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Cache for the stopword set so the corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one piece of text for vectorization.

    Steps: lowercase, remove every character that is neither a word character
    nor whitespace, tokenize with ``word_tokenize``, drop English stopwords,
    and re-join the surviving tokens with single spaces.

    Args:
        text: The raw string. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as missing text.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when the
        input is missing or no tokens survive.
    """
    # Missing values would otherwise fail on `.lower()`; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    # `\w` includes digits and underscores, so those are kept. Apostrophes are
    # removed here, i.e. *before* stopword filtering ("don't" becomes "dont").
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)

    # Set membership is O(1); the original rebuilt and scanned the full
    # stopword list for every single token.
    stop_set = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_set)

# Clean both text columns of both splits in place (train first, then dev;
# 'Claim' before 'Evidence'), overwriting the raw text with its normalized form.
for split_df in (train_df, dev_df):
    for text_column in ('Claim', 'Evidence'):
        split_df[text_column] = split_df[text_column].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature Extraction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate each claim with its evidence (single space between them) so a
# single vectorizer sees both texts of a pair.
for split_df in (train_df, dev_df):
    split_df['Combined_Text'] = split_df['Claim'] + " " + split_df['Evidence']

# Targets come straight from the 'label' column of each split.
y_train = train_df['label'].values
y_dev = dev_df['label'].values

# The vectorizer is fitted on the training text only and then reused, already
# fitted, to transform the dev text. The vocabulary is capped at 5000 features,
# and the sparse results are converted to dense arrays.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['Combined_Text']).toarray()
X_dev = tfidf.transform(dev_df['Combined_Text']).toarray()

# Training Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier with the library's default settings;
# `fit` returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict on the dev split and score the predictions against the dev labels.
y_dev_pred = model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
dev_report = classification_report(y_dev, y_dev_pred)

print("Accuracy:", dev_accuracy)
print("Classification Report:\n", dev_report)

Accuracy on Dev Set: 0.7934525818427269
Classification Report on Dev Set:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87      4286
           1       0.69      0.45      0.55      1640

    accuracy                           0.79      5926
   macro avg       0.75      0.69      0.71      5926
weighted avg       0.78      0.79      0.78      5926

