# Installing and Importing Dependencies

In [10]:
%pip install pandas
%pip install nltk
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
from joblib import dump

# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer data (word_tokenize is used in preprocessing)
#   - 'stopwords': word lists read via stopwords.words('english')
# The value of the last call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Preprocessing

In [None]:
# Matches any single character that is neither a word character (\w) nor
# whitespace (\s); compiled once here so preprocess_text can reuse it per row.
PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
# English stopword list from NLTK, held as a set for membership tests
# during token filtering in preprocess_text.
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one text cell into a space-joined string of filtered tokens.

    The text is lowercased, every character matched by ``PUNCTUATION_PATTERN``
    is deleted, the result is tokenized with ``word_tokenize``, and tokens
    present in ``STOPWORDS`` are dropped.

    Args:
        text: Raw cell value. Normally a ``str``. Missing values (``None`` /
            ``NaN``, e.g. an empty CSV field read by ``pd.read_csv``) are
            treated as the empty string; any other non-string scalar is
            converted with ``str()``.

    Returns:
        str: The remaining tokens joined by single spaces (possibly ``''``).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower(); without
        # this guard a single blank Claim/Evidence aborts the whole .apply().
        text = '' if pd.isna(text) else str(text)

    text = text.lower()  # Convert text to lowercase
    text = PUNCTUATION_PATTERN.sub('', text)  # Remove punctuation
    # NOTE(review): punctuation is removed before stopword filtering, so a
    # contraction such as "don't" becomes "dont" and may no longer match the
    # stopword list -- confirm this ordering is intended.
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in STOPWORDS]  # Stopword removal
    return ' '.join(tokens)

def prepare_data(df):
    """Return a preprocessed copy of ``df`` with an added ``Combined_Text`` column.

    The ``Claim`` and ``Evidence`` columns are each passed through
    ``preprocess_text``; ``Combined_Text`` is the cleaned claim, a single
    space, and the cleaned evidence. The input frame is not modified.

    Args:
        df: DataFrame containing ``Claim`` and ``Evidence`` columns.

    Returns:
        A new DataFrame with cleaned ``Claim`` / ``Evidence`` and the extra
        ``Combined_Text`` column appended.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    cleaned = df.assign(
        Claim=df['Claim'].apply(preprocess_text),
        Evidence=df['Evidence'].apply(preprocess_text),
    )
    cleaned['Combined_Text'] = cleaned['Claim'] + " " + cleaned['Evidence']
    return cleaned

# Load each split from CSV and run it straight through the shared preprocessing.
train_df = pd.read_csv('data/train.csv').pipe(prepare_data)
dev_df = pd.read_csv('data/dev.csv').pipe(prepare_data)

# Hyperparameter Selection

In [None]:
# Inputs and labels for the training and development splits
train_text = train_df['Combined_Text'].values
y_train = train_df['label'].values
dev_text = dev_df['Combined_Text'].values
y_dev = dev_df['label'].values

# Two-step estimator: TF-IDF features feeding a logistic-regression classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression()),
])

# Search space: vectorizer vocabulary size, n-gram range and document-frequency
# cut-offs, crossed with the logistic-regression solver
param_grid = [
    {
        'tfidf__max_features': [5000, 10000, 15000],
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'tfidf__min_df': [2, 3, 5],
        'tfidf__max_df': [0.75, 0.85, 0.95],
        'model': [LogisticRegression()],
        'model__solver': ['lbfgs', 'liblinear']
    }
]

# Exhaustive search with 3-fold cross-validation, ranked by accuracy,
# run on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(train_text, y_train)

# Score the selected configuration on the held-out development split
y_dev_pred = grid_search.predict(dev_text)

# Report the winning configuration and its scores
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("\nDevelopment set results:")
print("Accuracy:", accuracy_score(y_dev, y_dev_pred))
print("Classification Report:\n", classification_report(y_dev, y_dev_pred))

# Persist the best pipeline (vectorizer + classifier) to disk; the dump()
# call stays last so its return value is the cell's displayed output
best_model_pipeline = grid_search.best_estimator_
dump(best_model_pipeline, 'best_model_pipeline.joblib')

Best parameters: {'model': LogisticRegression(), 'model__solver': 'lbfgs', 'tfidf__max_df': 0.75, 'tfidf__max_features': 15000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 4)}
Best cross-validation score: 0.80

Development set results:
Accuracy: 0.8002024974687817
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87      4286
           1       0.72      0.46      0.56      1640

    accuracy                           0.80      5926
   macro avg       0.77      0.69      0.72      5926
weighted avg       0.79      0.80      0.78      5926



['best_model_pipeline.joblib']

# Write Predictions Using Best Configuration

In [None]:
def write_predictions(input_csv_path, tfidf, model, output_csv_path='predictions.csv'):
    """Predict labels for a CSV of claim/evidence pairs and write them to disk.

    The input is preprocessed with ``prepare_data`` (the same routine applied
    to the training and development splits), vectorized with ``tfidf`` and
    classified with ``model``.

    Args:
        input_csv_path: Path to a CSV with ``Claim`` and ``Evidence`` columns.
        tfidf: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict``.
        output_csv_path: Destination CSV; it receives a single ``prediction``
            column and no index.

    Returns:
        None. The predictions are written to ``output_csv_path``.
    """
    # Reuse the shared preprocessing so test-time text is cleaned exactly as
    # the training text was, instead of maintaining a second copy of the steps.
    test_df = prepare_data(pd.read_csv(input_csv_path))

    # Keep the TF-IDF output sparse: the fitted pipeline hands the vectorizer's
    # output straight to the classifier, and densifying a
    # (rows x vocabulary) matrix with .toarray() only costs memory.
    # NOTE(review): assumes `model` accepts sparse input, as the
    # LogisticRegression used in this notebook does.
    X_test = tfidf.transform(test_df['Combined_Text'])

    predictions = model.predict(X_test)

    pd.DataFrame(predictions, columns=['prediction']).to_csv(output_csv_path, index=False)

# Hand the fitted vectorizer and classifier to the writer, looked up by the
# step names they were given when the pipeline was built.
write_predictions(
    "data/test.csv",
    tfidf=best_model_pipeline.named_steps['tfidf'],
    model=best_model_pipeline.named_steps['model'],
)