In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import scipy.sparse
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources used below: tokenizer models, WordNet (lemmatizer)
# and the English stopword list.
nltk.download("punkt")
# Newer NLTK releases make word_tokenize load the "punkt_tab" resource instead
# of the pickled "punkt" models; download both so tokenization works either way.
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Config

In [3]:
# File paths read by the cells below.
config_testing = {
    "testdata": "test.csv",            # Input test data file  [MODIFY THIS]
    "model": "LR.pickle",        # Input model pickle file (opened in logistic_regression_test)  [MODIFY THIS]
    "vectorizer": "TFIDF.pickle",          # Vectorizer pickle file (opened in TFIDF_features)  [MODIFY THIS]
}

# Preprocessing

In [4]:
# Read the CSV file named by config_testing['testdata']
test_data = pd.read_csv(config_testing['testdata'])

# Preprocessing resources used by preprocess_and_lemmatize:
# the English stopword set and a WordNet lemmatizer
english_stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_and_lemmatize(word):
    """Clean one text value and return its tokens plus the re-joined string.

    Null input (as judged by ``pd.isnull``) yields ``([], "")``. Otherwise every
    non-ASCII-letter character is replaced by a space, the text is lower-cased
    and tokenized with ``nltk.word_tokenize``; tokens found in
    ``english_stopwords`` are dropped and the remaining ones are lemmatized.

    Returns:
        tuple: ``(tokens, text)`` where ``tokens`` is the list of lemmatized
        tokens and ``text`` is those tokens joined by single spaces.
    """
    if pd.isnull(word):
        return [], ""

    # Keep letters only, then case-fold.
    letters_only = re.sub('[^a-zA-Z]', ' ', word).lower()

    kept_tokens = []
    for token in nltk.word_tokenize(letters_only):
        # The stopword check is applied to the raw token, before lemmatization.
        if token in english_stopwords:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return kept_tokens, ' '.join(kept_tokens)


# Add cleaned text columns for 'premise' and 'hypothesis' of the test data
processed_columns = {
    'premise': 'premise_processed',
    'hypothesis': 'hypothesis_processed',
}
for df in (test_data,):
    for raw_name, processed_name in processed_columns.items():
        # Element [1] of the helper's result is the re-joined cleaned string.
        df[processed_name] = df[raw_name].apply(
            lambda text: preprocess_and_lemmatize(text)[1]
        )

# Pack the raw 'premise' / 'hypothesis' arrays as [sentence1, sentence2].
# NOTE(review): the *_processed columns created above are not used here;
# confirm this matches how the vectorizer and model were trained.
test_data_formatted = [
    test_data[raw_name].values for raw_name in ('premise', 'hypothesis')
]

# Defining the Model


In [5]:
def TFIDF_features(data, mode):
    """Build stacked TF-IDF features for sentence pairs.

    Parameters
    ----------
    data : sequence
        ``data[0]`` and ``data[1]`` are equal-length sequences holding the
        first and second sentence of each pair. Every item is passed through
        ``str()`` and has runs of whitespace collapsed to single spaces.
    mode : str
        ``"train"`` fits a new ``TfidfVectorizer`` on the concatenated pairs
        and pickles it to ``config_testing['vectorizer']``; ``"test"`` loads
        the vectorizer from that same path.

    Returns
    -------
    scipy sparse matrix
        The TF-IDF rows of sentence 1 and sentence 2 stacked horizontally,
        one row per pair.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"test"``, or if the two
        sentence sequences differ in length.
    """
    # Validate up front: raising keeps the notebook kernel alive, whereas the
    # previous exit(0) terminated the process with a success status.
    if mode not in ("train", "test"):
        raise ValueError(
            f'Invalid mode selection: {mode!r} (expected "train" or "test")'
        )

    list_sentence1 = data[0]
    list_sentence2 = data[1]
    if len(list_sentence1) != len(list_sentence2):
        raise ValueError(
            "Sentence lists must have the same length, got "
            f"{len(list_sentence1)} and {len(list_sentence2)}"
        )

    # str() also turns missing values (NaN/None) into the literal tokens
    # "nan"/"None"; kept as-is so features stay consistent with the original.
    corpus_sentence1 = [' '.join(str(item).split()) for item in list_sentence1]
    corpus_sentence2 = [' '.join(str(item).split()) for item in list_sentence2]

    if mode == "train":
        # The vocabulary is fitted on each pair joined into one document.
        corpus = [
            first + " " + second
            for first, second in zip(corpus_sentence1, corpus_sentence2)
        ]
        TFIDF_vect = TfidfVectorizer()
        TFIDF_vect.fit(corpus)
        with open(config_testing['vectorizer'], "wb") as file:
            pickle.dump(TFIDF_vect, file)
    else:
        # SECURITY: pickle.load can execute arbitrary code; only point
        # config_testing['vectorizer'] at a file from a trusted source.
        with open(config_testing['vectorizer'], "rb") as file:
            TFIDF_vect = pickle.load(file)

    tfidf_sentence1 = TFIDF_vect.transform(corpus_sentence1)
    tfidf_sentence2 = TFIDF_vect.transform(corpus_sentence2)

    # Columns of sentence 1 come first, followed by the columns of sentence 2.
    return scipy.sparse.hstack((tfidf_sentence1, tfidf_sentence2))


# Evaluation


In [6]:
# Raw 'premise' / 'hypothesis' arrays in the [sentence1, sentence2] layout that
# TFIDF_features reads as data[0] / data[1]. This repeats the assignment made
# at the end of the preprocessing cell.
test_data_formatted = [test_data['premise'].values, test_data['hypothesis'].values]

In [7]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)
import matplotlib.pyplot as plt
import seaborn as sns


# logistic_regression_test function
def logistic_regression_test(test_data):
    """Predict labels for sentence pairs and write them to ./predictions.csv.

    ``test_data`` is passed unchanged to ``TFIDF_features(..., "test")``. The
    model is unpickled from ``config_testing['model']``, its ``predict`` output
    is stored in a single 'prediction' column, and that frame is saved without
    the index. Nothing is returned.
    """
    # Features are built first, so a missing vectorizer file fails before the
    # model file is opened.
    features = TFIDF_features(test_data, "test")

    # Unpickle the trained model named in the config.
    with open(config_testing['model'], "rb") as model_file:
        classifier = pickle.load(model_file)

    predicted = classifier.predict(features)

    # One 'prediction' column, written without the index.
    pd.DataFrame(predicted, columns=['prediction']).to_csv(
        './predictions.csv', index=False
    )
    print("predictions.csv file has been dumped.")

In [8]:
# Run the model on the test data; this writes ./predictions.csv and prints a
# confirmation message (no evaluation metrics are computed by this call)
logistic_regression_test(test_data_formatted)

predictions.csv file has been dumped.
