In [1]:
from diffeval import *

In [2]:
train_path = "preprocessed_A1.csv"
test_path = "A2_test_dataset.csv"

In [30]:
from lib2to3.pgen2.pgen import generate_grammar
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from LanguageModels import *
from preprocess_text import *

def train_and_evaluate(train_sentences, train_labels, test_sentences, test_labels):
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(train_sentences, train_labels)
    predicted_test_labels = model.predict(test_sentences)
    return accuracy_score(test_labels, predicted_test_labels)

def preprocess_text(text):
    text = lowercase_text(text)
    text = remove_url_html(text)
    text = remove_users(text)
    text = remove_punctuations(text)
    text = remove_whitespaces(text)
    text = tokenization(text)
    text = spelling_correction(text)
    text = remove_alphanum(text)
    return ' '.join(text)

class IntrinsicEvaluation:
    """
    Perform Intrinsic Evaluation on the given generated sentences
    """
    def __init__(self, generated_path):
        self.generated_path = generated_path
        
    def get_perplexities(self):
        with open(self.generated_path, "r") as f:
            lines = f.readlines()
        perplexities = []
        sentences = []
        for line in lines:
            ppl = float(line.split(" ")[-1].strip())
            sentence = line.split(" ")[:-1]
            sentence = ' '.join(sentence)
            perplexities.append(ppl)
            sentences.append(sentence)
        return perplexities, sentences

    def get_avg_perplexity(self, perplexities):
        return sum(perplexities)/len(perplexities)


class ExtrinsicEvaluation:
    """
    Perform Extrinsic Evaluation based on the Dataset type
    """
    def __init__(self, train_path, test_path, addGen, generated_path_pos, generated_path_neg):
        self.train_path = train_path
        self.test_path = test_path
        self.addGen = addGen
        self.generated_path_pos = generated_path_pos
        self.generated_path_neg = generated_path_neg
    
    def build_train(self):
        train_df = pd.read_csv(self.train_path)
        train_sentences = train_df['preprocessed_text']
        for i in range(len(train_sentences)):
            train_sentences[i] = eval(train_sentences[i])
        train_labels = train_df['LABEL']

        for i in range(len(train_sentences)):
            train_sentences[i] = ' '.join(train_sentences[i])

        train_sentences = train_sentences.values
        train_labels = train_labels.values

        if self.addGen == True:
            with open(self.generated_path_pos, "r") as f:
                lines1 = f.readlines()
            with open(self.generated_path_neg, "r") as f:
                lines2 = f.readlines()
            lines = lines1 + lines2
            generated_sentences = []
            for line in lines:
                sentence = line.split(" ")[:-1]
                sentence = ' '.join(sentence)
                generated_sentences.append(sentence)
            vader = SentimentIntensityAnalyzer()
            generated_sentiments = []
            generated_sentiments = [1]*250 + [0]*250
            # for sentence in generated_sentences:
            #     sentiment = vader.polarity_scores(sentence)
            #     if sentiment['compound'] >= 0:
            #         generated_sentiments.append(1)
            #     elif sentiment['compound'] <= 0:
            #         generated_sentiments.append(0)
            # Concatenate the generated sentences with the original training sentences
            train_sentences = np.concatenate((train_sentences, generated_sentences))
            train_labels = np.concatenate((train_labels, generated_sentiments))
        print("Train Sentences: ", len(train_sentences))
        print("Train Labels: ", len(train_labels))
        return train_sentences, train_labels
    
    def build_test(self):
        test_df = pd.read_csv("A2_test_dataset_preprocessed.csv")
        # test_sentences = test_df['TEXT'].apply(preprocess_text)
        test_sentences = test_df['pretext']
        test_labels = test_df['LABEL']
        test_labels = test_labels.values
        return test_sentences.values, test_labels

    def evaluate(self):
        train_sentences, train_labels = self.build_train()
        test_sentences, test_labels = self.build_test()
        return train_and_evaluate(train_sentences, train_labels, test_sentences, test_labels)


In [31]:
extEval = ExtrinsicEvaluation(train_path, test_path, addGen=False, generated_path_pos=None, generated_path_neg=None)

In [32]:
extEval.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4287
Train Labels:  4287


KeyboardInterrupt: 

In [7]:
a = ExtrinsicEvaluation(train_path, test_path, addGen=True, generated_path_pos=r"generated_sentences\pos_gen_only_ext.txt", generated_path_neg=r"generated_sentences\neg_gen_only_ext.txt")

In [8]:
a.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.906832298136646

In [12]:
b = ExtrinsicEvaluation(train_path, test_path, addGen=True, generated_path_pos=r"generated_sentences\pos_gen_add_numerator.txt", generated_path_neg=r"generated_sentences\neg_gen_add_numerator.txt")

In [13]:
b.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.8990683229813664

In [14]:
c = ExtrinsicEvaluation(train_path, test_path, addGen=True, generated_path_pos=r"generated_sentences\pos_gen_div_denominator.txt", generated_path_neg=r"generated_sentences\neg_gen_div_denominator.txt")

In [15]:
c.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.9006211180124224

In [18]:
d = ExtrinsicEvaluation(train_path, test_path, addGen=True, 
generated_path_pos=r"generated_sentences\pos_gen_mul_numerator.txt", 
generated_path_neg=r"generated_sentences\neg_gen_mul_numerator.txt")

In [19]:
d.evaluate()

Train Sentences:  4787
Train Labels:  4787


0.8944099378881988

In [20]:
e = ExtrinsicEvaluation(train_path, test_path, addGen=True, 
generated_path_pos=r"generated_sentences\pos_hf_prompts.txt", 
generated_path_neg=r"generated_sentences\neg_hf_prompts.txt")

In [21]:
e.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.8881987577639752

In [23]:
f = ExtrinsicEvaluation(train_path, test_path, addGen=True, 
generated_path_pos=r"generated_sentences\pos_ppl_normalized.txt", 
generated_path_neg=r"generated_sentences\neg_ppl_normalized.txt")

In [24]:
f.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.8819875776397516

In [25]:
g = ExtrinsicEvaluation(train_path, test_path, addGen=True, 
generated_path_pos=r"generated_sentences\pos_vader_textblob.txt", 
generated_path_neg=r"generated_sentences\neg_vader_textblob.txt")

In [26]:
g.evaluate()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = eval(train_sentences[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sentences[i] = ' '.join(train_sentences[i])


Train Sentences:  4787
Train Labels:  4787


0.889751552795031