In [4]:
from FileTransformer import read_and_process_file
from bert_score_eval import get_bertscore
from perpexlity_score import perplexity_score
from absa import Absa

In [5]:
# Example usage: load the conversation file through the project helper.
# NOTE(review): this cell reads 'response1.txt' from the working directory, while a
# later cell builds TextFileTransformer with 'Data/response1.txt' -- confirm which
# location is intended.
file_path = 'response1.txt'
# Later cells index df['Dialogue'], so the helper presumably returns a DataFrame
# with that column -- verify against FileTransformer.
df=read_and_process_file(file_path)

In [6]:
def calculate_pairwise_bert_score(df):
    """Score every dialogue against the dialogue in the row just before it.

    Three float columns -- ``Precision``, ``Recall`` and ``F1_Score`` -- are
    added to ``df``. For each row after the first, the dialogue in that row is
    passed to ``get_bertscore`` as the candidate and the dialogue in the
    previous row as the reference; the three returned values are stored in
    the candidate's row. The first row has no predecessor, so its scores stay
    at ``0.0``.

    Rows are addressed purely by position, so the result does not depend on
    the frame's index labels (a filtered, re-ordered or duplicate-labelled
    index is handled the same way as the default 0..n-1 index).

    Args:
        df: DataFrame with a ``Dialogue`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the three score columns.
    """
    # Create the score columns up front so the first row keeps a 0.0 default.
    df['Precision'] = 0.0
    df['Recall'] = 0.0
    df['F1_Score'] = 0.0

    # Resolve column positions once; reads and writes below both use .iloc so
    # they are guaranteed to refer to the same physical rows.
    dialogue_col = df.columns.get_loc('Dialogue')
    precision_col = df.columns.get_loc('Precision')
    recall_col = df.columns.get_loc('Recall')
    f1_col = df.columns.get_loc('F1_Score')

    # Walk from the second row to the last row, comparing each with its predecessor.
    for row in range(1, len(df)):
        candidate_dialogue = df.iloc[row, dialogue_col]
        reference_dialogue = df.iloc[row - 1, dialogue_col]

        precision, recall, f1_score = get_bertscore(candidate_dialogue, reference_dialogue)

        df.iloc[row, precision_col] = precision
        df.iloc[row, recall_col] = recall
        df.iloc[row, f1_col] = f1_score
    return df

In [7]:
# Add the Precision / Recall / F1_Score columns. The function mutates df in place
# and returns that same object, so this rebinding does not create a copy.
df=calculate_pairwise_bert_score(df)



In [8]:
def calculate_perplexity_score(df):
    """Attach a ``Perplexity`` column computed from each dialogue.

    A single ``perplexity_score`` object is created and its ``calculate``
    method is applied to every value of the ``Dialogue`` column; the results
    become the new ``Perplexity`` column.

    Args:
        df: DataFrame with a ``Dialogue`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``Perplexity`` added.
    """
    # One scorer instance is shared across all rows.
    scorer = perplexity_score()
    dialogues = df['Dialogue']
    df['Perplexity'] = dialogues.apply(scorer.calculate)
    return df

In [9]:
# Add the Perplexity column; df is modified in place and the same object is returned.
df=calculate_perplexity_score(df)

In [10]:
# Run the project's Absa step on the scored frame.
# NOTE(review): Absa is opaque from this notebook. The output displayed further down
# shows a sentiment_scores column on the result, and a Topic/Count table is printed
# while this cell runs -- confirm both against absa.py.
absa=Absa(df)
new_df=absa.get_absa()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




   Topic  Count
1      0     14
0      1      8


In [11]:
# Preview up to the first 22 rows of the enriched frame; as the last expression of
# the cell it is rendered as the cell's output.
new_df.head(22)

Unnamed: 0,Character,Dialogue,Precision,Recall,F1_Score,Perplexity,sentiment_scores
0,Prompt,"The free market system, competitive capitalism...",0.0,0.0,0.0,41.121056,"{'free': ('Positive', 0.9143444299697876), 'ma..."
1,Agent1,The evidence suggests that economic growth and...,0.536521,0.659616,0.591735,27.718516,"{'free': ('Negative', 0.8943428993225098), 'ma..."
2,Agent2,While I understand the left leaning lawyer's c...,0.653876,0.637845,0.645761,11.941216,"{'free': ('Positive', 0.9617961645126343), 'ma..."
3,Agent1,As a lawyer representing a left leaning non pr...,0.665992,0.72777,0.695512,21.670938,"{'free': ('Negative', 0.7954026460647583), 'ma..."
4,Agent2,As a conservative lawyer representing a right ...,0.750525,0.706991,0.728108,22.454166,"{'free': ('Positive', 0.8486932516098022), 'ma..."
5,Agent1,I strongly disagree with my conservative count...,0.775066,0.758312,0.766597,20.882995,"{'free': ('Positive', 0.6086866855621338), 'ma..."
6,Agent2,I strongly disagree with my liberal counterpar...,0.761171,0.799138,0.779693,19.666692,"{'free': ('Positive', 0.9319241642951965), 'ma..."
7,Agent1,I understand my conservative counterpart's poi...,0.733193,0.728131,0.730653,20.772899,"{'free': ('Negative', 0.5215399861335754), 'ma..."
8,Agent2,I respectfully disagree with my counterpart's ...,0.71535,0.72516,0.720221,26.349162,"{'government': ('Negative', 0.6700146198272705..."
9,Agent1,"I understand my counterpart's perspective, but...",0.681136,0.699712,0.690299,21.200003,"{'government': ('Negative', 0.5022582411766052..."


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class TextFileTransformer(BaseEstimator, TransformerMixin):
    """Transformer that turns a conversation text file into a scored DataFrame.

    The file is expected to alternate character names and dialogues, separated
    by the marker ``#####``. ``transform`` reads the file and then adds the
    pairwise ``get_bertscore`` columns, a perplexity column and the output of
    ``Absa.get_absa``.

    Attributes:
        file_path: Path of the conversation file to read.
        data: The DataFrame produced by the most recent pipeline step, or
            ``None`` before ``transform`` has run or when ``read_file`` failed.
    """

    def __init__(self, file_path):
        """Store the path of the file to process; nothing is read yet."""
        self.file_path = file_path
        self.data = None

    def fit(self, X, y=None):
        """No-op kept for the transformer interface; returns ``self``."""
        # The fit method is typically used for parameter tuning in transformers.
        return self

    def transform(self, X):
        """Run the whole pipeline on ``self.file_path`` and return the result.

        ``X`` is never read; the input always comes from ``self.file_path``.
        Each step stores its result in ``self.data`` before the next one runs.

        Note: ``read_file`` returns ``None`` when the file cannot be read, in
        which case the scoring steps that follow cannot operate on it.

        Returns:
            The DataFrame returned by ``Absa(self.data).get_absa()``.
        """
        print("1. Reading File into Dataframe")
        self.data=self.read_file()
        print("2. Calculating pairwise Bert Metrics")
        self.data=self.calculate_pairwise_bert_score()
        print("3. Calculating Perplexity Score Metrics")
        self.data=self.calculate_perplexity_score()
        print("4. Calculating aspect based Sentiments Metrics")
        absa=Absa(self.data)
        self.data=absa.get_absa()
        return self.data

    def read_file(self):
        """
        Input: File with conversations separated by #####
        Output : Dataframe with character and dialogue

        The file content is split on ``#####`` and each piece is stripped of
        surrounding whitespace. Pieces at even positions become ``Character``
        and the piece that follows each becomes ``Dialogue``; a trailing
        character name without a dialogue gets an empty string.

        Returns ``None`` (after printing a message) if the file is missing or
        any other exception occurs while reading or parsing.
        """
        try:
            with open(self.file_path, 'r') as file:
                # Read the entire file into a string
                file_content = file.read()

            # Separate on the "#####" marker and trim whitespace / newlines
            cleaned_chunks = [chunk.strip() for chunk in file_content.split('#####')]

            # Pair each character name with the chunk that follows it
            data = [
                {
                    'Character': cleaned_chunks[i],
                    # Handle odd-length chunks: last name has no dialogue
                    'Dialogue': cleaned_chunks[i + 1] if i + 1 < len(cleaned_chunks) else "",
                }
                for i in range(0, len(cleaned_chunks), 2)
            ]

            # Convert the list of dictionaries to a DataFrame
            return pd.DataFrame(data)

        except FileNotFoundError:
            # Report the path this instance was configured with, not a
            # same-named variable from the surrounding notebook scope.
            print(f"Error: File '{self.file_path}' not found.")
            return None
        except Exception as e:
            print(f"Error: {e}")
            return None

    def calculate_perplexity_score(self):
        """Add a ``Perplexity`` column to ``self.data`` and return the frame.

        One ``perplexity_score`` object is created and its ``calculate``
        method is applied to every value of the ``Dialogue`` column.
        """
        px=perplexity_score()
        self.data['Perplexity']=self.data['Dialogue'].apply(px.calculate)
        return self.data

    def calculate_pairwise_bert_score(self):
        """Score each dialogue in ``self.data`` against the previous row's.

        Adds ``Precision``, ``Recall`` and ``F1_Score`` columns. For every row
        after the first, ``get_bertscore(candidate, reference)`` is called
        with that row's dialogue as candidate and the previous row's dialogue
        as reference. The first row keeps ``0.0`` in all three columns.

        Rows are addressed by position for both reads and writes, so the
        result does not depend on the frame's index labels.

        Returns:
            ``self.data`` (modified in place).
        """
        # Create new columns to store similarity scores
        self.data['Precision'] = 0.0
        self.data['Recall'] = 0.0
        self.data['F1_Score'] = 0.0

        # Resolve column positions once for positional access below
        dialogue_col = self.data.columns.get_loc('Dialogue')
        precision_col = self.data.columns.get_loc('Precision')
        recall_col = self.data.columns.get_loc('Recall')
        f1_col = self.data.columns.get_loc('F1_Score')

        # Walk from the second row to the last row, comparing each with its predecessor
        for row in range(1, len(self.data)):
            candidate_dialogue = self.data.iloc[row, dialogue_col]
            reference_dialogue = self.data.iloc[row - 1, dialogue_col]

            # Calculate similarity scores
            precision, recall, f1_score = get_bertscore(candidate_dialogue, reference_dialogue)

            # Store the scores in the DataFrame
            self.data.iloc[row, precision_col] = precision
            self.data.iloc[row, recall_col] = recall
            self.data.iloc[row, f1_col] = f1_score
        return self.data

# # Example usage:
# file_path = 'your_file.txt'  # Replace with the actual file path
# text_transformer = TextFileTransformer(file_path)

# # Transform the file
# df_result = text_transformer.transform(None)

# # Display the resulting DataFrame
# print(df_result)

In [13]:
# Path of the conversation file. This rebinds the notebook-level file_path that an
# earlier cell set to 'response1.txt'.
file_path = 'Data/response1.txt'  # Replace with the actual file path
text_transformer = TextFileTransformer(file_path)

# Run the full pipeline. transform() reads from the transformer's own file_path and
# never uses its X argument, so None is passed as a placeholder.
df_result = text_transformer.transform(None)




   Topic  Count
0      0     14
1      1      8
