In [None]:
!pip install stanza

In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import stanza

# Fetch the English Stanza resources. This performs a download, so it only
# needs to succeed once per environment.
stanza.download('en')  # run once
# Module-level pipeline shared by the analyzers below (CoreferenceAnalyzer
# stores this object as `self.nlp`). The processor list ends with `coref`;
# presumably that processor is what provides `doc.coref` — confirm against
# the Stanza documentation for the installed version.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma,depparse,ner,constituency,coref'
)

In [None]:
class AnalyzerBase:
    """Base class for text analysis modules: loads the dataset and defines the `run()` hook.

    The input file is read into a pandas DataFrame from either a CSV or an
    Excel (.xlsx) file, and every missing value is replaced by an empty string
    so that downstream string handling never sees NaN.

    Attributes:
        df (pd.DataFrame): The loaded data with missing values replaced by ''.
    """
    def __init__(self, filepath):
        """Loads the dataset at `filepath` into `self.df`.

        Args:
            filepath (str | os.PathLike): Path to the input file. The extension
                must be `.csv` or `.xlsx`; it is matched case-insensitively.

        Raises:
            ValueError: If the file extension is not supported.
        """
        # Normalise once so that "DATA.CSV" and pathlib.Path inputs are accepted.
        lowered = str(filepath).lower()

        if lowered.endswith(".xlsx"):
            self.df = pd.read_excel(filepath)
        elif lowered.endswith(".csv"):
            self.df = pd.read_csv(filepath)
        else:
            raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")

        # Empty cells become '' so later str()/strip() calls behave uniformly.
        self.df = self.df.fillna('')

    def run(self):
        """Runs the analysis; must be overridden by every subclass.

        Raises:
            NotImplementedError: Always, unless overridden in a child class.
        """
        raise NotImplementedError("Subclasses must implement this method")


class EntailmentAnalyzer(AnalyzerBase):
    """Scores each sentence against its preceding and following story context.

    A HuggingFace zero-shot classification pipeline (facebook/bart-large-mnli)
    is called once per (Sentence, Pre_Story) and (Sentence, Post_Story) pair,
    and the resulting scores are written to a CSV together with per-bin averages.

    Attributes:
        classifier: A HuggingFace pipeline for zero-shot classification.
    """
    def __init__(self, filepath):
        """Loads the dataset and the zero-shot classification model.

        Args:
            filepath (str): Path to the input CSV or XLSX file.
        """
        super().__init__(filepath)
        self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    def get_entailment_score(self, context, hypothesis):
        """Returns the classifier score for one (context, hypothesis) pair.

        `hypothesis` is passed to the classifier as the sequence to classify and
        `context` is supplied as the single candidate label, which the pipeline
        substitutes into the template "This sentence is about {}.".

        NOTE(review): `run()` passes the 'Sentence' column as `context` and the
        story column as `hypothesis`, so the parameter names do not describe the
        values they receive there. The call pattern is kept as-is so that
        previously computed scores stay reproducible — confirm the intended roles.

        Args:
            context (str): Text used as the only candidate label.
            hypothesis (str): Text handed to the classifier as the input sequence.

        Returns:
            float: The first entry of the classifier's 'scores' list, or 0.0 when
            either input is empty or whitespace-only.
        """
        if not context.strip() or not hypothesis.strip():
            return 0.0
        result = self.classifier(
            hypothesis,
            candidate_labels=[context],
            hypothesis_template="This sentence is about {}.",
        )
        return result['scores'][0]

    def run(self):
        """Runs the full entailment analysis on the loaded dataset.

        For each row:
        - Scores 'Sentence' against 'Pre_Story' → `Pre_ent_score`
        - Scores 'Sentence' against 'Post_Story' → `Post_ent_score`

        It also:
        - Assigns rows to bins of 10 consecutive rows ('Bin 1', 'Bin 2', ...).
        - Adds an '[Average]' row after each bin with the mean of both scores.
        - Inserts an empty row after each bin for readability.

        Bins are written in row order ('Bin 2' before 'Bin 10').

        Outputs:
            CSV file: 'big_story_entailment.csv'
        """
        # Registers DataFrame.progress_apply so long runs show a progress bar.
        tqdm.pandas()

        # Compute entailment scores
        self.df['Pre_ent_score'] = self.df.progress_apply(
            lambda row: self.get_entailment_score(str(row['Sentence']), str(row['Pre_Story'])), axis=1)
        self.df['Post_ent_score'] = self.df.progress_apply(
            lambda row: self.get_entailment_score(str(row['Sentence']), str(row['Post_Story'])), axis=1)

        # Ten consecutive rows per bin: rows 0-9 → "Bin 1", rows 10-19 → "Bin 2", ...
        self.df['Bin'] = [f"Bin {position // 10 + 1}" for position in range(len(self.df))]

        final_rows = []
        # sort=False keeps the bins in order of first appearance; the default
        # lexicographic sort would emit "Bin 10" ahead of "Bin 2".
        for bin_name, group in self.df.groupby('Bin', sort=False):
            # The bin's own rows, unchanged
            final_rows.extend(row for _, row in group.iterrows())

            # Summary row carrying the bin means
            final_rows.append(pd.Series({
                'Pre_ent_score': group['Pre_ent_score'].mean(),
                'Post_ent_score': group['Post_ent_score'].mean(),
                'Bin': bin_name,
                'Sentence': '[Average]',  # optional marker
            }))

            # Blank separator row
            final_rows.append(pd.Series(dtype=object))

        final_df = pd.DataFrame(final_rows)

        # Save to CSV
        final_df.to_csv('big_story_entailment.csv', index=False)
        print("✅ Entailment scores saved to 'big_story_entailment.csv'")


class CoherenceAnalyzer(AnalyzerBase):
    """Scores how similar each sentence is to its surrounding story context.

    The central 'Sentence' is compared with 'Pre_Story' and with 'Post_Story'
    using cosine similarity between SentenceTransformer embeddings.

    Attributes:
        model: The SentenceTransformer ('all-MiniLM-L6-v2') used for embeddings.
    """
    def __init__(self, filepath):
        """Loads the dataset and the sentence-embedding model.

        Args:
            filepath (str): Path to the input CSV or XLSX file.
        """
        super().__init__(filepath)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def compute_similarity(self, base, other):
        """Computes cosine similarity between the embeddings of two texts.

        Args:
            base (str): The first text.
            other (str): The second text.

        Returns:
            float: Cosine similarity of the two embeddings.
        """
        embeddings = self.model.encode([base, other], convert_to_tensor=True)
        return util.cos_sim(embeddings[0], embeddings[1]).item()

    def run(self):
        """Executes the coherence analysis workflow.

        For each row in the dataset:
        - Computes similarity between 'Sentence' and 'Pre_Story' → `Pre_similarity`
        - Computes similarity between 'Sentence' and 'Post_Story' → `Post_similarity`

        Then:
        - Assigns rows to bins of 10 consecutive rows ('Bin 1', 'Bin 2', ...)
        - Adds an '[Average]' row after each bin with the mean similarity scores
        - Inserts a blank row after each bin for formatting

        Bins are written in row order ('Bin 2' before 'Bin 10').

        Output:
            CSV file: 'big_story_coherence_scores.csv' with added similarity scores and bin averages
        """
        # Registers DataFrame.progress_apply so long runs show a progress bar.
        tqdm.pandas()

        # Calculate similarity scores
        self.df['Pre_similarity'] = self.df.progress_apply(
            lambda row: self.compute_similarity(str(row['Sentence']), str(row['Pre_Story'])), axis=1)
        self.df['Post_similarity'] = self.df.progress_apply(
            lambda row: self.compute_similarity(str(row['Sentence']), str(row['Post_Story'])), axis=1)

        # Ten consecutive rows per bin: rows 0-9 → "Bin 1", rows 10-19 → "Bin 2", ...
        self.df['Bin'] = [f"Bin {position // 10 + 1}" for position in range(len(self.df))]

        final_rows = []
        # sort=False keeps the bins in order of first appearance; the default
        # lexicographic sort would emit "Bin 10" ahead of "Bin 2".
        for bin_name, group in self.df.groupby('Bin', sort=False):
            final_rows.extend(row for _, row in group.iterrows())

            # Summary row carrying the bin means
            final_rows.append(pd.Series({
                'Pre_similarity': group['Pre_similarity'].mean(),
                'Post_similarity': group['Post_similarity'].mean(),
                'Bin': bin_name,
                'Sentence': '[Average]'  # optional label
            }))

            # Blank separator row
            final_rows.append(pd.Series(dtype=object))

        final_df = pd.DataFrame(final_rows)

        # Save to CSV
        final_df.to_csv('big_story_coherence_scores.csv', index=False)
        print("✅ Coherence scores saved to 'big_story_coherence_scores.csv'")


class AverageCoherenceAnalyzer(AnalyzerBase):
    """Analyzes sentence-level coherence across full story contexts.

    For every row the text Pre_Story + Sentence + Post_Story is split into
    sentences and the cosine similarities of all adjacent sentence pairs are
    averaged.

    Attributes:
        model: A SentenceTransformer model used to compute sentence embeddings.
    """

    def __init__(self, filepath):
        """Loads the dataset and the sentence transformer model.

        Args:
            filepath (str): Path to the input CSV or XLSX file.
        """
        super().__init__(filepath)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def compute_similarity(self, s1, s2):
        """Computes cosine similarity between the embeddings of two texts.

        Args:
            s1 (str): The first sentence.
            s2 (str): The second sentence.

        Returns:
            float: Cosine similarity between sentence embeddings.
        """
        embeddings = self.model.encode([s1, s2], convert_to_tensor=True)
        return util.cos_sim(embeddings[0], embeddings[1]).item()

    def calculate_average_coherence(self, text):
        """Calculates the average coherence of a multi-sentence text.

        Sentences are obtained by splitting on '.', so other terminators
        ('?', '!') do not start a new sentence and abbreviations containing a
        period do.

        Args:
            text (str): A block of text formed by joining Pre_Story, Sentence, and Post_Story.

        Returns:
            float: The mean similarity over all adjacent sentence pairs, or 0.0
            when the text contains fewer than two non-empty sentences.
        """
        sentences = [piece.strip() for piece in text.split('.') if piece.strip()]
        if len(sentences) < 2:
            # Nothing to compare; return a float so the column dtype stays consistent.
            return 0.0
        scores = [self.compute_similarity(left, right)
                  for left, right in zip(sentences, sentences[1:])]
        return sum(scores) / len(scores)

    def run(self):
        """Executes the average coherence analysis for the full story segment.

        Steps:
        - Combines Pre_Story, Sentence, and Post_Story into a unified text block.
        - Computes average coherence for each row → `Average_Coherence`
        - Assigns rows to bins of 10 consecutive rows ('Bin 1', 'Bin 2', ...)
        - Appends an '[Average]' row per bin with the mean 'Average_Coherence'
        - Inserts an empty row after each bin for readability

        Bins are written in row order ('Bin 2' before 'Bin 10').

        Output:
            CSV file: 'full_story_average_coherence_scores.csv'
        """
        # Registers DataFrame.progress_apply so long runs show a progress bar.
        tqdm.pandas()

        # Calculate average coherence
        self.df['Average_Coherence'] = self.df.progress_apply(
            lambda row: self.calculate_average_coherence(
                str(row['Pre_Story']) + " " + str(row['Sentence']) + " " + str(row['Post_Story'])), axis=1)

        # Ten consecutive rows per bin: rows 0-9 → "Bin 1", rows 10-19 → "Bin 2", ...
        self.df['Bin'] = [f"Bin {position // 10 + 1}" for position in range(len(self.df))]

        final_rows = []
        # sort=False keeps the bins in order of first appearance; the default
        # lexicographic sort would emit "Bin 10" ahead of "Bin 2".
        for bin_name, group in self.df.groupby('Bin', sort=False):
            final_rows.extend(row for _, row in group.iterrows())

            # Summary row carrying the bin mean
            final_rows.append(pd.Series({
                'Average_Coherence': group['Average_Coherence'].mean(),
                'Bin': bin_name,
                'Sentence': '[Average]'
            }))

            # Blank separator row
            final_rows.append(pd.Series(dtype=object))

        final_df = pd.DataFrame(final_rows)

        # Save to CSV
        final_df.to_csv('full_story_average_coherence_scores.csv', index=False)
        print("✅ Average coherence scores saved to 'full_story_average_coherence_scores.csv'")


class CoreferenceAnalyzer(AnalyzerBase):
    """Analyzes coreference chains to determine whether a target phrase is referenced
    in the surrounding context (pre and post story).

    The module-level `nlp` pipeline is run on Pre_Story + Sentence + Post_Story,
    and every coreference chain containing a mention whose text equals the
    target phrase (case-insensitively) is counted per region.

    Attributes:
        nlp: The NLP pipeline used to annotate the combined text.
    """
    def __init__(self, filepath):
        """Loads the dataset and binds the shared NLP pipeline.

        Args:
            filepath (str): Path to the CSV or Excel file containing story data.
        """
        super().__init__(filepath)
        self.nlp = nlp

    def get_region(self, offset, pre_length, main_start, main_length):
        """Determines whether a character offset falls in the pre, main, or post region.

        Any offset that is neither inside the pre text nor inside the main
        sentence is reported as 'post'; that includes the single separator
        position between pre and main (offset == pre_length when
        main_start == pre_length + 1).

        Args:
            offset (int): The character offset of a mention.
            pre_length (int): Length of the pre-story text.
            main_start (int): Starting character index of the main sentence.
            main_length (int): Length of the main sentence.

        Returns:
            str: One of 'pre', 'main', or 'post'.
        """
        if offset < pre_length:
            return 'pre'
        elif main_start <= offset < main_start + main_length:
            return 'main'
        else:
            return 'post'

    @staticmethod
    def _empty_result():
        """Returns a fresh all-zero result (new list objects on every call)."""
        return {
            'mention_count_pre': 0,
            'mention_count_main': 0,
            'mention_count_post': 0,
            'coref_chain_length': 0,
            'target_phrase_mention_count': 0,
            'phrase_in_chain_pre': False,
            'phrase_in_chain_post': False,
            'referenced_elsewhere': False,
            'coref_mentions_pre': [],
            'coref_mentions_post': []
        }

    @staticmethod
    def _mention_words(doc, mention):
        """Returns the word objects covered by `mention` (may be an empty list)."""
        sentence = doc.sentences[mention.sentence]
        return sentence.words[mention.start_word:mention.end_word]

    def check_phrase_coref_in_pre_post(self, pre, main, post, target_phrase):
        """Checks coreference chains for mentions of the target phrase across pre, main, and post.

        Args:
            pre (str): Preceding story context.
            main (str): The main sentence where the target phrase is located.
            post (str): Following story context.
            target_phrase (str): The phrase to track through coreference.

        Returns:
            dict: Mention counts for pre/main/post, the total over the matching
            chains ('coref_chain_length'), the number of mentions whose text
            equals the target phrase, boolean flags for pre/post references, and
            the distinct mention texts found in pre and post (sorted, so the
            output is reproducible between runs). All counts are zero when the
            phrase is empty, the document has no coreference chains, or no chain
            contains the phrase.
        """
        target_phrase = target_phrase.strip()
        if not target_phrase:
            return self._empty_result()

        pre_text = pre.strip()
        main_text = main.strip()
        combined_text = pre_text + " " + main_text + " " + post.strip()
        pre_length = len(pre_text)
        main_start = pre_length + 1  # +1 for the separating space
        main_length = len(main_text)

        doc = self.nlp(combined_text)
        if not getattr(doc, "coref", None):
            return self._empty_result()

        # Pass 1: find every chain that contains the target phrase verbatim.
        wanted = target_phrase.lower()
        target_chain_ids = set()
        target_phrase_mention_count = 0
        for chain_id, chain in enumerate(doc.coref):
            for mention in chain.mentions:
                words = self._mention_words(doc, mention)
                if not words:
                    continue
                mention_text = " ".join(w.text for w in words).strip()
                if mention_text.lower() == wanted:
                    target_chain_ids.add(chain_id)
                    target_phrase_mention_count += 1

        if not target_chain_ids:
            return self._empty_result()

        # Pass 2: classify every mention of those chains by region.
        count_pre = count_main = count_post = 0
        coref_mentions_pre = set()
        coref_mentions_post = set()

        for chain_id in sorted(target_chain_ids):
            for mention in doc.coref[chain_id].mentions:
                words = self._mention_words(doc, mention)
                if not words:
                    continue
                mention_text = " ".join(w.text for w in words).strip()

                region = self.get_region(
                    offset=words[0].start_char,
                    pre_length=pre_length,
                    main_start=main_start,
                    main_length=main_length
                )

                if region == 'pre':
                    count_pre += 1
                    coref_mentions_pre.add(mention_text)
                elif region == 'main':
                    count_main += 1
                elif region == 'post':
                    count_post += 1
                    coref_mentions_post.add(mention_text)

        return {
            'mention_count_pre': count_pre,
            'mention_count_main': count_main,
            'mention_count_post': count_post,
            'coref_chain_length': count_pre + count_main + count_post,
            'target_phrase_mention_count': target_phrase_mention_count,
            'phrase_in_chain_pre': count_pre > 0,
            'phrase_in_chain_post': count_post > 0,
            'referenced_elsewhere': (count_pre > 0 or count_post > 0),
            # Sorted because set iteration order for strings varies between runs.
            'coref_mentions_pre': sorted(coref_mentions_pre),
            'coref_mentions_post': sorted(coref_mentions_post)
        }

    def run(self):
        """Runs coreference resolution on each row and aggregates results with bin-wise averages.

        For each input row:
        - Reads 'Pre_Story', 'Sentence', 'Post_Story' and 'target_phrase'
          (each converted with str(), so non-text cells do not crash).
        - Runs coreference resolution and determines where references occur.
        - Records mention counts in pre/main/post, total mentions, and direct matches.

        Then:
        - Assigns rows to bins of 10 consecutive rows ('Bin 1', 'Bin 2', ...)
        - Appends an '[Average]' row to each bin with the means of the count columns
        - Inserts an empty row after each bin for readability

        Bins are written in row order ('Bin 2' before 'Bin 10').

        Output:
            CSV file: 'coref_chain_counts_with_flags.csv'
        """
        # Registers DataFrame.progress_apply so long runs show a progress bar.
        tqdm.pandas()

        results_df = self.df.progress_apply(
            lambda row: pd.Series(self.check_phrase_coref_in_pre_post(
                str(row['Pre_Story']),
                str(row['Sentence']),
                str(row['Post_Story']),
                str(row['target_phrase'])
            )), axis=1
        )

        # Merge results
        self.df = pd.concat([self.df, results_df], axis=1)

        # Ten consecutive rows per bin: rows 0-9 → "Bin 1", rows 10-19 → "Bin 2", ...
        self.df["Bin"] = [f"Bin {position // 10 + 1}" for position in range(len(self.df))]

        averaged_columns = (
            "mention_count_pre",
            "mention_count_main",
            "mention_count_post",
            "coref_chain_length",
            "target_phrase_mention_count",
        )

        final_rows = []
        # sort=False keeps the bins in order of first appearance; the default
        # lexicographic sort would emit "Bin 10" ahead of "Bin 2".
        for bin_name, group in self.df.groupby("Bin", sort=False):
            final_rows.extend(row for _, row in group.iterrows())

            # Summary row carrying the means of the count columns
            summary = {column: group[column].mean() for column in averaged_columns}
            summary["Bin"] = bin_name
            summary["Sentence"] = "[Average]"  # Optional marker
            final_rows.append(pd.Series(summary))

            # Empty row for spacing
            final_rows.append(pd.Series(dtype=object))

        final_df = pd.DataFrame(final_rows)

        # Save to CSV
        final_df.to_csv("coref_chain_counts_with_flags.csv", index=False)
        print("✅ Coreference results saved to 'coref_chain_counts_with_flags.csv'")

# Entry Point
def main():
    """Interactive entry point: asks which analysis to run and on which file.

    Prompts the user to:
    - Select an analysis type (Entailment, Coherence, Average Coherence, Coreference)
    - Provide the path to an input CSV or Excel file

    Both answers are stripped of surrounding whitespace. The analyzer matching
    the choice is constructed with the path and its `run()` method is called.

    Options:
        1. Entailment Score
        2. Coherence Score
        3. Average Coherence Between Sentences
        4. Coreference Resolution

    An unrecognised choice is reported on stdout and the function returns
    without running anything; no exception is raised for it. Errors from the
    analyzer itself (e.g. an unsupported file extension) propagate.
    """
    print("Select an analysis to run:")
    print("1. Entailment Score")
    print("2. Coherence Score")
    print("3. Average Coherence Between Sentences")
    print("4. Coreference Resolution")

    # Strip so that " 1" or a pasted path with a trailing space still works.
    choice = input("Enter choice (1-4): ").strip()
    filepath = input("Enter path to CSV file: ").strip()

    analyzers = {
        '1': EntailmentAnalyzer,
        '2': CoherenceAnalyzer,
        '3': AverageCoherenceAnalyzer,
        '4': CoreferenceAnalyzer
    }

    analyzer_class = analyzers.get(choice)
    if analyzer_class is None:
        print("Invalid choice. Please enter 1-4.")
        return

    analyzer_class(filepath).run()


# Start the interactive menu only when this code is the program being run
# (`__name__` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


## Cleaning the CSV file

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Normalises tokeniser-style spacing in a single cell value.

    Applied in order:
    1. Detached contraction/possessive suffixes are re-attached
       ("it 's" → "it's", "I 'm" → "I'm", "do n't" → "don't"). Only known
       suffixes are joined, so an opening single quote ("said 'hello'") is
       left alone.
    2. Commas get no space before and exactly one after ("a ,b" → "a, b"),
       except a comma directly between two digits ("1,000"), which is kept.
    3. Whitespace just inside a pair of double quotes is removed.
    4. Runs of whitespace collapse to one space; the result is stripped.

    Args:
        text: The cell value. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # "it 's" / "they 've" / "I ' m" → suffix glued back onto the previous word.
    text = re.sub(r"\s+'\s*(s|re|ve|ll|d|m|t)\b", r"'\1", text, flags=re.IGNORECASE)
    # "do n't" → "don't"
    text = re.sub(r"\s+(n't)\b", r"\1", text, flags=re.IGNORECASE)

    # Group 1 matches only a digit-grouping comma ("1,000"); keep that one
    # untouched and normalise every other comma to ", ".
    text = re.sub(
        r"((?<=\d),(?=\d))|\s*,\s*",
        lambda match: "," if match.group(1) else ", ",
        text,
    )

    # Fix quote spacing: remove space inside quotes → " hello " → "hello"
    text = re.sub(r'"\s*(.*?)\s*"', r'"\1"', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing spaces
    return text.strip()

def clean_csv_text(input_path, output_path):
    """Reads a CSV, applies `clean_text` to every cell, and writes the result.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the cleaned CSV is written to (without the index column).
    """
    df = pd.read_csv(input_path)

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
    # use the new name when it exists and fall back on older pandas.
    apply_elementwise = df.map if hasattr(df, "map") else df.applymap
    df_cleaned = apply_elementwise(clean_text)

    # Save cleaned output
    df_cleaned.to_csv(output_path, index=False)
    print(f"✅ Cleaned CSV saved to: {output_path}")

# Example usage: this call runs as soon as the cell/module is executed and
# reads/writes the hard-coded paths below (the /content/ prefix looks like a
# hosted-notebook location — adjust for your environment).
clean_csv_text("/content/final_50_sample_dataset.csv", "/content/cleaned_final_50_sample_dataset.csv")
