In [None]:
!pip install -q gradio
import pandas as pd
import numpy as np
import re
from collections import Counter
import os
import joblib
import gradio as gr

# NLTK for text processing (ensure resources are downloaded as in the main script)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords # Not directly used in classify_text_pair but part of original context

In [None]:
# Closed-class ("function") words. The share of a text's tokens that appear in
# this list is reported as `func_word_prop` by get_function_word_proportions.
# The inline group labels below are approximate; several words belong to more
# than one grammatical class.
ENGLISH_FUNCTION_WORDS = [ # Keep this consistent with the training script
    # Articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'or', 'nor', 'for', 'so', 'yet', 'if', 'whether',
    # Prepositions (some double as subordinating conjunctions)
    'in', 'on', 'at', 'by', 'from', 'to', 'with', 'without', 'about', 'above', 'across', 'after',
    'against', 'along', 'among', 'around', 'as', 'because', 'before', 'behind', 'below',
    'beneath', 'beside', 'between', 'beyond', 'concerning', 'despite', 'down', 'during',
    'except', 'inside', 'into', 'like', 'near', 'of', 'off', 'onto', 'out', 'outside',
    'over', 'past', 'regarding', 'since', 'through', 'throughout', 'toward', 'under',
    'underneath', 'until', 'unto', 'up', 'upon', 'within',
    # Personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'we', 'us', 'our', 'ours', 'ourselves', 'they', 'them', 'their', 'theirs', 'themselves',
    # Auxiliary and modal verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
    # Interrogative / relative words
    'what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how',
    # Quantifiers, determiners and degree adverbs
    'all', 'any', 'both', 'each', 'either', 'enough', 'every', 'few', 'less', 'little', 'many',
    'more', 'most', 'much', 'neither', 'no', 'none', 'one', 'other', 'several', 'some', 'such', 'that',
    'these', 'this', 'those', 'very', 'just', 'not', 'only', 'quite', 'rather', 'too', 'even'
]
# Define if used by any get_... function (none of the functions visible in this
# notebook read it). The corpus lookup raises LookupError when the 'stopwords'
# data has not been downloaded yet, so fetch it on demand instead of failing
# the whole cell -- the same fallback get_sentence_length_stats uses for punkt.
try:
    ENGLISH_STOP_WORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    ENGLISH_STOP_WORDS = set(stopwords.words('english'))


def preprocess_text_for_stylometry(text):
    """
    Normalise a text before tokenising it for stylometric features.

    The text is lower-cased, URL-like tokens (starting with ``http``/``https``
    or ``www``) are dropped, every character other than ``a-z``, digits,
    whitespace and the punctuation ``. , ! ? ; : ' " -`` is removed, and runs
    of whitespace are collapsed to a single space.

    Returns an empty string when `text` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    allowed_chars_only = re.sub(r'[^a-z0-9\s.,!?;:\'"-]', '', without_urls)
    return re.sub(r'\s+', ' ', allowed_chars_only).strip()

def preprocess_text_for_ngrams(text):
    """
    Normalise a text for character n-gram extraction.

    Lower-cases the text, collapses every run of whitespace to one space and
    trims the ends. Punctuation and other characters are left untouched.
    Returns an empty string when `text` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r'\s+', ' ', text.lower()).strip()

def get_vocabulary_richness(tokens):
    """
    Summarise lexical variety of a token sequence.

    Returns a dict with:
        'ttr'               -- type/token ratio (distinct tokens / all tokens)
        'unique_word_count' -- number of distinct tokens
        'word_count'        -- total number of tokens
    All three are zero when `tokens` is empty or otherwise falsy.
    """
    stats = {'ttr': 0.0, 'unique_word_count': 0, 'word_count': 0}
    if tokens:
        total = len(tokens)
        distinct = len(set(tokens))
        if total > 0:
            stats['ttr'] = distinct / total
        stats['unique_word_count'] = distinct
        stats['word_count'] = total
    return stats

def get_sentence_length_stats(text_original):
    """
    Compute sentence-length statistics for a raw (un-preprocessed) text.

    The text is split with NLTK's ``sent_tokenize``; the length of a sentence
    is the number of tokens ``word_tokenize`` yields for it.

    Args:
        text_original: The text exactly as supplied by the caller.

    Returns:
        dict with
            'avg_sent_len'   -- mean sentence length in tokens
            'std_sent_len'   -- standard deviation of sentence lengths
                                (0.0 when there is a single sentence)
            'sentence_count' -- number of sentences found
        All values are zero when the input is not a string, is blank, or
        yields no sentences.
    """
    if not isinstance(text_original, str) or not text_original.strip():
        return {'avg_sent_len': 0.0, 'std_sent_len': 0.0, 'sentence_count': 0}

    try:
        sentences = sent_tokenize(text_original)
    except LookupError:
        # The tokenizer data is missing. Older NLTK releases load it from the
        # 'punkt' package, newer ones from 'punkt_tab'; request both so the
        # retry works regardless of the installed version. A package that is
        # unknown to the downloader is reported, not raised.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)
        sentences = sent_tokenize(text_original)

    if not sentences:
        return {'avg_sent_len': 0.0, 'std_sent_len': 0.0, 'sentence_count': 0}

    sent_lengths = [len(word_tokenize(s)) for s in sentences]
    return {
        'avg_sent_len': np.mean(sent_lengths),
        'std_sent_len': np.std(sent_lengths) if len(sent_lengths) > 1 else 0.0,
        'sentence_count': len(sentences),
    }

def get_punctuation_stats(text_original):
    """
    Measure how often common punctuation marks occur in a raw text.

    Each value is the number of occurrences of the mark divided by the total
    number of characters in `text_original`. 'quote_freq' counts both double
    and single quote characters. Every frequency is 0.0 when the input is not
    a string or is empty.
    """
    single_marks = (
        ('comma_freq', ','),
        ('period_freq', '.'),
        ('question_freq', '?'),
        ('exclam_freq', '!'),
        ('semicolon_freq', ';'),
        ('colon_freq', ':'),
        ('hyphen_freq', '-'),
    )

    if not isinstance(text_original, str) or not text_original:
        stats = {key: 0.0 for key, _ in single_marks}
        stats['quote_freq'] = 0.0
        return stats

    length = len(text_original)
    stats = {key: text_original.count(mark) / length for key, mark in single_marks}
    quote_total = text_original.count('"') + text_original.count("'")
    stats['quote_freq'] = quote_total / length
    return stats

def get_function_word_proportions(tokens, function_words_set):
    """
    Return the fraction of `tokens` that are members of `function_words_set`.

    The result is a one-entry dict, ``{'func_word_prop': <fraction>}``; the
    fraction is 0.0 for an empty token sequence.
    """
    if not tokens:
        return {'func_word_prop': 0.0}

    total = len(tokens)
    matches = len([tok for tok in tokens if tok in function_words_set])
    proportion = matches / total if total > 0 else 0.0
    return {'func_word_prop': proportion}

def extract_stylometric_features_single_text(text_original, text_for_stylometry, tokens_for_stylometry):
    """
    Build the full stylometric feature dict for one text.

    Merges, in this order, the outputs of get_vocabulary_richness and
    get_function_word_proportions (computed on `tokens_for_stylometry`) with
    get_sentence_length_stats and get_punctuation_stats (computed on the raw
    `text_original`). `text_for_stylometry` is accepted but not read here.
    """
    function_words = set(ENGLISH_FUNCTION_WORDS)
    return {
        **get_vocabulary_richness(tokens_for_stylometry),
        **get_sentence_length_stats(text_original),
        **get_punctuation_stats(text_original),
        **get_function_word_proportions(tokens_for_stylometry, function_words),
    }


def load_all_artifacts(artifacts_dir="/kaggle/input/help-me-2/saved_authorship_artifacts"):
    """
    Loads all saved artifacts: vectorizer, feature names, scaler, and models.

    Args:
        artifacts_dir: Directory holding 'char_ngram_vectorizer.joblib',
            'final_feature_names.joblib', 'scaler.joblib' and any number of
            '*_model.joblib' files.

    Returns:
        dict with the keys
            'char_ngram_vectorizer'          -- the loaded vectorizer
            'final_feature_names_with_diff'  -- the loaded feature-name sequence
            'scaler'                         -- the loaded scaler
            'models'                         -- {display name: model}; the name
                is the file stem with underscores turned into spaces and
                title-cased, e.g. "Logistic Regression"
            'base_stylometric_feature_names' -- non-n-gram feature names with
                the leading "diff_" removed, in the order they appear in
                'final_feature_names_with_diff'

    Raises:
        FileNotFoundError: if `artifacts_dir` does not exist (joblib raises
            its own error if one of the three required files is missing).
    """
    if not os.path.exists(artifacts_dir):
        raise FileNotFoundError(f"Artifacts directory '{artifacts_dir}' not found. Please run the training script first.")

    # NOTE: joblib.load unpickles arbitrary Python objects. Only point this
    # function at artifact directories you trust.
    loaded_artifacts = {}

    # Load vectorizer
    vectorizer_path = os.path.join(artifacts_dir, 'char_ngram_vectorizer.joblib')
    loaded_artifacts['char_ngram_vectorizer'] = joblib.load(vectorizer_path)
    print(f"Vectorizer loaded from {vectorizer_path}")

    # Load feature names
    feature_names_path = os.path.join(artifacts_dir, 'final_feature_names.joblib')
    loaded_artifacts['final_feature_names_with_diff'] = joblib.load(feature_names_path)
    print(f"Feature names loaded from {feature_names_path}")

    # Load scaler
    scaler_path = os.path.join(artifacts_dir, 'scaler.joblib')
    loaded_artifacts['scaler'] = joblib.load(scaler_path)
    print(f"Scaler loaded from {scaler_path}")

    # Load models. os.listdir returns entries in arbitrary order, so sort to
    # keep the model order (and therefore the demo's default choice) stable.
    model_suffix = '_model.joblib'
    loaded_artifacts['models'] = {}
    model_files = sorted(f for f in os.listdir(artifacts_dir) if f.endswith(model_suffix))
    if not model_files:
        print("Warning: No model files found in the artifacts directory.")
    for model_file in model_files:
        # Strip only the trailing suffix; str.replace would also remove the
        # same text if it happened to occur inside the stem.
        model_stem = model_file[:-len(model_suffix)]
        model_name_key = model_stem.replace('_', ' ').title() # e.g., "Logistic Regression"
        model_path = os.path.join(artifacts_dir, model_file)
        try:
            loaded_artifacts['models'][model_name_key] = joblib.load(model_path)
            print(f"Model '{model_name_key}' loaded from {model_path}")
        except Exception as e:
            # Deliberately best-effort: one unreadable model must not prevent
            # the remaining ones from loading.
            print(f"Error loading model {model_file}: {e}")

    # Derive base stylometric feature names (needed for processing new texts)
    # These are the names of features *before* "diff_" prefix and *excluding* char_ngrams
    final_names = loaded_artifacts['final_feature_names_with_diff']
    base_stylometric_names = [
        name.replace("diff_", "", 1)
        for name in final_names
        if name.startswith("diff_") and "char_ngram" not in name
    ]
    # De-duplicate while KEEPING first-seen order. classify_text_pair builds its
    # stylometric vector in this order and then labels the columns with
    # final_feature_names_with_diff, so re-sorting the names alphabetically
    # would put values under the wrong columns whenever the training order
    # was not itself alphabetical.
    loaded_artifacts['base_stylometric_feature_names'] = list(dict.fromkeys(base_stylometric_names))

    return loaded_artifacts

## 2. Function to Classify a Text Pair
def classify_text_pair(text1, text2, model, char_ngram_vectorizer, scaler,
                       base_stylometric_feature_names, final_feature_names_with_diff):
    """
    Classifies a pair of texts using a loaded model and artifacts.

    Both texts are turned into stylometric features and character n-gram
    vectors; the element-wise absolute difference of the two texts' features
    is scaled with `scaler` and passed to `model`.

    Args:
        text1, text2: The raw texts to compare.
        model: Fitted classifier exposing ``predict`` and, optionally,
            ``predict_proba``.
        char_ngram_vectorizer: Fitted vectorizer exposing ``transform``.
        scaler: Fitted scaler exposing ``transform``.
        base_stylometric_feature_names: Stylometric feature names (no "diff_"
            prefix) in the column order used for training.
        final_feature_names_with_diff: Names of all model input columns;
            assumed to list the stylometric diffs first, then the n-gram diffs.

    Returns:
        (prediction, probability, probs) where
            prediction  -- first element of ``model.predict``
            probability -- ``probs[1]`` if prediction == 1 else ``probs[0]``;
                           0.0 when the model has no ``predict_proba``
            probs       -- the ``predict_proba`` row for the pair, or None
                           when the model has no ``predict_proba``

    Raises:
        ValueError: if the number of computed features differs from
            ``len(final_feature_names_with_diff)``.
    """
    # Preprocess texts (the raw text is kept for sentence/punctuation stats)
    text1_stylometry = preprocess_text_for_stylometry(text1)
    text2_stylometry = preprocess_text_for_stylometry(text2)
    text1_stylometry_tokens = word_tokenize(text1_stylometry)
    text2_stylometry_tokens = word_tokenize(text2_stylometry)
    text1_for_ngrams = preprocess_text_for_ngrams(text1)
    text2_for_ngrams = preprocess_text_for_ngrams(text2)

    # Extract stylometric features
    style_features1_dict = extract_stylometric_features_single_text(text1, text1_stylometry, text1_stylometry_tokens)
    style_features2_dict = extract_stylometric_features_single_text(text2, text2_stylometry, text2_stylometry_tokens)

    # Create ordered stylometric feature vectors based on base_stylometric_feature_names;
    # a name missing from the extracted dict contributes 0.0.
    vec_style1 = np.array([style_features1_dict.get(name, 0.0) for name in base_stylometric_feature_names])
    vec_style2 = np.array([style_features2_dict.get(name, 0.0) for name in base_stylometric_feature_names])
    diff_stylometric_features = np.abs(vec_style1 - vec_style2)

    # Extract character n-gram features
    char_ngrams1_matrix = char_ngram_vectorizer.transform([text1_for_ngrams])
    char_ngrams2_matrix = char_ngram_vectorizer.transform([text2_for_ngrams])
    diff_char_ngrams = np.abs(char_ngrams1_matrix.toarray() - char_ngrams2_matrix.toarray()).flatten()

    # Combine features
    # The order of concatenation must match the order in final_feature_names_with_diff
    # We assume stylometric diffs come first, then char_ngram diffs, as in the training script.
    combined_diff_features_vector = np.concatenate([diff_stylometric_features, diff_char_ngrams])

    # Fail with an explicit message rather than pandas' generic shape error
    # when the artifacts and the feature pipeline are out of sync.
    expected_feature_count = len(final_feature_names_with_diff)
    if combined_diff_features_vector.shape[0] != expected_feature_count:
        raise ValueError(
            f"Feature count mismatch: computed {combined_diff_features_vector.shape[0]} "
            f"features but the saved feature names list {expected_feature_count}."
        )

    # Reshape for scaler and model (1 sample, N features)
    combined_diff_features_df = pd.DataFrame([combined_diff_features_vector], columns=final_feature_names_with_diff)

    # Scale features
    scaled_features = scaler.transform(combined_diff_features_df)

    # Predict
    prediction = model.predict(scaled_features)[0]
    probability = 0.0
    # Must be bound even for models without predict_proba; otherwise the
    # return statement below raises UnboundLocalError.
    probs = None
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(scaled_features)[0]
        probability = probs[1] if prediction == 1 else probs[0] # Probability of the predicted class

    return prediction, probability, probs

In [None]:
def create_gradio_demo(loaded_artifacts_dict):
    """
    Creates and launches a Gradio demo for authorship verification.

    Args:
        loaded_artifacts_dict: Mapping with the keys 'models',
            'char_ngram_vectorizer', 'scaler', 'final_feature_names_with_diff'
            and 'base_stylometric_feature_names' (the structure returned by
            load_all_artifacts).

    Returns:
        None. When no models are available a message is printed and the demo
        is not started.
    """
    model_choices = list(loaded_artifacts_dict['models'].keys())
    if not model_choices:
        print("No models loaded. Cannot start Gradio demo.")
        return

    char_ngram_vectorizer = loaded_artifacts_dict['char_ngram_vectorizer']
    scaler = loaded_artifacts_dict['scaler']
    final_feature_names = loaded_artifacts_dict['final_feature_names_with_diff']
    base_stylometric_names = loaded_artifacts_dict['base_stylometric_feature_names']

    # The displayed scores are shifted by this margin so that "Same Author"
    # only outranks "Different Author" when P(same author) exceeds 0.6.
    same_author_margin = 0.1

    def _clamp_unit(value):
        """Clamp a score into [0, 1] so the label widget never gets an out-of-range value."""
        return min(max(value, 0.0), 1.0)

    def predict_authorship(text1, text2, model_name_choice):
        """Gradio callback: returns (label output, confidence string)."""
        # `not text` also covers None, which has no .strip().
        if not text1 or not text2 or not text1.strip() or not text2.strip():
            return "Error: Both text fields must be filled.", 0.0

        selected_model_instance = loaded_artifacts_dict['models'][model_name_choice]

        # Every model goes through the same pipeline; scaling is applied
        # inside classify_text_pair.
        pred_label, pred_prob, probs = classify_text_pair(
            text1, text2, selected_model_instance,
            char_ngram_vectorizer, scaler,
            base_stylometric_names, final_feature_names
        )

        if probs is None:
            # No class probabilities available: show the hard decision only.
            hard_label = 'Same Author' if pred_label == 1 else 'Different Author'
            return hard_label, "N/A"

        # Use P(same author) -- index 1, assuming class 1 means "same author"
        # as the rest of this notebook does -- rather than the probability of
        # whichever class was predicted. Using the latter presents a confident
        # "different" prediction as "Same Author".
        p_same = float(probs[1])

        # For Gradio label output, return a dictionary for colored labels
        label_output = {'Same Author': _clamp_unit(p_same - same_author_margin),
                       'Different Author': _clamp_unit((1.0 + same_author_margin) - p_same)}

        # The text box still reports the model's own confidence in the class
        # it predicted, which can differ from the label ranked first above
        # when P(same author) lies between 0.5 and 0.6.
        return label_output, f"{pred_prob:.4f}"


    iface = gr.Interface(
        fn=predict_authorship,
        inputs=[
            gr.Textbox(lines=10, placeholder="Enter first text here...", label="Text 1"),
            gr.Textbox(lines=10, placeholder="Enter second text here...", label="Text 2"),
            gr.Dropdown(choices=model_choices, label="Choose Model", value=model_choices[0] if model_choices else None)
        ],
        outputs=[
            gr.Label(label="Authorship Prediction"), # Using Label for richer output
            gr.Textbox(label="Confidence in Prediction")
        ],
        title="Authorship Verification Demo ✍️",
        description="Enter two texts and select a model to predict if they were written by the same author. "+ \
                    "The confidence score reflects the model's certainty in the predicted class.",
        # NOTE(review): newer Gradio releases appear to rename this argument to
        # `flagging_mode` -- confirm against the installed version.
        allow_flagging="never"
    )

    print("Launching Gradio demo...")
    iface.launch()

In [None]:
# Load the vectorizer, feature names, scaler and models from the default
# artifacts directory.
artifacts = load_all_artifacts()

In [None]:
# Show the display names of the models that were loaded.
list(artifacts['models'].keys())

In [None]:
# Invert the vectorizer's vocabulary_ so a value can be looked up to get its
# key (presumably feature index -> n-gram string; verify against the
# vectorizer type saved by the training script).
mappings = {v:k for k, v in artifacts['char_ngram_vectorizer'].vocabulary_.items()}

In [None]:
# Inspect the entry stored under 869.
mappings[869]

In [None]:
# Build the interface and start the demo.
create_gradio_demo(artifacts)

In [None]:
def highlight_characters_in_texts(text1: str, text2: str, char_group: str):
    """
    Print two texts with every occurrence of a character group on a red background.

    The group is matched literally (regex metacharacters are escaped) and
    without regard to case; the matched text keeps its original casing. ANSI
    escape sequences are used for the colouring. When `char_group` is empty,
    both texts are printed unchanged under headers saying so.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.
        char_group (str): The group of characters to highlight.

    Returns:
        None. The result is written to standard output.
    """
    import re

    if not char_group:
        print("--- Text 1 (No highlighting as character group is empty) ---")
        print(text1)
        print("\n--- Text 2 (No highlighting as character group is empty) ---")
        print(text2)
        return

    # ANSI sequences: switch the red background on, then restore defaults.
    red_on = "\033[41m"
    colour_off = "\033[0m"

    needle = re.compile(re.escape(char_group), flags=re.IGNORECASE)

    def paint(match):
        # Wrap the text exactly as it was matched, preserving its casing.
        return f"{red_on}{match.group(0)}{colour_off}"

    # Highlight both texts before printing anything.
    painted_first = needle.sub(paint, text1)
    painted_second = needle.sub(paint, text2)

    print("Text 1:")
    print(painted_first)
    print("\nText 2:")
    print(painted_second)

In [None]:
t1 = "yeah so let's see here I mean I got into tech because I was obsessed with computers when I was in high school when I thought I would credit them I really wasn't but I kept being interested in it in it so I needed up taking computer science for my undergraduate it was until like my first internship where I realized that this is really boring I started reading or like leads to stuff that I was doing was really boring I started reading a lot more software engineering literature and actually got a lot deeper into I guess the trade then I had initially thought I was going to be and I did I think that's kind of like why I'm here okay as I I still think it's a very interesting way to build out things that are useful for people yeah how I ended up in data is I've always been kind of a database nerd I guess or how how I store data efficiently I think there's been a lot of interesting problems surrounding that and I don't know I just kept digging into those kind of rules my first role out of my undergraduate degree was working on AWS in storage mm-hmm when I was working at a start-up I liked the projects where I got to build out our own data warehouse and then when I applied to next door I found most of my interests being around how to efficiently transform data it's it being something that could be searched and transformed very efficiently so I think that's like largely the journey that I've been through so far"
t2 = "yeah I mean before we dive into the question I guess I have a few questions for you what kind of database yeah okay so with if we don't want to incur any downtime I guess my question to you is what impact with downtime have I would I would actually do this in two phases okay and it also depends on like it can we have like a default value attached to so Postgres allows you to have default values without actually writing the actual call value into the record okay so we can use that as leverage then we don't even have to really backfill we just have to apply our table when our table definition change and then we're good to go so I would do this in phases I think it makes sense to lock out chunks of Records and update those values but let's initially just have them per second so let's say if there's a billion rows then let's say that there gets like a million rows added per day and then let's say like if we extrapolate from that like probably like like a couple like a thousand records per second that's probably not right but let's just say a thousand records per second okay yeah so one way to kind of do this is to create a table that is identical to the other table but with an extra record and have all new records write to that new table and then we'll slowly start copying data from that old table to the new table with whatever default value that we want to give at that table and when we're reading from that table we have to read from both tables for the short for the short term okay eventually after all of the old records are copied over to the new table we can drop the old table and move on with our lives and then we have we effectively have our new column with no downtime um I think the questions that were asked were intentionally left vague no I definitely have to probe and get a better understanding of like where our constraints lives yep so that I could give a good ish answer I mean lemme answer so kind of hand wavy if I were being quite honest but I think 
like I think high level I hit them okay I thought that one's hard to answer actually um I tend to play it by ear so I think if you ask too many questions then you're not doing any thinking or any Discovery on your own yeah which doesn't but well in debrief so it's good to have some ideas talk aloud on those kind of ideas and even just ask for feedback on what the interviewers thinks of like your idea yep um and I think that's good I think I think if you listen you also need to listen to the interviewer and pick up on any sort of hints that they may give you so when you said do things letter by letter immediately clicked in my head that was a hint yeah and if I pick up on the hand then the interviewer knows that I picked up on the hand and can probably think or I think the interview will think like okay this person probably knows yeah it what's going on here and what I was thinking in terms of structure I think it makes a lot of sense"
# Highlight, in both transcripts, the entry that `mappings` stores under 2692.
# highlight_characters_in_texts prints its result and has no return value, so
# `a` is bound to None.
a = highlight_characters_in_texts(t1, t2, mappings[2692])