# In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.inspection import permutation_importance
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Data Preparation (Rerun successful steps) ---
# Materialise scikit-learn's built-in English stop-word set as a list, then
# compile one alternation regex that matches any stop word as a whole word
# together with the whitespace that follows it.
stop_words_list = list(ENGLISH_STOP_WORDS)
stop_words_pattern = re.compile(rf'\b({"|".join(stop_words_list)})\b\s*')

def preprocess_text(text):
    """Normalise a raw tweet into a lowercase, stop-word-free string.

    Steps, in order: lowercase, drop URLs, drop @mentions and #hashtags,
    strip punctuation and digits, remove English stop words (using the
    module-level ``stop_words_pattern``), and finally collapse whitespace.

    Args:
        text: The raw tweet text, or a missing value (NaN/None).

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace; an empty string for missing input.
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    # The dot is escaped so only a literal "www." prefix is treated as a URL.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\S+|#\S+', '', text)
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    text = stop_words_pattern.sub('', text)
    # Whitespace is normalised last: removing a stop word at the end of the
    # text would otherwise leave a dangling trailing space behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

file_name = 'judge-1377884607_tweet_product_company.csv'
# 'latin-1' is the registered codec name; the previous 'latin-latin-1' is not a
# known encoding and makes read_csv fail with LookupError.
df = pd.read_csv(file_name, encoding='latin-1')
df.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'}, inplace=True)

# Integer class ids for the three sentiment labels that are modelled.
sentiment_mapping = {
    'Negative emotion': 0,
    'No emotion toward brand or product': 1,
    'Positive emotion': 2
}

# Keep only rows whose label is one of the mapped classes. This drops the
# "I can't tell" rows and also any missing/unexpected labels, which would
# otherwise map to NaN targets and break the stratified split below.
df_multiclass = df[df['sentiment'].isin(list(sentiment_mapping))].copy()
df_multiclass.dropna(subset=['tweet_text'], inplace=True)
df_multiclass['preprocessed_text'] = df_multiclass['tweet_text'].apply(preprocess_text)
# Discard tweets that are empty after preprocessing.
df_multiclass = df_multiclass[df_multiclass['preprocessed_text'].str.len() > 0].copy()

df_multiclass['target'] = df_multiclass['sentiment'].map(sentiment_mapping)

X = df_multiclass['preprocessed_text']
y = df_multiclass['target']

# 80/20 split, stratified on the target so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 2. Advanced Modeling: SVC Pipeline Training ---

# Feature extraction: TF-IDF over unigrams and bigrams, with English stop
# words removed and a minimum document frequency of 5.
tfidf_step = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5)

# Classifier: linear-kernel SVC with balanced class weights; probability
# estimates are disabled.
svc_step = SVC(
    kernel='linear',
    C=1.0,
    class_weight='balanced',
    probability=False,
    random_state=42,
)

advanced_svc_pipe = Pipeline(steps=[('tfidf', tfidf_step), ('svc', svc_step)])

# Fit the vectorizer and the classifier on the training split.
advanced_svc_pipe.fit(X_train, y_train)

# --- 3. Advanced Explainability: Permutation Importance ---

# Define a scoring function for permutation importance (using macro F1 for better imbalance sensitivity)
def f1_macro_scorer(estimator, X, y):
    """Score ``estimator`` on ``(X, y)`` with the macro-averaged F1.

    Args:
        estimator: A fitted object exposing ``predict``.
        X: Feature matrix passed to ``estimator.predict``.
        y: True labels for the rows of ``X``.

    Returns:
        The macro-averaged F1 score of the predictions against ``y``.
    """
    return f1_score(y, estimator.predict(X), average='macro')

# Permutation importance is computed on the fitted vectorizer's output rather
# than on raw text, so each TF-IDF column is one permutable feature.
# TfidfVectorizer.transform returns a scipy sparse matrix, but
# permutation_importance validates X as a dense array and rejects sparse input,
# so the test matrix is densified first. The fitted SVC still accepts the dense
# matrix at predict time.
X_test_transformed = advanced_svc_pipe.named_steps['tfidf'].transform(X_test).toarray()

print("Starting Permutation Importance Calculation...")
r = permutation_importance(
    advanced_svc_pipe.named_steps['svc'],  # the trained SVC, without the vectorizer
    X_test_transformed,  # dense TF-IDF features of the test split
    y_test,
    scoring=f1_macro_scorer,
    n_repeats=5,
    random_state=42,
    n_jobs=-1  # use all processors
)
print("Permutation Importance Calculation Complete.")

# --- 4. Plotting and Analysis ---

# Vocabulary terms, aligned with the columns of the TF-IDF matrix.
feature_names = advanced_svc_pipe.named_steps['tfidf'].get_feature_names_out()

# Indices of the 15 largest mean importances, in ascending order so the most
# important feature is drawn at the top of the horizontal bar chart.
mean_importances = r.importances_mean
sorted_idx = np.argsort(mean_importances)[-15:]
top_feature_names = feature_names[sorted_idx]
top_importances = mean_importances[sorted_idx]

# Draw the bar chart and write it to disk.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_feature_names, top_importances)
ax.set_title("Top 15 Features by Permutation Importance")
ax.set_xlabel("Permutation Importance (Mean Decrease in Macro F1-Score)")
fig.tight_layout()
fig.savefig("permutation_importance.png")
print("Saved permutation_importance.png")

# Output of the recorded run:
# FileNotFoundError: [Errno 2] No such file or directory: 'judge-1377884607_tweet_product_company.csv'