In [12]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# ============================
# 1️⃣ Load & Clean Data
# ============================
# Read the case table, discard the columns that are not used as features
# (errors="ignore" skips any that are absent from the file), and keep only
# the rows that actually carry a label in "Outcome".
unused_columns = ["Pointer", "Source", "Details"]
df = (
    pd.read_csv("Charlotin-hallucination_cases(AutoRecovered).csv")
    .drop(columns=unused_columns, errors="ignore")
    .dropna(subset=["Outcome"])
)

# Clean text
_NON_ALNUM_OR_SPACE = re.compile(r'[^a-zA-Z0-9\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalize a value into lowercase ASCII alphanumeric tokens.

    The value is converted with ``str()``; every character that is not an
    ASCII letter, a digit, or whitespace is deleted (not replaced by a
    space, so ``"a-b"`` becomes ``"ab"``); each run of whitespace is
    collapsed to one space; the result is trimmed and lowercased.
    """
    only_alnum = _NON_ALNUM_OR_SPACE.sub('', str(text))
    single_spaced = _WHITESPACE_RUN.sub(' ', only_alnum)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# Combine text columns
# Every string-like column other than the label is used as model input.
# Both the legacy ``object`` dtype and pandas' dedicated string dtype are
# accepted, so column discovery does not depend on which of the two pandas
# chose when reading the CSV.
text_cols = [
    col
    for col in df.columns
    if col != "Outcome"
    and (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
]
if not text_cols:
    # Without this guard the row-wise join below runs on a zero-column frame
    # and there is no text to vectorize.
    raise ValueError("No text columns available to build 'combined_text'.")

# Cast every cell to str *before* joining: an object column may hold
# non-string values (numbers, booleans), and " ".join raises TypeError on
# anything that is not a str.
df["combined_text"] = (
    df[text_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .apply(clean_text)
)

# Remove rare classes (less than 2 examples)
# The stratified split below cannot place a single-member class in both the
# train and the test partition.
class_counts = df["Outcome"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["Outcome"].isin(rare_classes)]

# Encode labels
# `le` is kept so integer predictions can be mapped back to outcome names
# with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(df["Outcome"])

# Split dataset
# 80/20 split with a fixed seed, stratified on the encoded outcome.
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], y, test_size=0.2, random_state=42, stratify=y
)

# ============================
# 2️⃣ Linear SVC Model
# ============================
# Step 1: TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# Step 2: linear support-vector classifier with a fixed seed.
svc_step = LinearSVC(random_state=42, C=1.0)
model = make_pipeline(tfidf_step, svc_step)

# Fit the whole pipeline on the training texts
print("🚀 Training Linear SVC...")
model.fit(X_train, y_train)

# Score the held-out texts and report plain accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n🎯 Linear SVC Accuracy: {accuracy * 100:.2f}%")



🚀 Training Linear SVC...

🎯 Linear SVC Accuracy: 70.97%
