In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
df = pd.read_csv("Charlotin-hallucination_cases(AutoRecovered).csv")

df = df.drop(columns=["Pointer", "Source", "Details"], errors="ignore")
df = df.dropna(subset=["Outcome"])
text_cols = [col for col in df.columns if df[col].dtype == "object" and col != "Outcome"]
df["combined_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
df["combined_text"] = df["combined_text"].astype(str).apply(clean_text)

# Remove rare classes
class_counts = df["Outcome"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["Outcome"].isin(rare_classes)]

# Encode target
le = LabelEncoder()
y = le.fit_transform(df["Outcome"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], y, test_size=0.2, random_state=42, stratify=y
)

# ============================
# 2️⃣ TF-IDF Vectorization
# ============================
vectorizer = TfidfVectorizer(max_features=7000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ============================
# 3️⃣ Logistic Regression
# ============================
log_model = LogisticRegression(
    C=3,                # regularization strength
    solver='liblinear', # good for small/medium datasets
    max_iter=1000,
    random_state=42
)

print("🚀 Training Logistic Regression model...")
log_model.fit(X_train_tfidf, y_train)

# ============================
# 4️⃣ Evaluate
# ============================
y_pred = log_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\n🎯 Accuracy: {accuracy:.4f}")
print(f"🔥 F1 Score: {f1:.4f}")


[nltk_data] Downloading package stopwords to C:\Users\ANAMITRA
[nltk_data]     BAKSHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ANAMITRA
[nltk_data]     BAKSHI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🚀 Training Logistic Regression model...

🎯 Accuracy: 0.6774
🔥 F1 Score: 0.5471

📋 Classification Report:




ValueError: Number of classes, 10, does not match size of target_names, 15. Try specifying the labels parameter