In [26]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# ============================
# 1️⃣ Load & Clean Data
# ============================
# Read the case table, drop the Pointer/Source/Details columns when they are
# present (missing ones are ignored), and keep only rows that have an Outcome.
df = (
    pd.read_csv("Charlotin-hallucination_cases(AutoRecovered).csv")
    .drop(columns=["Pointer", "Source", "Details"], errors="ignore")
    .dropna(subset=["Outcome"])
)

# Text normalisation helper
def clean_text(text):
    """Return *text* as lowercase ASCII alphanumerics split by single spaces.

    The value is passed through ``str()`` first, so non-string inputs are
    accepted. Every character that is not an ASCII letter, a digit or
    whitespace is deleted (not replaced by a space), each run of whitespace
    is collapsed to one space, and the result is trimmed and lower-cased.
    """
    without_symbols = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.lower().strip()

# Combine all text columns into one
# Text columns are ``object`` dtype on older pandas but a dedicated string
# dtype on newer releases, so accept both; comparing the dtype to "object"
# alone would select no columns there. "combined_text" is excluded so that
# re-running this step never folds a previous result into itself.
text_cols = [
    col
    for col in df.columns
    if col not in ("Outcome", "combined_text")
    and (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
]
if not text_cols:
    raise ValueError("No text columns available to build 'combined_text'")

# Object columns may hold non-string cells (e.g. mixed numbers and text);
# cast to str after blanking NaN so " ".join never receives a non-string.
df["combined_text"] = (
    df[text_cols].fillna("").astype(str).agg(" ".join, axis=1)
)
df["combined_text"] = df["combined_text"].astype(str).apply(clean_text)

# Drop every Outcome label that occurs fewer than two times
# (presumably so each remaining class can be stratified in the split below).
class_counts = df["Outcome"].value_counts()
rare_classes = class_counts.loc[class_counts.lt(2)].index
is_rare = df["Outcome"].isin(rare_classes)
df = df.loc[~is_rare]

# Map Outcome strings to integer class ids; `le` is kept so predictions can
# be translated back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(df["Outcome"])

# Hold out 20% of the rows, preserving class proportions, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"],
    y,
    **split_options,
)

# ============================
# 2️⃣ Linear SVC Model
# ============================
# Pipeline: TF-IDF over 1- and 2-grams (at most 5000 features) feeding a
# linear SVM with balanced class weights.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
classifier = LinearSVC(class_weight="balanced", C=1.0, random_state=42)
model = make_pipeline(vectorizer, classifier)

# Fit on the training split
print("üöÄ Training Linear SVC model...")
model.fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n‚úÖ Model trained successfully!")
print(f"üéØ Linear SVC Accuracy: {accuracy * 100:.2f}%\n")

# ============================
# 3️⃣ User Input Prediction
# ============================
# Interactive prediction loop: classify each description the user types and
# stop when they enter "exit" (case-insensitive) or the input stream ends.
while True:
    try:
        user_input = input("üí¨ Enter a new case description (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin (piped / non-interactive run) or an interrupt: leave
        # the loop like an explicit "exit" instead of dying with a traceback.
        print()
        print("üëã Exiting. Have a great day!")
        break

    if user_input.lower() == "exit":
        print("üëã Exiting. Have a great day!")
        break

    # Clean user input the same way the training text was cleaned
    cleaned = clean_text(user_input)
    if not cleaned:
        # Blank line or symbols only: nothing is left to vectorise, so a
        # prediction would not reflect any input. Ask again instead.
        print("Please enter a description containing letters or digits.\n")
        continue

    # Predict the class id, then map it back to the original Outcome label
    prediction = model.predict([cleaned])[0]
    predicted_label = le.inverse_transform([prediction])[0]

    print(f"üß† Predicted Outcome: {predicted_label}\n")


üöÄ Training Linear SVC model...

‚úÖ Model trained successfully!
üéØ Linear SVC Accuracy: 70.97%



üí¨ Enter a new case description (or type 'exit' to quit):  hallucination fake evidence AI tool ChatGPT USA


üß† Predicted Outcome: Case dismissed



üí¨ Enter a new case description (or type 'exit' to quit):  exit


üëã Exiting. Have a great day!
