# In [None]:
# Authorship Detection
# Ameen Mohamed
# Project: Feature Extraction & Classification (ML)

import random
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this script relies on: the stop-word list, the
# Punkt tokenizer data (both packagings) and WordNet for lemmatization.
for _resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# ml
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Setup
# Seeds Python's built-in `random` module only. The train/test split further
# down is made reproducible separately through its own random_state argument.
random.seed(42)

# =====================================
# 1) DATASET PREPARATION
# =====================================
# Small hand-built corpus: maps each author name (used as the class label)
# to a list of six sample sentences attributed to that author.
texts_data = {
    "Shakespeare": [
        "To be, or not to be: that is the question.",
        "All the world's a stage, and all the men and women merely players.",
        "Some are born great, some achieve greatness, and some have greatness thrust upon them.",
        "The course of true love never did run smooth.",
        "Love all, trust a few, do wrong to none.",
        "Cowards die many times before their deaths; the valiant never taste of death but once."
    ],
    "Jane Austen": [
        "It is a truth universally acknowledged, that a single man in possession of a good fortune must be in want of a wife.",
        "A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony.",
        "There is no charm equal to tenderness of heart.",
        "I declare after all there is no enjoyment like reading!",
        "Vanity and pride are different things, though the words are often used synonymously.",
        "One half of the world cannot understand the pleasures of the other."
    ],
    "Mark Twain": [
        "The secret of getting ahead is getting started.",
        "Kindness is the language which the deaf can hear and the blind can see.",
        "Truth is stranger than fiction, but it is because Fiction is obliged to stick to possibilities.",
        "The lack of money is the root of all evil.",
        "The best way to cheer yourself up is to try to cheer somebody else up.",
        "Courage is resistance to fear, mastery of fear, not absence of fear."
    ],
    "Charles Dickens": [
        "It was the best of times, it was the worst of times.",
        "Have a heart that never hardens, and a temper that never tires.",
        "A loving heart is the truest wisdom.",
        "No one is useless in this world who lightens the burden of another.",
        "There is a wisdom of the head, and a wisdom of the heart.",
        "We forge the chains we wear in life."
    ],
    "Edgar Allan Poe": [
        "I became insane, with long intervals of horrible sanity.",
        "All that we see or seem is but a dream within a dream.",
        "Words have no power to impress the mind without the exquisite horror of their reality.",
        "Those who dream by day are cognizant of many things which escape those who dream only by night.",
        "The boundaries which divide Life from Death are at best shadowy and vague.",
        "There is no beauty without some strangeness."
    ]
}

# Flatten the author -> sentences mapping into one record per sentence,
# then load the records into a two-column DataFrame (text, label).
data_list = [
    {"text": sent, "label": author}
    for author, sentences in texts_data.items()
    for sent in sentences
]

df = pd.DataFrame(data_list)
print(f"Total Samples: {len(df)}")
print(df.head())
print("-" * 50)

# =====================================
# 2) PREPROCESSING
# =====================================
# Shared preprocessing resources: a WordNet lemmatizer and the English
# stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))


def clean_text(text):
    """Return a normalized, space-joined version of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``STOPWORDS``, are dropped;
    the remaining tokens are lemmatized and joined with single spaces.
    """
    kept = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in STOPWORDS
    ]
    return " ".join(kept)


# Add the preprocessed sentence alongside the raw one.
df['cleaned_text'] = df['text'].apply(clean_text)

# =====================================
# 3) EXPERIMENT: COMPARE 3 TECHNIQUES
# =====================================

# Split Data (70% Train, 30% Test): test_size=0.3 holds out 30% of the rows.
# stratify keeps the author proportions as even as possible across both
# parts, and random_state=42 makes the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.3, random_state=42, stratify=df['label']
)

# Define the three required feature-extraction techniques.
# Maps a display name to the (unfitted) vectorizer that implements it.
feature_extractors = {
    "1. Binary Encoding": CountVectorizer(binary=True),
    "2. Count Encoding":  CountVectorizer(binary=False),
    "3. TF-IDF":          TfidfVectorizer()
}

# One {"Technique", "Accuracy"} record is collected per vectorizer.
results = []

print("\n=== STARTING EVALUATION ===\n")

# Every technique is evaluated on the same split with the same classifier,
# so only the feature representation differs between runs.
for name, vectorizer in feature_extractors.items():
    print(f"Applying: {name} ...")

    # 1. Transform Text to Numbers (Feature Extraction)
    # The vectorizer is fitted on the training text only; the test text is
    # transformed with that already-fitted vectorizer.
    X_train = vectorizer.fit_transform(X_train_text)
    X_test  = vectorizer.transform(X_test_text)

    # 2. Train Classifier (Naive Bayes)
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # 3. Predict & Evaluate
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Store the results
    results.append({"Technique": name, "Accuracy": acc})

    print(f"Accuracy: {acc*100:.2f}%")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 30)

# =====================================
# 4) FINAL REPORT
# =====================================
print("\n=== FINAL COMPARISON REPORT ===")
# Rank the techniques by accuracy, best first. A stable sort (mergesort) keeps
# techniques with equal accuracy in the order they were evaluated, so the
# reported winner is deterministic instead of depending on the sort algorithm.
final_df = pd.DataFrame(results).sort_values(
    by="Accuracy", ascending=False, kind="mergesort"
)
print(final_df)

best_method = final_df.iloc[0]['Technique']
print(f"\n✅ The best performing technique for this dataset is: {best_method}")

# With a small test set, identical accuracies are likely. Report any ties so
# the line above is not read as a unique winner.
top_accuracy = final_df.iloc[0]['Accuracy']
tied_methods = final_df.loc[final_df['Accuracy'] == top_accuracy, 'Technique'].tolist()
if len(tied_methods) > 1:
    print(f"   (tied on accuracy with: {', '.join(tied_methods[1:])})")