In [5]:
import random
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import cross_val_score
from transformers import BertTokenizer, BertModel, XLNetTokenizer, XLNetModel
import torch
import matplotlib.pyplot as plt

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('gutenberg')
nltk.download('stopwords')

random.seed(42)
np.random.seed(42)

class GutenbergPartitioner:
    def __init__(self):
        # Select books from a specific genre
        self.all_books = gutenberg.fileids()
        self.selected_books = random.sample(self.all_books, 6)  # Choose 6 books randomly
        self.book_data = [(book, gutenberg.raw(book)) for book in self.selected_books]

    def get_author(self, book_text):
        # Extract author information from book text
        # This is a simplistic approach and may not work for all books
        lines = book_text.split('\n')
        for line in lines:
            if 'by' in line.lower():
                author_index = line.lower().index('by')
                author = line[author_index + 2:].strip()
                # Remove any year like 1909] from author's name
                author = ''.join([i for i in author if not i.isdigit() and i != ']'])
                return author
        return 'Unknown Author'

    def split_into_partitions(self, num_partitions=200, partition_size=100):
        partitions = []
        for book, data in self.book_data:
            author = self.get_author(data)
            words = word_tokenize(data)
            for _ in range(num_partitions):
                start_index = random.randint(0, len(words) - partition_size)
                partition = ' '.join(words[start_index:start_index + partition_size])
                partitions.append((book, author, partition))

        # Shuffle the order of partitions
        random.shuffle(partitions)

        # Label the partitions with alphabetical labels
        labels = {book: chr(ord('a') + i) for i, book in enumerate(self.selected_books)}

        # Add the labels to the partitions
        labeled_partitions = [(labels[book], author, partition) for book, author, partition in partitions]
        return labeled_partitions

    def save_to_csv(self, labeled_partitions, output_filename='random_partitions.csv'):
        # Serialize the data using Pandas
        df = pd.DataFrame(labeled_partitions, columns=['Book', 'Author', 'Partition'])
        df.to_csv(output_filename, index=False)

def preprocess_text(text):
    # Tokenize, remove stopwords, and non-alphabetic characters
    tokens = nltk.word_tokenize(text)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stopwords]

    # Perform stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)) for token in stemmed_tokens]

    return ' '.join(lemmatized_tokens)

def get_wordnet_pos(word):
    # Map POS tag to first character lemmatize() accepts
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def train_model(classifier, vectorizer):
    partitioner = GutenbergPartitioner()
    labeled_partitions = partitioner.split_into_partitions()
    partitioner.save_to_csv(labeled_partitions)

    # Load labeled partitions from CSV
    df = pd.read_csv('random_partitions.csv')

    # Preprocess the data
    df['Processed_Partition'] = df['Partition'].apply(preprocess_text)

    # Feature extraction using TF-IDF
    features = vectorizer.fit_transform(df['Processed_Partition']).toarray()
    print(features)
    labels = df['Book']
    authors = df['Author']  # Include authors as a separate column

    # Train a machine learning model
    classifier.fit(features, authors)  # Change labels to authors

    return classifier, vectorizer, authors

def evaluate_model(classifier, vectorizer, authors):
    # Load labeled partitions from CSV
    df = pd.read_csv('random_partitions.csv')

    # Preprocess the data
    df['Processed_Partition'] = df['Partition'].apply(preprocess_text)

    # Feature extraction using TF-IDF
    features = vectorizer.transform(df['Processed_Partition']).toarray()
    labels = df['Book']

    # Perform k-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    roc_auc_scores = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = authors[train_index], authors[test_index]  # Change labels to authors

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        roc_auc_scores.append(roc_auc_score(y_test, classifier.predict_proba(X_test), multi_class='ovr'))

    # Calculate bias and variance
    bias = 1 - np.mean(accuracy_scores)
    variance = np.var(accuracy_scores)

    return {
        'Bias': bias,
        'Variance': variance,
        'Accuracy': np.mean(accuracy_scores),
        'Precision': np.mean(precision_scores),
        'Recall': np.mean(recall_scores),
        'F1-score': np.mean(f1_scores),
        'ROC-AUC': np.mean(roc_auc_scores)
    }

def print_results_subset(df, test_labels, predictions, authors, start_idx, end_idx):
    print(f"\nSubset of results:")
    print("{:<10} {:<30} {:<30} {:<30}".format("Book", "True Author", "Predicted Book", "Predicted Author"))
    print("="*100)
    for i in range(start_idx, end_idx):
        book_label = df.loc[i, 'Book']
        true_author = authors[i]  # Extract true author from the authors list
        predicted_book = test_labels[i]  # Extract predicted book from the test_labels list
        predicted_author = predictions[i]
        print("{:<10} {:<30} {:<30} {:<30}".format(book_label, true_author, predicted_book, predicted_author))

def plot_bias_variance(classifiers, results):
    classifier_names = list(classifiers.keys())
    bias_values = [result['Bias'] for result in results.values()]
    variance_values = [result['Variance'] for result in results.values()]

    plt.figure(figsize=(10, 5))
    plt.plot(classifier_names, bias_values, marker='o', label='Bias')
    plt.plot(classifier_names, variance_values, marker='o', label='Variance')
    plt.xlabel('Classifier')
    plt.ylabel('Bias/Variance')
    plt.title('Bias and Variance of Classifiers')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def main():
    classifiers = {
        'Naive Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Support Vector Machine': SVC(probability=True),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'KNN': KNeighborsClassifier(),  # Add KNN Classifier
    }

    results = {}

    for name, classifier in classifiers.items():
        print(f"Evaluating {name}...")
        vectorizer = TfidfVectorizer(max_features=1000)
        trained_classifier, trained_vectorizer, authors = train_model(classifier, vectorizer)
        evaluation_results = evaluate_model(trained_classifier, trained_vectorizer, authors)
        results[name] = evaluation_results
        print(f"Results for {name}:")
        for metric, value in evaluation_results.items():
            print(f"{metric}: {value}")

        df = pd.read_csv('random_partitions.csv')
        df['Processed_Partition'] = df['Partition'].apply(preprocess_text)
        test_features = trained_vectorizer.transform(df['Processed_Partition']).toarray()
        test_labels = df['Book']
        predictions = trained_classifier.predict(test_features)

        # Print subset of results
        start_idx = 10
        end_idx = 30
        print_results_subset(df, test_labels, predictions, authors, start_idx, end_idx)

    plot_bias_variance(classifiers, results)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jainil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jainil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Jainil\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jainil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Evaluating Naive Bayes...


LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - 'C:\\Users\\Jainil/nltk_data'
    - 'd:\\Anaconda\\nltk_data'
    - 'd:\\Anaconda\\share\\nltk_data'
    - 'd:\\Anaconda\\lib\\nltk_data'
    - 'C:\\Users\\Jainil\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
