In [1]:
import nltk
from nltk.corpus import gutenberg 
from nltk.corpus import stopwords
import pandas as pd  
import numpy as np 
import random 
import re
nltk.download('gutenberg')
nltk.download('stopwords')

def generate_data(books):
    content= [] 
    
    for book in books: 
        a = gutenberg.raw(book)
        content.append(a)

    return content

books = ['austen-emma.txt', 'shakespeare-hamlet.txt', 'melville-moby_dick.txt', 'bible-kjv.txt']

contents = generate_data(books)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import pandas as pd 
import re 
import numpy as np 
import random


def processing(books, num_partitions, words_per_partition ):
    def get_data(text, num_partitions, words_per_partition): 
        """
            Dividing the strings in the partitions
        """
        words = re.findall(r'\b\w+\b', text)
        total_words = len(words)
        partitions = [] 

        for i in range(num_partitions):
            start_index = i * words_per_partition
            end_index = (i + 1) * words_per_partition
            partition_words = words[start_index:end_index]
            partition_text = ' '.join(partition_words)
            partitions.append(partition_text)
        
        return partitions

    output_df = [] 
    """
        Looping through the books
    """
    for i, book in enumerate(books):
        #with open(book, 'r', encoding='utf-8') as file:
        book_text = book
        
        partitions = get_data(book_text, num_partitions, words_per_partition)
        labels = [f'{chr(ord("a") + i)}' for _ in range(len(partitions))]

        data = {"Label": labels, 'Partition': partitions, "Book": [f'Author {i+1}' for _ in range(len(partitions))]}
        df = pd.DataFrame(data)
        df1 = df.apply(np.random.permutation, axis=1)  
        output_df.append(df)

    final_df = pd.concat(output_df, ignore_index=True)
    return final_df

#### Output 
books = contents #['./book1.txt', './book2.txt', './book3.txt']
output = processing(books, 200, 100)
print(output)
output.to_csv('output_dataset.csv', index=False)

    Label                                          Partition      Book
0       a  Emma by Jane Austen 1816 VOLUME I CHAPTER I Em...  Author 1
1       a  her caresses and her place had been supplied b...  Author 1
2       a  together as friend and friend very mutually at...  Author 1
3       a  of any disagreeable consciousness Miss Taylor ...  Author 1
4       a  Weston was a man of unexceptionable character ...  Author 1
..    ...                                                ...       ...
795     d  Now therefore my son obey my voice according t...  Author 4
796     d  and I shall seem to him as a deceiver and I sh...  Author 4
797     d  put the skins of the kids of the goats upon hi...  Author 4
798     d  bless me 27 20 And Isaac said unto his son How...  Author 4
799     d  his hands were hairy as his brother Esau s han...  Author 4

[800 rows x 3 columns]


In [3]:
import pandas as pd 

df = pd.read_csv('D:\App Dev\Python_workspace\output_dataset.csv')

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(df['Partition'], df['Label'], test_size=0.2, random_state=42)

# Preprocessing: Remove stop words and garbage characters
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stop words
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Apply preprocessing to training data
train_data = train_data.apply(preprocess_text)

# Create a pipeline with CountVectorizer and Multinomial Naive Bayes
count_vectorizer_model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model with CountVectorizer
count_vectorizer_model.fit(train_data, train_labels)

# Make predictions on the test set
predictions_count_vectorizer = count_vectorizer_model.predict(test_data)

# Evaluate accuracy with CountVectorizer
accuracy_count_vectorizer = accuracy_score(test_labels, predictions_count_vectorizer)
print(f"Accuracy with CountVectorizer: {accuracy_count_vectorizer}")

# Create a pipeline with TF-IDF Vectorizer and Multinomial Naive Bayes
tfidf_vectorizer_model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model with TF-IDF Vectorizer
tfidf_vectorizer_model.fit(train_data, train_labels)

# Make predictions on the test set
predictions_tfidf_vectorizer = tfidf_vectorizer_model.predict(test_data)

# Evaluate accuracy with TF-IDF Vectorizer
accuracy_tfidf_vectorizer = accuracy_score(test_labels, predictions_tfidf_vectorizer)
print(f"Accuracy with TF-IDF Vectorizer: {accuracy_tfidf_vectorizer}")

Accuracy with CountVectorizer: 0.0
Accuracy with TF-IDF Vectorizer: 0.0
