In [2]:
import os
import re
import random
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [3]:
# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove unwanted characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove white spaces
    text = text.strip()
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Join words back into text
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text

In [5]:
# Function to apply lemmatization and tokenization
def lemmatize_and_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [6]:
# Function to create random samples of 150 words
def create_samples(text, num_samples=200, words_per_sample=150):
    samples = []
    words = text.split()
    
    # Calculate maximum number of samples that can be created
    max_samples = len(words) // words_per_sample
    
    if max_samples < num_samples:
        print(f"Not enough words to create {num_samples} samples. Creating {max_samples} samples instead.")
        num_samples = max_samples
    
    # Create samples
    for i in range(num_samples):
        start_index = random.randint(0, len(words) - words_per_sample)
        sample = ' '.join(words[start_index:start_index + words_per_sample])
        samples.append(sample)
    
    return samples

In [15]:
# Function to process books
def process_books(book_paths):
    df_data = []
    
    for book_path in book_paths:

        book_name = os.path.basename(book_path)
        # Preprocess the book
        with open(book_path, 'r', encoding='utf-8') as f:
            text = f.read()
            preprocessed_text = preprocess_text(text)
        
        # Save preprocessed book
        preprocessed_file = os.path.splitext(book_path)[0] + "_data.txt"
        with open(preprocessed_file, 'w', encoding='utf-8') as f:
            f.write(preprocessed_text)
        
        # Apply lemmatization and tokenization
        lemmatized_text = lemmatize_and_tokenize(preprocessed_text)
        
        # Save lemmatized book
        lemmatized_file = os.path.splitext(book_path)[0] + "_lemmatized.txt"
        with open(lemmatized_file, 'w', encoding='utf-8') as f:
            f.write(lemmatized_text)
        
        # Create samples
        print(book_name)
        samples = create_samples(lemmatized_text)
        
        
        # Add samples to DataFrame
        for sample in samples:
            df_data.append((sample, book_name))

    
    # Create DataFrame
    df = pd.DataFrame(df_data, columns=['Sample', 'Book_Name'])
    
    return df

In [23]:
# Main function
def main():
    # Paths to input TXT files for seven books
    book_paths = [
        "Dataset/Murder in the Gunroom.txt",
        "Dataset/The Crime Club.txt",
        "Dataset/The Devil Doctor.txt",
        "Dataset/The House of Arrow.txt",
        "Dataset/The Wrong Letter.txt",
        "Dataset/The mystery of blue train.txt",
        "Dataset/Time Crime.txt"
    ]
    
    # Process books
    df = process_books(book_paths)
    
    # Display DataFrame
    print(df)
    print(df['Book_Name'].unique())
    print("----")
    
    # Save DataFrame to CSV file
    df.to_csv('book_samples.csv', index=False)

    # Read data from CSV file
    data = pd.read_csv('book_samples.csv')

    # Shuffle the data
    data_shuffled = shuffle(data,random_state=42)

    # Perform label encoding on the target variable
    label_encoder = LabelEncoder()
    y_actual= label_encoder.fit_transform(data_shuffled['Book_Name'])
    data_shuffled['Book_Name']=y_actual

    # Write the processed data to a new CSV file
    data_shuffled.to_csv('encoded_books_data.csv', index=False)

    print("Data processing completed and saved to processed_data.csv")

if __name__ == "__main__":
    main()

Murder in the Gunroom.txt
The Crime Club.txt
The Devil Doctor.txt
The House of Arrow.txt
The Wrong Letter.txt
Not enough words to create 200 samples. Creating 127 samples instead.
The mystery of blue train.txt
Time Crime.txt
Not enough words to create 200 samples. Creating 127 samples instead.
                                                 Sample  \
0     gresham couple day blowup late use rand said i...   
1     case arm like cordwood still saying nothing ey...   
2     asked gun river shop time umholtz rejuvenated ...   
3     probably attributable even barbarous people sc...   
4     monolithic fact officially attested indisputab...   
...                                                 ...   
1249  weapon want outland slave sort took sell big v...   
1250  kharanda spoken possibility agent skordran kir...   
1251  turned bowed two men white cloak slave noble l...   
1252  modifier wrong place youre chief duplicate par...   
1253  longlived race thin nose narrow bitter mouth l... 