In [2]:
# load text
filename = '/kaggle/input/bbc-full-text-document-classification/bbc_data.csv'
file = open(filename, 'rt')
text = file.read()
file.close()

In [3]:
# Import necessary modules

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def preprocess_text(tokens):
    # Remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # Filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    
    return tokens

In [5]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
 # load doc
 doc = load_doc(filename)
 # clean doc
 tokens = clean_doc(doc)
 # update counts
 vocab.update(tokens)

In [9]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
    """
    Load documents from a directory based on whether they are for training or not.

    Args:
    - directory: Path to the directory containing documents.
    - vocab: Set or dictionary to store vocabulary.
    - is_train: Boolean indicating whether the documents are for training or not.
    """
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set if is_train is True
        if is_train and filename.startswith('cv9'):
            continue
        # skip any reviews not in the test set if is_train is False
        if not is_train and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = os.path.join(directory, filename)
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

In [14]:
from collections import Counter
import pandas as pd

# Function to load documents from a CSV file into a vocabulary
def process_docs_from_csv(csv_file, text_column, vocab, is_train):
    """
    Load documents from a CSV file and add them to a vocabulary.

    Args:
    - csv_file: Path to the CSV file containing text data.
    - text_column: Name of the column containing text data.
    - vocab: Counter object to store vocabulary.
    - is_train: Boolean indicating whether the documents are for training or not.
    """
    # Read CSV file using pandas
    data = pd.read_csv(csv_file)

    # Iterate through each row in the DataFrame
    for index, row in data.iterrows():
        # Extract text from the specified column
        text = row[text_column]

        # Tokenize the text (assuming words are separated by whitespace)
        tokens = text.split()

        # Update the vocabulary with the tokens
        vocab.update(tokens)

# Define vocab
vocab = Counter()

# Path to the CSV file and the column containing text data
csv_file = '/kaggle/input/bbc-full-text-document-classification/bbc_data.csv'
text_column = 'data'

# Add all docs to vocab
process_docs_from_csv(csv_file, text_column, vocab, True)

# Print the size of the vocab
print("Size of the vocabulary:", len(vocab))

# Print the top words in the vocab
print("Top 50 words in the vocabulary:")
print(vocab.most_common(50))


Size of the vocabulary: 64151
Top 50 words in the vocabulary:
[('the', 44485), ('to', 24800), ('of', 19833), ('and', 17904), ('a', 17139), ('in', 16487), ('for', 8637), ('is', 8448), ('that', 7528), ('The', 7196), ('on', 7148), ('was', 5992), ('be', 5740), ('with', 5107), ('has', 4933), ('said', 4900), ('it', 4860), ('have', 4718), ('as', 4652), ('will', 4398), ('at', 4386), ('by', 4363), ('are', 4332), ('he', 4219), ('from', 3473), ('not', 3329), ('-', 3195), ('Mr', 2979), ('his', 2825), ('an', 2663), ('but', 2609), ('its', 2603), ('would', 2557), ('had', 2553), ('which', 2551), ('been', 2464), ('they', 2432), ('their', 2347), ('were', 2256), ('I', 2246), ('more', 2138), ('this', 2126), ('also', 2098), ('who', 2021), ('up', 1825), ('about', 1723), ('we', 1709), ('people', 1695), ('than', 1601), ('or', 1596)]


In [22]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array

# Load the CSV file
csv_file = '/kaggle/input/bbc-full-text-document-classification/bbc_data.csv'
data = pd.read_csv(csv_file)

# Extract text data from the 'data' column
train_docs = data['data'].tolist()

# Create a tokenizer object
tokenizer = Tokenizer()
# Fit the tokenizer on the training documents
tokenizer.fit_on_texts(train_docs)
# Encode the training documents to sequences of integers
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# Pad sequences to ensure uniform length
max_length = max([len(s.split()) for s in train_docs])  # Find the maximum length
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')  # Pad sequences

# Define training labels
# Assuming 'label' column contains the labels for each document
ytrain = data['labels'].values


In [23]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [30]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['data'])
word_index = tokenizer.word_index

# Initialize LabelEncoder
le = LabelEncoder()
data['target'] = le.fit_transform(data['data'])

# Define problem
vocab_size = len(word_index) + 1
max_length = 100

# Pad sequences
padded_sequences = pad_sequences(tokenizer.texts_to_sequences(data['data']), maxlen=max_length)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=8))  # Removed input_length parameter
model.add(Flatten())
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, data['target'], epochs=10, batch_size=32, validation_split=0.2)

# Summarize the model
print(model.summary())

Epoch 1/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0011 - loss: -113.9706 - val_accuracy: 0.0000e+00 - val_loss: -1839.3181
Epoch 2/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.6602e-04 - loss: -5070.2280 - val_accuracy: 0.0000e+00 - val_loss: -25534.9863
Epoch 3/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0505e-04 - loss: -38751.1523 - val_accuracy: 0.0000e+00 - val_loss: -95461.6641
Epoch 4/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.6602e-04 - loss: -118854.4141 - val_accuracy: 0.0000e+00 - val_loss: -212611.2656
Epoch 5/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0505e-04 - loss: -243019.2188 - val_accuracy: 0.0000e+00 - val_loss: -373628.9375
Epoch 6/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 2.4946e-04 - loss:

None


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Read the CSV file
data = pd.read_csv('/kaggle/input/bbc-full-text-document-classification/bbc_data.csv')

# Handle NaN values
data.dropna(inplace=True)  # Remove rows with NaN values

# Split the data into features (X) and target (y)
X = data['data']
y = data['labels']  # Assuming 'labels' column contains the target labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_vec, y_train)

# Evaluate the classifier
y_pred = svm_classifier.predict(X_test_vec)
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

     business       0.95      0.95      0.95       103
entertainment       0.99      0.96      0.98        84
     politics       0.93      0.97      0.95        80
        sport       1.00      0.97      0.98        98
         tech       0.96      0.97      0.97        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder


# Read the CSV file
data = pd.read_csv('/kaggle/input/bbc-full-text-document-classification/bbc_data.csv')

# Handle NaN values
data.dropna(inplace=True)  # Remove rows with NaN values

# Split the data into features (X) and target (y)
X = data['data']
y = data['labels']  # Assuming 'labels' column contains the target labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
model.fit(X_train_padded, y_train_encoded, epochs=10, verbose=2)

# Evaluate
loss, acc = model.evaluate(X_test_padded, y_test_encoded, verbose=0)
print('Test Accuracy: %.2f%%' % (acc * 100))

Epoch 1/10
56/56 - 1s - 19ms/step - accuracy: 0.1753 - loss: 0.3762
Epoch 2/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -4.4992e+00
Epoch 3/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -2.4852e+01
Epoch 4/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -7.2474e+01
Epoch 5/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -1.4930e+02
Epoch 6/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -2.5146e+02
Epoch 7/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -3.7642e+02
Epoch 8/10
56/56 - 0s - 5ms/step - accuracy: 0.1697 - loss: -5.2540e+02
Epoch 9/10
56/56 - 0s - 4ms/step - accuracy: 0.1697 - loss: -6.9703e+02
Epoch 10/10
56/56 - 0s - 6ms/step - accuracy: 0.1697 - loss: -8.8720e+02
Test Accuracy: 18.88%


 18.88% were classified correctly by the model.

In [48]:
# Extracting sentences from the 'data' column
sentences = data['data'].tolist()

print('Total training sentences: %d' % len(sentences))

Total training sentences: 2225
