**Multiclass classifier**

In [3]:
# Dataset

from google.colab import drive
from sklearn.datasets import load_files

drive.mount("/content/drive")

# Raw data (BCC article datasets) obtained from the Insight Project
# http://mlg.ucd.ie/datasets/bbc.html
loaded_data = load_files("/content/drive/My Drive/data/bbc")

raw_dataset, y, y_names = loaded_data.data, loaded_data.target, loaded_data.target_names

print("Number of documents in the dataset:", len(raw_dataset))
print("Labels (automatically generated from subfolder names):")
for label in y_names:
  print("\t", label)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of documents in the dataset: 2225
Labels (automatically generated from subfolder names):
	 business
	 entertainment
	 politics
	 sport
	 tech


In [4]:
# Text preprocessing

import nltk
from nltk.tokenize import regexp_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")

dataset = []
stemmer = SnowballStemmer("english")
stopwords_en = stopwords.words("english")

for i in range(0, len(raw_dataset)):
  tokens = regexp_tokenize(str(raw_dataset[i]), r"\w+")
  stems = [stemmer.stem(token) for token in tokens]
  words_no_stopwords = [word for word in stems if word not in stopwords_en]
  document = ' '.join(words_no_stopwords)
  dataset.append(document)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Feature extraction (converting text to vectors)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(dataset).toarray()

In [6]:
# Split training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
# Train model

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [8]:
# Model evaluation

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 97   0   2   0   3]
 [  2  80   1   1   0]
 [  4   0  73   1   0]
 [  0   0   0 103   0]
 [  0   2   0   0  76]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       102
           1       0.98      0.95      0.96        84
           2       0.96      0.94      0.95        78
           3       0.98      1.00      0.99       103
           4       0.96      0.97      0.97        78

    accuracy                           0.96       445
   macro avg       0.96      0.96      0.96       445
weighted avg       0.96      0.96      0.96       445

0.9640449438202248


In [9]:
# Predict unseen data

unseen_sentence = input("Enter a sentence: ")
X_unseen = vectorizer.transform([unseen_sentence]).toarray()
y_unseen = classifier.predict(X_unseen)

print("The pedicted class for that sentence is:", y_names[y_unseen[0]])

Enter a sentence: Messi scores a goal
The pedicted class for that sentence is: sport


In [11]:
# Model serialization

import pickle

with open("multiclass_classifier.pickle", "wb") as pickle_file:
  pickle.dump(classifier, pickle_file)

with open("multiclass_classifier.pickle", "rb") as serialized_model:
  loaded_model = pickle.load(serialized_model)

y_pred2 = loaded_model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 

[[ 97   0   2   0   3]
 [  2  80   1   1   0]
 [  4   0  73   1   0]
 [  0   0   0 103   0]
 [  0   2   0   0  76]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       102
           1       0.98      0.95      0.96        84
           2       0.96      0.94      0.95        78
           3       0.98      1.00      0.99       103
           4       0.96      0.97      0.97        78

    accuracy                           0.96       445
   macro avg       0.96      0.96      0.96       445
weighted avg       0.96      0.96      0.96       445

0.9640449438202248
