**Multiclass classifier**

In [None]:
# Importing dataset and labels

from google.colab import drive
from sklearn.datasets import load_files

# Make the Google Drive contents available to the Colab runtime.
drive.mount("/content/drive", force_remount=True)

# Raw data: BBC article datasets obtained from the Insight Project
# (http://mlg.ucd.ie/datasets/bbc.html).
loaded_data = load_files("/content/drive/My Drive/Colab Notebooks/data/bbc")

# "y" is the conventional variable name for labels
# (a.k.a. classes, categories, or tags).
raw_dataset = loaded_data.data
y = loaded_data.target
y_names = loaded_data.target_names

# Quick summary of what was loaded.
print("Number of documents in the dataset:", len(raw_dataset))
print("Labels:")
for label in y_names:
  print("\t", label)

Mounted at /content/drive


In [None]:
# Text preprocessing

import nltk
from nltk.tokenize import regexp_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")

# Turn every raw article into a single space-separated string of stemmed,
# stopword-free tokens, ready for the vectorizer.
dataset = []
stemmer = SnowballStemmer("english")
stopwords_en = stopwords.words("english")
# Set copy of the stopword list: membership is tested once per token.
stopwords_lookup = set(stopwords_en)

for raw_document in raw_dataset:
  # load_files() was called without an encoding, so documents are expected to be
  # bytes. str(bytes) returns the repr ("b'...\\n...'"), which leaks a stray "b"
  # token and glues the "n" of every escaped newline onto the following word.
  # Decode instead; undecodable bytes are replaced rather than raising.
  if isinstance(raw_document, bytes):
    text = raw_document.decode("utf-8", errors="replace")
  else:
    text = str(raw_document)

  tokens = regexp_tokenize(text, r"\w+")
  stems = [stemmer.stem(token) for token in tokens]
  # Stopwords are filtered after stemming, i.e. on the stemmed form of each token.
  words_no_stopwords = [word for word in stems if word not in stopwords_lookup]
  dataset.append(' '.join(words_no_stopwords))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Feature extraction (converting text to vectors)

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the preprocessed documents and transform them in one pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset)

# "X" is the conventional variable name for features; convert the result to an array.
X = tfidf_matrix.toarray()

In [None]:
# Split training and testing sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Training model

from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that Restart & Run All reproduces the same model and metrics;
# without random_state every run builds different trees.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predictions on the held-out test set, evaluated in the next cell.
y_pred = classifier.predict(X_test)

In [None]:
# Model evaluation

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute each metric on the held-out predictions, then print them in order:
# confusion matrix, per-class report, overall accuracy.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

for metric in (test_confusion, test_report, test_accuracy):
  print(metric)

[[ 97   0   2   0   3]
 [  2  80   0   1   1]
 [  3   0  74   1   0]
 [  1   0   0 102   0]
 [  0   1   0   1  76]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       102
           1       0.99      0.95      0.97        84
           2       0.97      0.95      0.96        78
           3       0.97      0.99      0.98       103
           4       0.95      0.97      0.96        78

    accuracy                           0.96       445
   macro avg       0.96      0.96      0.96       445
weighted avg       0.96      0.96      0.96       445

0.9640449438202248


In [None]:
# Predict unseen data

# Classify a sentence typed by the user with the trained model.
unseen_sentence = input("Enter a sentence: ")

# Apply the same tokenise -> stem -> stopword-filter steps that were applied to the
# training documents. The vectorizer's vocabulary consists of stemmed tokens, so a
# raw sentence would miss every word whose stem differs from its surface form.
unseen_tokens = regexp_tokenize(unseen_sentence, r"\w+")
unseen_stems = [stemmer.stem(token) for token in unseen_tokens]
unseen_document = ' '.join(stem for stem in unseen_stems if stem not in stopwords_en)

# transform() (not fit_transform) reuses the vocabulary learned during training.
X_unseen = vectorizer.transform([unseen_document]).toarray()
y_unseen = classifier.predict(X_unseen)

# y_unseen holds a numeric label; y_names maps it back to the category name.
print("The predicted class for that sentence is:", y_names[y_unseen[0]])

Enter a sentence:Messi scores a goal
The prediction label for that sentence is: sport


In [None]:
# Model serialization

import pickle

# Persist the fitted classifier.
with open('multiclass_classifier.pickle', 'wb') as pickle_file:
  pickle.dump(classifier, pickle_file)

# The classifier only accepts features produced by the fitted TF-IDF vectorizer,
# so store the vectorizer as well; otherwise the saved model cannot be applied to
# new text in a fresh session.
with open('multiclass_vectorizer.pickle', 'wb') as pickle_file:
  pickle.dump(vectorizer, pickle_file)

# NOTE: pickle.load can execute arbitrary code. Only load pickle files that you
# created yourself or otherwise trust.
with open('multiclass_classifier.pickle', 'rb') as serialized_model:
  loaded_model = pickle.load(serialized_model)

# Round-trip check: the reloaded model must reproduce the original predictions.
y_pred2 = loaded_model.predict(X_test)
assert (y_pred2 == y_pred).all(), "reloaded model predictions differ from the original"

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))