# Multi-Class and Multi-Label Classification
*Dr. Samah Fodeh*

# Multi-Class Classification
- One-vs-All
- One-vs-One

In [1]:
# Prepare the Data

# import newsGroups data
import sys
import os
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='train',
                          categories=('rec.autos',
                             'rec.sport.hockey',
                             'sci.med',
                             'sci.space'
                                     ),
                          remove=('headers', 'footers', 'quotes'))

#generate term frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
tf_vec = CountVectorizer (max_df=500,
                      min_df=0,
                      max_features =300,
                      ngram_range =(1,1),
                     stop_words='english')

tf_matrix=tf_vec.fit_transform(news.data)  #sparse matrix
print ("the data has %d rows and %d columns " % (tf_matrix.shape[0], tf_matrix.shape[1]))

import pandas as pd             #conver to full matrix
full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names_out())


the data has 2381 rows and 300 columns 


##ONE-vs-ALL

In [2]:
#One-vs-All (one-vs-Rest)
import numpy as np
t=np.asarray(news.target)   # true labels
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(full_matrix,t,random_state=50)

from sklearn.naive_bayes import GaussianNB as NB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf= LinearSVC(random_state=10)
#clf = NB()

y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("One-vs-All --> number of mislabels out of %d points in the test test: %d" % (xtest.shape[0],error ))




One-vs-All --> number of mislabels out of 596 points in the test test: 136


## ONE-vs-ONE

In [3]:
# One-vs-One (All-vs-All)

from sklearn.multiclass import OneVsOneClassifier
clf= LinearSVC(random_state=10)
#clf = NB()

y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("One-vs-One --> number of mislabels out of %d points in the test test: %d" % (xtest.shape[0],error ))

One-vs-One --> number of mislabels out of 596 points in the test test: 145


# Multi-Label Classification

##The three types of Problem Transformation for multi-label classification are:


1.  **Binary Relevance**
2.  **Classifier Chain**
3.  **Label Powerset**


## install the multi-label classification package

In [4]:
pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


## Binary Relevance

In [9]:
full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names_out())

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(full_matrix,t,train_size = .7,random_state=50)

print (x_train.shape)
print(y_train.shape)
print (x_test.shape)
print(y_test.shape)


(1666, 300)
(1666,)
(715, 300)
(715,)


In [12]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

acc = accuracy_score(y_test,predictions.toarray())
print("accuracy of binary relevance is %2.2f " % (acc))

accuracy of binary relevance is 0.67 




## Classifier Chain

In [14]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

acc = accuracy_score(y_test,predictions.toarray())
print("accuracy of chain classifier is %2.2f " % (acc))

accuracy of chain classifier is 0.67 


# Label Powerset

In [16]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

acc = accuracy_score(y_test,predictions.toarray())
print("accuracy of label powerset is %2.2f " % (acc))

accuracy of label powerset is 0.44 


