# Dummy multiclass sentence classification

## Split example text (from 'Decline and Fall' by E. Waugh) into sentences and assign one of four random labels to each sentence

In [1]:
import nltk.tokenize
import random
from collections import Counter

In [2]:
# Load the whole example text into memory as a single string.
# NOTE(review): this is an absolute, machine-specific path -- the notebook only
# runs where this file exists.
TEXT_PATH = '/home/sdevert/paul.txt'
with open(TEXT_PATH, 'r') as text_file:
    raw_text = text_file.read()

In [3]:
# Break the raw text into sentences with NLTK's sentence tokenizer.
tokenized = nltk.tokenize.sent_tokenize(raw_text)

In [4]:
# Give every sentence a pseudo-random class label in 1..4 (both bounds inclusive).
# A dedicated generator with a fixed seed is used instead of the unseeded global
# `random` state, so the same labels are produced on every Restart & Run All.
label_rng = random.Random(42)
labels = [label_rng.randint(1, 4) for _ in tokenized]

## Check dataset length and the number of members in each class

In [5]:
# Sanity check: there must be exactly one label per sentence.
len(labels) == len(tokenized)

True

In [6]:
# Number of labelled sentences in the dataset.
len(labels)

140

In [7]:
# Tally how many sentences received each label.
c = Counter(labels)

In [8]:
# Display the per-class counts.
c

Counter({3: 37, 4: 45, 1: 26, 2: 32})

## What it looks like

### This is a multiclass, not a multilabel, classification task for simplicity

In [9]:
import pandas as pd

In [10]:
# One row per sentence: the sentence text next to its randomly assigned label.
df = pd.DataFrame({'text': tokenized, 'label': labels})

In [11]:
# Preview only the first ten rows; a bare `df` would render the whole frame
# (one row per sentence), which buries the narrative.
df.head(10)

Unnamed: 0,text,label
0,"Mr Sniggs, the Junior Dean, and Mr Postlethwa...",3
1,From the rooms of Sir Alastair Digby-Vaine-Tru...,4
2,They alone of the senior members of Scone were...,1
3,The others were all scattered over Boar's Hill...,3
4,It is not accurate to call this an annual even...,2
5,There is tradition behind the Bollinger; it nu...,2
6,"At the last dinner, three years ago, a fox had...",3
7,What an evening that had been!,4
8,"This was the first meeting since then, and fro...",1
9,For two days they had been pouring into Oxford...,4


## Vectorizing data

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
# Bag-of-words vectorizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [14]:
# Learn the vocabulary from every sentence and build the sentence-by-term count matrix.
X = vectorizer.fit_transform(tokenized)

In [15]:
# Hold out 15% of the sentences as a test set; random_state fixes the split.
# NOTE(review): the vectorizer was fitted on all sentences before this split, so
# the vocabulary includes words that occur only in the test set -- confirm this is
# acceptable for a demo.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.15, random_state=42)

# Multiclass

## Logreg

### Note that the classes are assigned randomly, so the results are expected to be poor

In [42]:
# Logistic regression with the multinomial formulation requested explicitly
# (multi_class='multinomial'), fitted with the newton-cg solver; random_state is
# fixed to 0.
model = linear_model.LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')

In [43]:
# Fit the logistic regression on the training split.
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
# Record the scikit-learn version used for this run.
# The package name `sklearn` is not bound by the `from sklearn... import ...`
# statements used elsewhere in this notebook, so it is imported here; without
# this the cell raises NameError on a fresh kernel.
import sklearn
sklearn.__version__

'0.21.2'

In [44]:
# Predict labels for the held-out sentences with the logistic regression.
y_pred = model.predict(X_test)

In [45]:
# Per-class precision, recall and F1 of the logistic regression on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.11      0.50      0.18         2
           3       0.00      0.00      0.00         7
           4       0.67      0.40      0.50        10

    accuracy                           0.24        21
   macro avg       0.19      0.23      0.17        21
weighted avg       0.33      0.24      0.26        21



# Multilabel-capable classifiers (trained here on the same single-label multiclass data)

## Random forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Shallow random forest (trees limited to depth 2) with a fixed seed.
# n_estimators is pinned to 10 -- the value this run actually used, as shown in the
# repr printed after fitting -- because scikit-learn 0.22 raised the default to
# 100. Being explicit keeps the model unchanged after an upgrade and avoids the
# FutureWarning that 0.21 emits when the argument is omitted.
random_forest = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)

In [29]:
# Same call, with the same arguments, as the split made in the "Vectorizing data"
# section, so it reproduces that train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.15, random_state=42)

In [30]:
# Fit the random forest on the training split.
random_forest.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [31]:
# Predict labels for the held-out sentences with the random forest
# (this rebinds y_pred).
y_pred = random_forest.predict(X_test)

In [32]:
# Per-class precision, recall and F1 of the random forest on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         7
           4       0.47      0.90      0.62        10

    accuracy                           0.43        21
   macro avg       0.12      0.23      0.16        21
weighted avg       0.23      0.43      0.30        21



  'precision', 'predicted', average, warn_for)


# Other ensemble classification

## Gradient boosting: Multiclass as One-Vs-The-Rest

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
# Gradient boosting with 100 depth-1 trees and learning_rate=1.0;
# random_state is fixed to 0.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [35]:
# Fit the gradient-boosting model on the training split.
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=1.0, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [36]:
# Predict labels for the held-out sentences with gradient boosting
# (this rebinds y_pred).
y_pred = gb.predict(X_test)

In [37]:
# Per-class precision, recall and F1 of the gradient-boosting model on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.17      0.50      0.25         2
           3       0.11      0.14      0.12         7
           4       0.50      0.20      0.29        10

    accuracy                           0.19        21
   macro avg       0.19      0.21      0.17        21
weighted avg       0.29      0.19      0.20        21

