<p class='main_title'>NLP with Multilabel Classification</p>
    <hr>

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import plotly.express as px

# Plot colors: a single-row frame (row label 'hex_code') with one column per color name
URBAN_PALETTE_CATEGORICAL = pd.DataFrame(
    data=[dict(
        cyan='#1696d2',
        gray='#d2d2d2',
        magenta='#ec008b',
        yellow='#fdbf11',
        dark='#332d2f',
        ocean='#0a4c6a',
    )],
    index=['hex_code'],
)


Bad key text.latex.preview in file /Users/baiochi/opt/anaconda3/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 123 ('text.latex.preview : False')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.1/matplotlibrc.template
or from the matplotlib source distribution

Bad key mathtext.fallback_to_cm in file /Users/baiochi/opt/anaconda3/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 155 ('mathtext.fallback_to_cm : True  # When True, use symbols from the Computer Modern')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.1/matplotlibrc.template
or from the matplotlib source distribution

Bad key savefig.jpeg_quality in file /Users/baiochi/opt/anaconda3/lib/python3.9/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 418 ('savefig.jpeg_quality: 95       # when a jpeg is saved, 

# Analysis

## Reading file

In [2]:
# Load the questions/tags dataset (path is relative to the notebook's working directory)
# and print its column/dtype summary
df = pd.read_csv(filepath_or_buffer='data/stackoverflow_questions_pt.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Perguntas  5408 non-null   object
 1   Tags       5408 non-null   object
dtypes: object(2)
memory usage: 84.6+ KB


In [3]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Perguntas,Tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js
1,"Gostaria de fazer testes unitários no Node.js,...",node.js
2,Como inverter a ordem com que o jQuery itera u...,jquery
3,Eu tenho uma página onde pretendo utilizar um ...,html
4,Como exibir os dados retornados do FireStore e...,html angular


## Exploratory Data Analysis

### Counting Multilabels

In [225]:
# How often each exact tag combination (the raw 'Tags' string) occurs
tag_combo_counts = df['Tags'].value_counts()

combo_fig = px.bar(
    tag_combo_counts,
    title='Value counts for each label combination',
)
# update_traces returns the figure, so it is still the cell's displayed output
combo_fig.update_traces(
    opacity=0.8,
    marker_color='#1696d2',
    marker_line_width=1.5,
    marker_line_color='rgb(8,48,107)',
)

### Creating dummies with `MultiLabelBinarizer`

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split each whitespace-separated 'Tags' string into a list of labels and
# binarize: one 0/1 column per distinct label, rows aligned with df
mlb = MultiLabelBinarizer()
tag_indicator_matrix = mlb.fit_transform(df['Tags'].map(str.split))

tag_counts = pd.DataFrame(
    tag_indicator_matrix,
    index=df.index,
    columns=mlb.classes_,
)

tag_counts.head()

Unnamed: 0,angular,html,jquery,node.js
0,0,0,0,1
1,0,0,0,1
2,0,0,1,0
3,0,1,0,0
4,1,1,0,0


In [87]:
# Store each question's row of label indicators as a plain Python list in 'Target'
df['Target'] = tag_counts.to_numpy().tolist()

### Label distribution

In [239]:
# Number of questions carrying each label, as a tidy (label, count) frame;
# 'label' is stored as a categorical column
labels_sums = (
    tag_counts
    .sum()
    .rename_axis('label')
    .reset_index(name='count')
    .assign(label=lambda frame: frame['label'].astype('category'))
)
labels_sums

Unnamed: 0,label,count
0,angular,929
1,html,2345
2,jquery,2444
3,node.js,641


In [242]:
# Bar chart of how many questions carry each individual label.
# Plotly picks colors from the sequence by integer position, so hand it a plain
# list of hex strings (first four palette colors) instead of a pandas Series
# indexed by color name.
label_colors = URBAN_PALETTE_CATEGORICAL.loc['hex_code'].iloc[:4].tolist()

px.bar(
    labels_sums, 
    x='label',
    y='count', 
    color='label',
    color_discrete_sequence=label_colors,
    title='Labels distribuition',
    height=600,
    width=900
).update_traces(
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5, 
    opacity=0.8
).update_xaxes(type='category')


### Labels correlation

In [189]:
# Pairwise correlation between the label indicator columns, drawn as a heatmap
# with the color range fixed to [-1, 1]
label_corr = tag_counts.corr()

corr_fig = px.imshow(
    label_corr,
    title='Label correlation Heatmap',
    range_color=(-1,1),
    color_continuous_scale='spectral',
)
corr_fig.update_traces(opacity=0.8)

### Word distribution

In [165]:
# Distribution of question length measured in words (whitespace-separated tokens),
# which is what the section heading and chart title describe. Calling .str.len()
# directly on the text would count characters instead.
words_per_question = df['Perguntas'].str.split().str.len()

fig = px.histogram(
    words_per_question,
    nbins=1000, height=500, width=900, 
    # Plain list: Plotly picks colors from the sequence by integer position
    color_discrete_sequence=URBAN_PALETTE_CATEGORICAL.loc['hex_code'].tolist(),
    title='Word counts in Perguntas ')
fig.show();

## Machine learning

In [263]:
def run_pipeline(transformer, estimator, df=df, **t_params):
    """Fit a TF-IDF + multilabel classifier pipeline and report train/test metrics.

    The frame is split 80/20 (``random_state=42``), the whitespace-separated
    ``'Tags'`` strings are binarized into an indicator matrix, and a pipeline of
    ``TfidfVectorizer`` followed by ``transformer(estimator(), **t_params)`` is
    fitted on the training split. The pipeline score and the Hamming loss are
    printed for both splits.

    Parameters
    ----------
    transformer : callable
        Multilabel wrapper; called as ``transformer(estimator(), **t_params)``.
    estimator : class
        Base estimator class, instantiated with no arguments. Its ``__name__``
        is used to build the pipeline step name (``'clf_' + estimator.__name__``).
    df : pandas.DataFrame
        Must provide a ``'Perguntas'`` text column and a ``'Tags'`` column of
        whitespace-separated labels. The default is bound to the module-level
        ``df`` that exists when this function is defined.
    **t_params
        Extra keyword arguments forwarded to ``transformer``.

    Returns
    -------
    dict
        ``'pipeline'``: the fitted pipeline;
        ``'accuracy'``: ``[train, test]`` values of ``pipeline.score``;
        ``'hamm-loss'``: ``[train, test]`` Hamming losses.
    """
    
    # Train/test split
    X = df['Perguntas']
    y = df['Tags'].apply(str.split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Preprocessing target labels
    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(y_train)
    # Only transform the test labels: refitting on the test split could yield a
    # different set/order of label columns than the model was trained on.
    y_test = mlb.transform(y_test)
    
    # Create Pipeline and fit model
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=5000, 
            max_df=0.85) 
        ),
        ('clf_'+estimator.__name__, transformer(estimator(), **t_params)),
    ]).fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    # Model accuracy
    train_acc = pipeline.score(X_train, y_train)
    test_acc = pipeline.score(X_test, y_test)
    print(f'Accuracy score for Train dataset: {train_acc*100:.2f}%')
    print(f'Accuracy score for Test dataset:  {test_acc*100:.2f}%')
    
    # Hamming Loss metrics
    hl_score_train = hamming_loss(y_train, y_pred_train)
    hl_score_test = hamming_loss(y_test, y_pred_test)
    print(f'Train dataset Hamming-loss: {hl_score_train:.2f}')
    print(f'Test dataset Hamming-loss:  {hl_score_test:.2f}')
    
    return {
        'pipeline'  : pipeline,
        'accuracy'  : [train_acc, test_acc],
        'hamm-loss' : [hl_score_train, hl_score_test]
    }

### Split data and Preprocessing

In [118]:
# Train/test split: question text as features, list of tags per question as target
from sklearn.model_selection import train_test_split
X = df['Perguntas']
y = df['Tags'].apply(str.split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing target labels: learn the label columns on the training split only
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
# Reuse the training-fitted binarizer; refitting on the test split could produce
# a different set/order of label columns than the models are trained on.
y_test = mlb.transform(y_test)

### Multinomial Naive Bayes

In [122]:
# TfidfVectorizer is used by this and every later pipeline cell, so it is
# imported here at its first use.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Instance estimator
mnb = MultinomialNB()
# Create Pipeline and fit model: TF-IDF text features -> one-vs-rest Naive Bayes
mnb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        max_df=0.85,)
    ),
    ('clf_ovr', OneVsRestClassifier(mnb)),
]).fit(X_train, y_train)
# Model score on each split, printed as a percentage
print(f'Score for Train dataset: {mnb_pipeline.score(X_train, y_train)*100:.2f}%')
print(f'Score for Test dataset:  {mnb_pipeline.score(X_test, y_test)*100:.2f}%')

Score for Train dataset: 47.39%
Score for Test dataset:  28.84%


### Logistic Regression

In [138]:
from sklearn.linear_model import LogisticRegression

# Base learner; wrapped one-vs-rest below
lr = LogisticRegression()

# TF-IDF text features feeding the one-vs-rest logistic regression
lr_steps = [
    ('tfidf', TfidfVectorizer(max_features=5000, max_df=0.85)),
    ('clf_ovr', OneVsRestClassifier(lr)),
]
lr_pipeline = Pipeline(lr_steps)
lr_pipeline.fit(X_train, y_train)

# Pipeline score on each split, printed as a percentage
lr_train_score = lr_pipeline.score(X_train, y_train)
lr_test_score = lr_pipeline.score(X_test, y_test)
print(f'Score for Train dataset: {lr_train_score*100:.2f}%')
print(f'Score for Test dataset:  {lr_test_score*100:.2f}%')

Score for Train dataset: 59.04%
Score for Test dataset:  40.39%


### Hamming-loss metrics for Multilabel problem

In [148]:
from sklearn.metrics import hamming_loss

# Predictions of the logistic-regression pipeline on both splits
y_pred_train, y_pred_test = (
    lr_pipeline.predict(features) for features in (X_train, X_test)
)

# Hamming loss of those predictions against the binarized targets
hl_score_train = hamming_loss(y_train, y_pred_train)
hl_score_test = hamming_loss(y_test, y_pred_test)

print(f'Train dataset Hamming-loss: {hl_score_train:.2f}')
print(f'Test dataset Hamming-loss:  {hl_score_test:.2f}')

Train dataset Hamming-loss: 0.12
Test dataset Hamming-loss:  0.19


### Classifier Chain

In [248]:
!pip install scikit-multilearn;

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
[0m

In [251]:
from skmultilearn.problem_transform import ClassifierChain

# TF-IDF text features feeding a classifier chain built on the logistic regression
chain_steps = [
    ('tfidf', TfidfVectorizer(max_features=5000, max_df=0.85)),
    ('clf_chain', ClassifierChain(lr)),
]
chain_pipeline = Pipeline(chain_steps)
chain_pipeline.fit(X_train, y_train)

# Pipeline score on each split, printed as a percentage
chain_train_acc = chain_pipeline.score(X_train, y_train)
chain_test_acc = chain_pipeline.score(X_test, y_test)
print(f'Accuracy score for Train dataset: {chain_train_acc*100:.2f}%')
print(f'Accuracy score for Test dataset:  {chain_test_acc*100:.2f}%')

# Predictions on both splits, then their Hamming loss against the binarized targets
y_pred_train, y_pred_test = (
    chain_pipeline.predict(features) for features in (X_train, X_test)
)
hl_score_train = hamming_loss(y_train, y_pred_train)
hl_score_test = hamming_loss(y_test, y_pred_test)

print(f'Train dataset Hamming-loss: {hl_score_train:.2f}')
print(f'Test dataset Hamming-loss:  {hl_score_test:.2f}')

Accuracy score for Train dataset: 65.35%
Accuracy score for Test dataset:  49.54%
Train dataset Hamming-loss: 0.14
Test dataset Hamming-loss:  0.21


### Binary Relevance

In [264]:
from skmultilearn.problem_transform import BinaryRelevance

# Binary Relevance over logistic regression, fitted and scored by run_pipeline
br_clf = run_pipeline(BinaryRelevance, LogisticRegression)

Accuracy score for Train dataset: 59.04%
Accuracy score for Test dataset:  40.39%
Train dataset Hamming-loss: 0.12
Test dataset Hamming-loss:  0.19


### ML-KNN

In [268]:
from skmultilearn.adapt import MLkNN

# NOTE(review): when last executed, this cell stopped with
# "TypeError: __init__() takes 1 positional argument but 2 were given" (see the
# cell output), so none of the metrics below were produced. The traceback is not
# shown; presumably the failure comes from inside MLkNN or one of its
# dependencies rather than from the code in this cell — TODO confirm and pin
# compatible library versions.

# Create Pipeline and fit model: TF-IDF text features feeding an MLkNN classifier
# constructed with its default arguments
knn_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000, 
        max_df=0.85) 
    ),
    ('mlknn', MLkNN()),
])
knn_pipeline.fit(X_train, y_train)
# Make predictions on both splits
y_pred_train = knn_pipeline.predict(X_train)
y_pred_test = knn_pipeline.predict(X_test)
# Model accuracy: the pipeline's score on each split, printed as a percentage
train_acc = knn_pipeline.score(X_train, y_train)
test_acc = knn_pipeline.score(X_test, y_test)
print(f'Accuracy score for Train dataset: {train_acc*100:.2f}%')
print(f'Accuracy score for Test dataset:  {test_acc*100:.2f}%')
# Hamming Loss metrics of the predictions against the binarized targets
hl_score_train = hamming_loss(y_train, y_pred_train)
hl_score_test = hamming_loss(y_test, y_pred_test)
print(f'Train dataset Hamming-loss: {hl_score_train:.2f}')
print(f'Test dataset Hamming-loss:  {hl_score_test:.2f}')

TypeError: __init__() takes 1 positional argument but 2 were given

# Discussion