# MultiLabel Classification
- Apples and Basketball

---

## Import modules

In [1]:
# Standard
import pandas as pd

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

# ML
from sklearn.linear_model import LogisticRegression

# Train-Test split
from sklearn.model_selection import train_test_split

# AST
import ast

# Spacy
import spacy

# Make column transformer
from sklearn.compose import make_column_transformer

# Pipeline
from sklearn.pipeline import make_pipeline


## Import Toy Data

In [2]:
# Load the toy dataset from the Excel workbook; the last line displays the
# frame's (rows, columns) shape.
df = pd.read_excel('apples_and_basketball.xlsx')
df.shape

(12, 2)

In [3]:
# Display the loaded frame for a quick visual check of the Text/Label columns.
df

Unnamed: 0,Text,Label
0,I eat apple,['apple']
1,I play basketball,['basketball']
2,I eat apple and play basketball,"['apple', 'basketball']"
3,I will play basketball,['basketball']
4,I will eat apple,['apple']
5,I will eat apple and will play basketball,"['apple', 'basketball']"
6,I like to eat apple and like to play basketball,"['apple', 'basketball']"
7,I like to eat apple,['apple']
8,I like to play basketball,['basketball']
9,I played basketball,['basketball']


## Data Preparation

#### 1. Convert label lists (imported as strings) into Python lists

In [4]:
# Inspect the Python type of the first Label value as it was read from Excel.
type(df['Label'][0])

str

In [5]:
# Parse each stringified label list (e.g. "['apple']") into a real Python list,
# then confirm the type of the first parsed value.
df['label_ast'] = df['Label'].apply(ast.literal_eval)
type(df.loc[0, 'label_ast'])

list

In [6]:
# Preview the first two rows to compare the raw Label column with label_ast.
df.head(2)

Unnamed: 0,Text,Label,label_ast
0,I eat apple,['apple'],[apple]
1,I play basketball,['basketball'],[basketball]


#### 2. Label Counts

In [7]:
# Number of labels attached to each row (length of the parsed label list).
df['label_count'] = df['label_ast'].map(len)
df.head(n=3)

Unnamed: 0,Text,Label,label_ast,label_count
0,I eat apple,['apple'],[apple],1
1,I play basketball,['basketball'],[basketball],1
2,I eat apple and play basketball,"['apple', 'basketball']","[apple, basketball]",2


#### 3. Spacy Stopwords

In [8]:
from spacy.cli.download import download

# Fetch the large English model only when it is not already installed, so
# re-running the notebook does not repeat the (large) download.
if not spacy.util.is_package("en_core_web_lg"):
    download(model="en_core_web_lg")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [9]:
# Load the downloaded 'en_core_web_lg' model; the bare name on the last line
# displays the loaded object.
nlp_lg = spacy.load('en_core_web_lg')
nlp_lg

<spacy.lang.en.English at 0x1afeb7cd348>

In [10]:
# spaCy's built-in English stop-word set.
# Imported explicitly from its module: reaching it through attribute access on
# `spacy.lang.en` only resolves if some earlier cell has already caused that
# subpackage to be imported, which makes this cell order-dependent.
from spacy.lang.en.stop_words import STOP_WORDS as sp_stopwords

print('Type   : ', type(sp_stopwords))
print('Length : ', len(sp_stopwords))

Type   :  <class 'set'>
Length :  326


#### 4. Vectorize Features using TFIDF

In [11]:
# TF-IDF vectorizer using scikit-learn's built-in 'english' stop-word list.
# The spaCy stop words collected in `sp_stopwords` are not used here; they are
# intended for later use.
tfidf = TfidfVectorizer(stop_words='english')

In [12]:
# Query the vectorizer's parameters; the trailing semicolon suppresses the
# cell output.
tfidf.get_params();

In [13]:
# Build a column transformer that applies the TF-IDF vectorizer to the
# "Text" column.

preprocessor = make_column_transformer((tfidf, "Text"))

## Pipeline

In [14]:
# Wrap the preprocessor in a pipeline and fit it on the full frame in one
# step (fit returns the pipeline itself), then display the fitted pipeline.
model = make_pipeline(preprocessor).fit(df)
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('tfidfvectorizer',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'Text')]))])

In [15]:
# List the vocabulary terms learned by the TF-IDF step of the fitted
# preprocessor.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() — confirm the installed
# version before upgrading.
preprocessor.named_transformers_.tfidfvectorizer.get_feature_names()

['apple', 'ate', 'basketball', 'eat', 'like', 'play', 'played']

In [16]:
# TF-IDF features for df. Only the "Text" column is vectorized; every other
# column is dropped because the "remainder" option of make_column_transformer
# defaults to "drop" rather than "passthrough".
tfidf_processed_csr = model.transform(df)

# ColumnTransformer may return either a scipy sparse matrix or a dense
# ndarray depending on how sparse the stacked output is (sparse_threshold).
# pd.DataFrame cannot lay a sparse matrix out against named columns, so
# densify first when needed. This frame is for inspection only, which is fine
# for toy-sized data.
tfidf_dense = (
    tfidf_processed_csr.toarray()
    if hasattr(tfidf_processed_csr, "toarray")
    else tfidf_processed_csr
)

# Look the vocabulary up once and reuse it as the column labels.
feature_names = preprocessor.named_transformers_.tfidfvectorizer.get_feature_names()
df_processed = pd.DataFrame(tfidf_dense, columns=feature_names)
df_processed.shape

(12, 7)

In [18]:
# Preview the first three rows of the TF-IDF feature frame.
df_processed.head(3)

Unnamed: 0,apple,ate,basketball,eat,like,play,played
0,0.645328,0.0,0.0,0.763905,0.0,0.0,0.0
1,0.0,0.0,0.645328,0.0,0.0,0.763905,0.0
2,0.456316,0.0,0.456316,0.540163,0.0,0.540163,0.0
