# Classification
## Introduction
In the previous *vectorization* notebook, we presented some ways of vectorizing texts. Now we will show some ways of classifying them based on those vectorizations.

First we import the libraries and declare the paths we will need:

In [62]:
import re, string
import pandas as pd
from numpy import mean
from os.path import join
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from spacy.lang.en.stop_words import STOP_WORDS
from random import choice
from warnings import simplefilter

# Silence FutureWarning messages so they do not clutter the cell outputs.
simplefilter('ignore', FutureWarning)

# Relative path of the CSV file holding the tagged cards.
df_path = join('..', '..', 'data', 'cards-tags', 'tagged_cards.csv')

# Number of random hyperparameter configurations tried in the experiments below.
N_EXPERIMENTS = 1000

### Loading the data
We load the dataframe containing the tagged cards with their features:

In [52]:
# Read the tagged cards CSV located at df_path into a DataFrame.
cards_df = pd.read_csv(df_path)

# Report the number of rows and the column names, then preview the first rows.
print(f'{cards_df.shape[0]} rows in cards_df, columns:\n{cards_df.columns.values}\n')
cards_df.head()

3334 rows in cards_df, columns:
['name' 'oracle_text' 'oracleid' 'tag' 'type_line']



Unnamed: 0,name,oracle_text,oracleid,tag,type_line
0,Abomination of Gudul,Flying\nWhenever Abomination of Gudul deals co...,3d98af5f-7a0b-4a5a-b3e4-f3c9d150c993,discard-outlet,Creature — Horror
1,Academy Elite,Academy Elite enters the battlefield with X +1...,ba6c3c72-c014-45c6-a0b4-59eb9a65303e,discard-outlet,Creature — Human Wizard
2,Academy Raider,Intimidate (This creature can't be blocked exc...,75131d75-0703-44d0-b503-35190be8e66f,discard-outlet,Creature — Human Warrior
3,Akoum Flameseeker,"Cohort — {T}, Tap an untapped Ally you control...",efae637f-3232-46f2-9839-f3386e2f447d,discard-outlet,Creature — Human Shaman Ally
4,"Alexi, Zephyr Mage","{X}{U}, {T}, Discard two cards: Return X targe...",3f60de36-ed63-4d08-a012-fc16e91da46d,discard-outlet,Legendary Creature — Human Spellshaper


### Encoding the *tag*s
The *tag*s are the labels of the cards, i.e. the target values for our classification tasks. This is a categorical variable and there are various ways to encode it, but we can most probably stick with a simple label encoding:

In [53]:
# Fit the encoder on the 'tag' column and store the resulting integer codes
# in a new 'label' column; `le` is kept to map codes back to tags later.
le = LabelEncoder()
cards_df['label'] = le.fit_transform(cards_df['tag'])

# Preview the first ten rows to check the new column.
cards_df.head(10)

Unnamed: 0,name,oracle_text,oracleid,tag,type_line,label
0,Abomination of Gudul,Flying\nWhenever Abomination of Gudul deals co...,3d98af5f-7a0b-4a5a-b3e4-f3c9d150c993,discard-outlet,Creature — Horror,0
1,Academy Elite,Academy Elite enters the battlefield with X +1...,ba6c3c72-c014-45c6-a0b4-59eb9a65303e,discard-outlet,Creature — Human Wizard,0
2,Academy Raider,Intimidate (This creature can't be blocked exc...,75131d75-0703-44d0-b503-35190be8e66f,discard-outlet,Creature — Human Warrior,0
3,Akoum Flameseeker,"Cohort — {T}, Tap an untapped Ally you control...",efae637f-3232-46f2-9839-f3386e2f447d,discard-outlet,Creature — Human Shaman Ally,0
4,"Alexi, Zephyr Mage","{X}{U}, {T}, Discard two cards: Return X targe...",3f60de36-ed63-4d08-a012-fc16e91da46d,discard-outlet,Legendary Creature — Human Spellshaper,0
5,Ancient Excavation,Draw cards equal to the number of cards in you...,46db3dc2-9eb1-4be3-96f9-0013119fcd97,discard-outlet,Instant,0
6,Anje Falkenrath,"Haste\n{T}, Discard a card: Draw a card.\nWhen...",4dab6a96-4376-4aea-983d-406167993214,discard-outlet,Legendary Creature — Vampire,0
7,Anurid Brushhopper,Discard two cards: Exile Anurid Brushhopper. R...,bdecf59c-f8bd-4003-9058-685b70c08837,discard-outlet,Creature — Frog Beast,0
8,Apocalypse,Exile all permanents. You discard your hand.,82c8f5dd-563d-4fd0-bd43-7ee2001d3777,sweeper,Sorcery,5
9,Aquamoeba,Discard a card: Switch Aquamoeba's power and t...,55de434c-0fb9-4127-81b6-47c1c0b088fe,discard-outlet,Creature — Elemental Beast,0


Then, once we have predictions on the labels, we can use `le.inverse_transform(predictions)` to get the corresponding *tag* values.

## Testing different hyperparameters for classifying with tf-idf
Should we remove the stop words or not? Use only *oracle_text* or also *name* and *type_line*? Which hyperparameters for `TfidfTransformer` work best? To find out, we create a function that draws random values for all those hyperparameters, train a pipeline based on them, and record its performance.

But first we preprocess the card texts to get the columns we will need for this: *all_text* for the concatenation of all the texts on a card, and columns with the suffix *_filtered* for the text columns where the *STOP_WORDS* have been filtered out:

In [54]:
# Concatenate name, type line and oracle text of every card, separated by
# single spaces, into the 'all_text' column.
cards_df['all_text'] = cards_df.apply(
    lambda x: f"{x['name']} {x['type_line']} {x['oracle_text']}",
    axis=1,
)
                                             

In [59]:
def normalize(text):
    """Return ``text`` lower-cased, with punctuation removed and single spaces.

    The text is split on runs of non-word characters, any remaining
    ``string.punctuation`` character (e.g. the underscore) is stripped from
    each piece, empty pieces are discarded and the rest is joined with a
    single space.

    Args:
        text (str): raw text to normalize.

    Returns:
        str: the normalized text.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = (piece.translate(punctuation_table) for piece in re.split(r'\W+', text))
    return ' '.join(token.lower() for token in tokens if token != '')

def filter_stop_words(text):
    """Remove the ``STOP_WORDS`` entries from ``text``.

    The text is split on runs of non-word characters, every token is
    lower-cased, and the remaining tokens are joined with single spaces.

    The stop-word test is applied to the lower-cased token, so a token is
    filtered regardless of its original capitalisation. Empty tokens, which
    the split produces when the text starts or ends with a separator, are
    dropped instead of leaving stray spaces in the result. For text that
    already went through ``normalize`` the output is unchanged.

    Args:
        text (str): text to filter.

    Returns:
        str: lower-cased text without stop words.
    """
    tokens = (word.lower() for word in re.split(r'\W+', text))
    return ' '.join(token for token in tokens if token and token not in STOP_WORDS)


# Normalize both text columns in place and derive, for each of them, a
# '<column>_filtered' companion column without stop words.
for col in ('all_text', 'oracle_text'):
    cards_df[col] = cards_df[col].apply(normalize)
    cards_df[f'{col}_filtered'] = cards_df[col].apply(filter_stop_words)

# Drop every row that holds a missing value in any column.
# NOTE(review): this runs after normalize, which calls re.split on each value
# and therefore presumably fails on a missing text -- confirm the text columns
# never contain NaN.
cards_df = cards_df.dropna()
cards_df.head()

Unnamed: 0,name,oracle_text,oracleid,tag,type_line,label,all_text,all_text_filtered,oracle_text_filtered
0,Abomination of Gudul,flying whenever abomination of gudul deals com...,3d98af5f-7a0b-4a5a-b3e4-f3c9d150c993,discard-outlet,Creature — Horror,0,abomination of gudul creature horror flying wh...,abomination gudul creature horror flying abomi...,flying abomination gudul deals combat damage p...
1,Academy Elite,academy elite enters the battlefield with x 1 ...,ba6c3c72-c014-45c6-a0b4-59eb9a65303e,discard-outlet,Creature — Human Wizard,0,academy elite creature human wizard academy el...,academy elite creature human wizard academy el...,academy elite enters battlefield x 1 1 counter...
2,Academy Raider,intimidate this creature can t be blocked exce...,75131d75-0703-44d0-b503-35190be8e66f,discard-outlet,Creature — Human Warrior,0,academy raider creature human warrior intimida...,academy raider creature human warrior intimida...,intimidate creature t blocked artifact creatur...
3,Akoum Flameseeker,cohort t tap an untapped ally you control disc...,efae637f-3232-46f2-9839-f3386e2f447d,discard-outlet,Creature — Human Shaman Ally,0,akoum flameseeker creature human shaman ally c...,akoum flameseeker creature human shaman ally c...,cohort t tap untapped ally control discard car...
4,"Alexi, Zephyr Mage",x u t discard two cards return x target creatu...,3f60de36-ed63-4d08-a012-fc16e91da46d,discard-outlet,Legendary Creature — Human Spellshaper,0,alexi zephyr mage legendary creature human spe...,alexi zephyr mage legendary creature human spe...,x u t discard cards return x target creatures ...


In [63]:
def sample_random_hyperparameters():
    """Draw one random experiment configuration.

    Every value is picked independently with ``random.choice``.

    Returns:
        dict: the configuration, with the keys
            'text' (name of the text column to use),
            'filter_stop_words' (whether to use the stop-word-free column),
            'ngram_range' (bag-of-words n-gram range),
            'norm', 'use_idf', 'smooth_idf', 'sublinear_tf' (tf-idf options),
            'clf' (a fresh, unfitted classifier instance).
    """
    on_off = [True, False]
    return {
        # Which text and whether stop words are filtered out
        'text': choice(['all_text', 'oracle_text']),
        'filter_stop_words': choice(on_off),
        # Random bag-of-words hyperparameter
        'ngram_range': choice([(1, 1), (1, 2), (2, 2)]),
        # Random tf-idf hyperparameters
        'norm': choice(['l1', 'l2']),
        'use_idf': choice(on_off),
        'smooth_idf': choice(on_off),
        'sublinear_tf': choice(on_off),
        # Random classifier, with its default settings
        'clf': choice([
            RandomForestClassifier(),
            LinearSVC(),
            MultinomialNB(),
            LogisticRegression(),
        ]),
    }


def build_pipeline(hyperparameters):
    """Assemble the vectorizer -> tf-idf -> classifier pipeline.

    Args:
        hyperparameters (dict): configuration providing the keys
            'ngram_range', 'norm', 'use_idf', 'smooth_idf', 'sublinear_tf'
            and 'clf' (the classifier instance used as the last step).

    Returns:
        Pipeline: unfitted pipeline with the steps 'vect', 'tfidf' and 'clf'.
    """
    vectorizer = CountVectorizer(ngram_range=hyperparameters['ngram_range'])
    tfidf_options = {
        option: hyperparameters[option]
        for option in ('norm', 'use_idf', 'smooth_idf', 'sublinear_tf')
    }
    steps = [
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer(**tfidf_options)),
        ('clf', hyperparameters['clf']),
    ]
    return Pipeline(steps)


# The split is seeded (random_state=7), so it selects the same rows for every
# experiment whatever text column is used. Split the whole frame once instead
# of repeating the identical split on every iteration.
train_df, test_df = train_test_split(cards_df, test_size=0.33, random_state=7)
y_train, y_test = train_df['label'], test_df['label']

experiments = []
for _ in range(N_EXPERIMENTS):
    hyperparameters = sample_random_hyperparameters()
    # Use the stop-word-free variant of the chosen text column when requested.
    text_col = hyperparameters['text']
    if hyperparameters['filter_stop_words']:
        text_col = f'{text_col}_filtered'
    X_train, X_test = train_df[text_col], test_df[text_col]
    pipeline = build_pipeline(hyperparameters)
    pipeline.fit(X_train, y_train)
    y_predict = pipeline.predict(X_test)
    # Accuracy on the held-out rows, stored next to the configuration that
    # produced it.
    hyperparameters['perf'] = mean(y_predict == y_test)
    experiments.append(hyperparameters)

In [65]:
# Best score reached over all the experiments.
max_perf = max(run['perf'] for run in experiments)

In [66]:
# Show every configuration that reached the best score (ties included).
print([run for run in experiments if run['perf'] == max_perf])

[{'text': 'oracle_text', 'filter_stop_words': False, 'ngram_range': (1, 2), 'norm': 'l2', 'use_idf': False, 'smooth_idf': True, 'sublinear_tf': True, 'clf': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0), 'perf': 0.9309718437783833}, {'text': 'oracle_text', 'filter_stop_words': False, 'ngram_range': (1, 2), 'norm': 'l2', 'use_idf': False, 'smooth_idf': True, 'sublinear_tf': True, 'clf': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0), 'perf': 0.9309718437783833}, {'text': 'oracle_text', 'filter_stop_words': False, 'ngram_range': (1, 2), 'norm': 'l2', 'use_idf': False, 'smooth_idf': True, 'sublinear_tf': True, 'clf': LinearSVC(C