# Auto Labeller
This notebook allows you to test the application of a semi-automated labeller for your document.

In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np

from src.toolkit.plot import *
from src.toolkit.autolabel import *

from sklearn.naive_bayes import MultinomialNB

## Data Input
* corpus_path: path to the CSV file containing the text data (**sample format shown below**)
    * tabular format with a column (named by `text_column_name`) containing the short text to be labelled
* labels_path: path to the file containing labels for each category (**sample format shown below**)
    * tabular format with each column listing words related to one category
    * note that in this implementation these words need to exist in the data
* enriched_labels_path: path where the enriched labels are written
* labelled_path: path where the labelled dataset is written
* stopwords_path: path to the CSV file of stopwords used during text preprocessing
* text_column_name: name of the column in the corpus file that holds the text to be labelled

In [3]:
# Inputs: the corpus to label, the per-category label words, and the stopword list.
corpus_path = "data/movies500.csv"
labels_path = "data/labels.csv"
stopwords_path = "data/stopwords.csv"

# Outputs: where the enriched labels and the labelled dataset are written.
enriched_labels_path = "data/enriched_labels.csv"
labelled_path = "data/movies500_labelled.csv"

# Column of the corpus file that holds the text to be labelled.
text_column_name = "overview"

# Load the corpus, keeping only the text column as a one-column DataFrame,
# then load the label words.
data = pd.read_csv(corpus_path)[[text_column_name]]
labels = pd.read_csv(labels_path)

In [4]:
# Preview the first five rows of the loaded text column.
data.head(n=5)

Unnamed: 0,overview
0,A family wedding reignites the ancient feud be...
1,"Cheated on, mistreated and stepped on, the wom..."
2,"Obsessive master thief, Neil McCauley leads a ..."
3,An ugly duckling having undergone a remarkable...
4,"A mischievous young boy, Tom Sawyer, witnesses..."


In [5]:
# Preview the first five rows of label words (one column per category).
labels.head(n=5)

Unnamed: 0,Action,Romance,Science Fiction,Thriller,War,Western
0,terrorist,college,science,murder,ii,criminal
1,drug,girlfriend,,criminal,war,
2,operation,french,,crime,,
3,cia,romantic,,detective,,
4,undercover,marry,,killer,,


## Data Preprocessing

* Prepare the text. Operations include removing special characters, digits, and stopwords.
* Join bigrams that appear frequently into one word.

In [6]:
# Raw text column, selected by the configured `text_column_name`.
corpus = data[text_column_name]

# Text preprocessing; the stopword list is read from `stopwords_path`.
preprocessed_corpus = corpus_preprocess(corpus=corpus, stopwords_path=stopwords_path)

# Replace bigrams and write the result back to the configured text column.
# This was previously hard-coded as 'overview', which would add a stray
# 'overview' column instead of updating the text column whenever
# `text_column_name` is set to anything else.
# NOTE(review): min_df/max_df are presumably document-frequency bounds for
# bigram candidates -- confirm against src.toolkit.autolabel.
data[text_column_name] = corpus_replace_bigrams(min_df=50, max_df=500, corpus=preprocessed_corpus)

## Model Training
* Identifies key labels and enriches dictionary
* **Note:** The quality of the enriched labels below is affected by the skewed training set, which contains only a limited number of examples for some categories (e.g. Science Fiction).

In [7]:
# Show up to ten rows of the original label words, for comparison with the
# enriched labels produced by training.
labels.head(n=10)

Unnamed: 0,Action,Romance,Science Fiction,Thriller,War,Western
0,terrorist,college,science,murder,ii,criminal
1,drug,girlfriend,,criminal,war,
2,operation,french,,crime,,
3,cia,romantic,,detective,,
4,undercover,marry,,killer,,


In [8]:
# Build the labeller from the label words, the raw text column (`corpus`) and
# the DataFrame holding the preprocessed text (`data`).
# NOTE(review): the exact role of each argument is defined in
# src.toolkit.autolabel -- confirm there.
autoLabeller = AutoLabeller(labels, corpus, data)
# train() returns the enriched label table (one column per category).
enriched_labels = autoLabeller.train()

# Persist the enriched labels, then display the first ten rows.
enriched_labels.to_csv(enriched_labels_path)
enriched_labels.head(10)

Unnamed: 0,Action,Romance,Science Fiction,Thriller,War,Western
0,assignment,attend,terrorist,criminal,across,assignment
1,terrorist,challenge,washington,cia,bodyguard,terrorist
2,clean,girlfriend,demand,dealer,girlfriend,criminal
3,cia,farm,modern,private,farm,cia
4,district,student,comedy,lord,student,private
5,illegal,professional,havoc,track,paris,discovers
6,lord,havoc,science,police,war,identity
7,police,identity,center,powerful,best,best
8,living,science,view,crime,university,war
9,stolen,track,powerful,detective,singer,throw


## Model Application

In [9]:
# Multinomial Naive Bayes classifier from scikit-learn, with default settings.
mnb = MultinomialNB()
# Hand the classifier to the labeller; the result is kept in `ypred` and is
# shown below with one column per category.
# NOTE(review): how apply() fits and uses the classifier is defined in
# src.toolkit.autolabel -- confirm there.
ypred = autoLabeller.apply(mnb)

## Output Data
* CSV file containing the predicted labels for each row

In [10]:
# Write the predictions to `labelled_path`, then display the first rows.
ypred.to_csv(labelled_path)
ypred.head()

Unnamed: 0,Action,Romance,Science Fiction,Thriller,War,Western
0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0
