# Auto Labeller
Applied to news dataset to evaluate model performance
* Uses the full dataset
* Uses the small label dictionary

In [1]:
# Standard Libary Imports
import pandas as pd
import numpy as np
import json

from src.toolkit.autolabel import Preprocessor, AutoLabeller, check_labels
from src.toolkit.autolabel import recommend_words, Evaluator

from sklearn.naive_bayes import MultinomialNB

In [2]:
# file path to text data
text_path = "data/news/news.csv"
labelled_path = "data/news/news_labelled_small_labels.csv"  # INPUT YOUR PREFERED OUTPUT PATH
score_path = "data/news/news_score_small_labels.csv"
labels_path = "data/news/news_labels_small.csv"  # INPUT PATH TO LABELS DICTIONARY

stopwords_path = "data/stopwords.csv"  ## ADJUST IF YOU HAVE CUSTOM STOPWORDS
text_column_name = "content"

news = pd.read_csv(text_path)
data = news[[text_column_name]]

In [3]:
data.head()

Unnamed: 0,content
0,Unions representing workers at Turner Newall...
1,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,AP - A company founded by a chemistry research...
3,AP - It's barely dawn when Mike Fitzpatrick st...
4,AP - Southern California's smog-fighting agenc...


In [4]:
corpus = data[text_column_name]

preprocessor = Preprocessor()

# Text Preprocessing
preprocessed_corpus = preprocessor.corpus_preprocess(corpus=corpus, stopwords_path=stopwords_path)

# Replace bigrams
data[text_column_name] = preprocessor.corpus_replace_bigrams(corpus=preprocessed_corpus, min_df=50, max_df=500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## Recommended themes and words

In [5]:
n_words = 20  # CHANGE THE NUMBER OF WORDS RECOMMENDED (IF YOU WANT TO)

# Returns a matrix of recommended words
topic_model, dtm, best_n = recommend_words(corpus) 
topic_model.show_topics(dtm=dtm, best_n=best_n, n_words=n_words)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,kill,people,baghdad,iraq,least,iraqi,bomb,police,wound,car,attack,city,official_say,soldier,injure,insurgent,explode,force,troop,three
1,gt,lt,font,strong,color,size,sans,arial,serif,verdana,helvetica,face,br,washington,http,nobr,em,washingtonpostcom,fff,analysis
2,quot,get,call,go,tell,player,look,word,news,spokesman,merriamwebster,many,like,brother,microsoft,dictionary,see,death,come,hit
3,high,stock,price,oil_price,oil,low,record,crude,barrel,investor,dollar,rise,market,energy,new_reuters,supply,rate,economic,london,concern
4,season,team,victory,lead,sunday,night,run,second,saturday,play,cup,final,point,first,last,champion,championship,beat,score,coach
5,service,software,internet,announce,computer,search,technology,microsoft,use,system,phone,launch,mobile,user,release,version,network,product,business,music
6,lt_gt,lt_href,http_wwwinvestorreuterscomfullquoteaspx,targetstocksquickinfofullquote_gt,new_reuters,inc,co,chicago,quarterly,business,earnings,profit,billion,maker,company_say,would,buy,drug,stock,percent
7,government,united,would,talk,country,european,leader,nuclear,group,state,afp,official,washington,prime_minister,federal,former,could,election,international,nation
8,athens,olympic,gold,medal,american,men,second,greece,woman,meter,champion,united,olympics,basketball,time,become,final,silver,gymnastics,competition
9,israeli,palestinian,gaza,israel,army,kill,militant,leader,military,camp,refugee,fire,prime_minister,jerusalem,troop,settlement,security,rocket,egyptian,border


In [6]:
labels = pd.read_csv(labels_path)
labels = check_labels(news[[text_column_name]], labels)
labels.head(5)  

olympic is not in the input corpus. It is removed from dictionary
iraq is not in the input corpus. It is removed from dictionary


Unnamed: 0,Business,SciTech,Sports,World
0,stock,software,,
1,price,internet,champion,soldier


In [7]:
labels.head(7)

Unnamed: 0,Business,SciTech,Sports,World
0,stock,software,,
1,price,internet,champion,soldier


In [8]:
autoLabeller = AutoLabeller(labels.head(6), corpus, data)
enriched_labels = autoLabeller.train()

enriched_labels  ## Enriched suggested labels

Unnamed: 0,Business,SciTech,Sports,World
0,profit,service,chelsea,military
1,oil_price,user,defeat,insurgent
2,crude,online,defend,wound
3,stock,phone,arsenal,injure
4,average,email,streak,iraqi
5,new_reuters,web,barcelona,militant
6,worry,microsoft,inter,force
7,low,application,captain,kill
8,energy,search,semifinal,shiite
9,sharp,aol,side,baghdad


In [9]:
mnb = MultinomialNB()
ypred = autoLabeller.apply(mnb, 'content')
ypred.to_csv(labelled_path)

In [10]:
evaluator = Evaluator()
score = evaluator.evaluate_predictions(news, ypred, labels, split=0.2, random_state=42)
score.to_csv(score_path, index=False)
score

Unnamed: 0,Business,SciTech,Sports,World
Precision,0.5829,0.5991,0.6657,0.6712
Recall,0.6072,0.6366,0.6041,0.6568
F1-score,0.5948,0.6173,0.6334,0.664


In [11]:
evaluator.compare_to_other_models(score, news, labels)

Unnamed: 0,Automatic Labeling,MLP Neural Network,Gradient Boosted Trees,Random Forest
Precision,0.63,0.804,0.814,0.872
Recall,0.626,0.681,0.454,0.511
F1-score,0.627,0.736,0.583,0.644
