# ***Text Classifiers in Python***
-------------------------

In [1]:
# Toolkits for supervised text classification:
# Scikit-Learn
# NLTK

# ***Naive Bayes Classifiers in Scikit-Learn***
---------------------------

In [9]:
import nltk
from sklearn import naive_bayes
from sklearn.metrics import f1_score, accuracy_score

In [4]:
# Instantiate a multinomial naive Bayes classifier.
# The multinomial model is used here instead of the Bernoulli model.

multinomial_nbc = naive_bayes.MultinomialNB()
# Then (illustrative only; train_x, train_labels and test_x are placeholder names):
# multinomial_nbc.fit(train_x, train_labels)
# prediction = multinomial_nbc.predict(test_x)

# If the test data is labelled,
# one can check how well the classifier has done:
# f1_score(test_labels, prediction, average = "micro")

# ***SVM classifiers in Scikit-Learn***
--------------------------

In [3]:
from sklearn import svm

In [4]:
# Linear classifiers work well with text data.
# C is the soft-margin parameter; its default value is 1.
# The default kernel is RBF, so the linear kernel is requested explicitly.

svm_classifier = svm.SVC(C=0.1, kernel="linear")

# Then (illustrative only; train_x, train_labels and test_x are placeholder names):
# svm_classifier.fit(train_x, train_labels)
# svm_classifier.predict(test_x)

# ***Model Selection***
_____________________________

In [7]:
# 2 phases of supervised learning
# Training phase
# Inference phase

# Labelled data -> split into train_data and validation_data (the hold out).
# Unlabelled test data -> used for prediction.
# One could also use labelled data as test data (e.g. the hold out with its labels withheld),
# but typically the test dataset is not labelled.

In [8]:
# One can use the labelled training data to test how well models are performing, by holding part of it out and comparing the predictions to the held-out labels.
# Or to tune model parameters.

# e.g. to decide which value is best for C

In [9]:
# Two ways of model selection.

# One -> having a hold out data to measure performance
# Two -> cross validation

In [2]:
from sklearn import model_selection

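# train_x, test_x, train_labs, test_labs = model_selection.train_test_split(train_data, train_labels, test_size = 0.35, random_state = 0)
# (Both the data and its labels must be passed in to get four outputs back.)
# test_size is the fraction of the whole data to be used as test data.
# random_state seeds the shuffle applied to the data before splitting, which makes the split reproducible.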

# In doing this, one loses a significant portion of the training data to the hold out.
# Including this hold out in the training data might improve the model's performance.
# The hold out is of no use to the model, only to us:
# it is used exclusively to measure model performance or to tune model parameters.

In [1]:
# Cross validation.

In [5]:
# A 5-fold cross validation will look something like this:
# the training data gets split into 5 parts (called folds), and
# 5 models are trained, each with one fold as the hold out and the remaining 4 folds as training data.
# e.g. train on parts 0 - 3 and test on 4 -> model 1
# train on parts 1 - 4 and test on 0 -> model 2
# etc.

# Every fold serves as both test and train data (not simultaneously).
# There are 5 ways of splitting the data.
# Then average the 5 results on the hold out data to evaluate the model's performance.
# It's common practice to use 10-fold cross validation.

# train_x, test_x, train_labs, test_labs = model_selection.train_test_split(train_data, train_labels, random_state = 0, test_size = 0.25)
# model_selection.cross_val_predict(svm_classifier, train_x, train_labs, cv = 10)

# It is also very common to run cross-validation multiple times to minimize variance in the results.

# ***Supervised Text Classification in NLTK***
---------------------------------

In [6]:
# NLTK has some text classification algorithms.
# Naive Bayes Classifier
# Decision Tree Classifier
# Conditional Exponential Classifier
# Maxent Classifier
# Weka Classifier
# NLTK Classifier

In [7]:
from nltk import NaiveBayesClassifier

# train() expects labelled featuresets, i.e. a list of (featureset, label) pairs.
# train_x is a placeholder name for that list.
nbc = NaiveBayesClassifier.train(train_x)

# If there is just one test instance (a single featureset)
nbc.classify(test_x)

# If there are many test instances: classify_many() takes a single list of
# featuresets, not one positional argument per featureset.
# List every featureset from test_x1 through test_xn inside the brackets.
nbc.classify_many([test_x1, test_x2, test_xn])

In [None]:
# Accuracy of the model's predictions.
# NOTE(review): accuracy() scores predictions against gold labels, so test_x here
# presumably has to be a list of (featureset, label) pairs — confirm against the NLTK docs.

nltk.classify.util.accuracy(nbc, test_x)

In [None]:
# Get the labels known to the trained classifier.

nbc.labels()

In [None]:
# Show the top few most informative features in the classification task.
# Particularly useful with naive Bayes classifiers (NBCs).

nbc.show_most_informative_features()

# ***Interfacing Scikit-Learn classifiers from NLTK***

In [12]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import (MultinomialNB, BernoulliNB)
from sklearn.svm import SVC

In [None]:
# NLTK's SklearnClassifier is essentially a wrapper around scikit-learn estimators.
# Here it wraps a Bernoulli naive Bayes model and trains it through the NLTK interface.

bnli_classifier = SklearnClassifier(BernoulliNB()).train(train_x)

In [None]:
# Hyperparameters are passed to SVC() itself. SklearnClassifier only wraps the
# already-configured estimator; it does not accept kernel / C keyword arguments,
# so passing them to the wrapper raises a TypeError.

svm_classifier = SklearnClassifier(SVC(kernel="linear", C=0.5)).train(train_x)