# ***Text Classifiers in Python***
-------------------------

In [1]:
# Toolkits for supervised text classification:
# Scikit-Learn
# NLTK

# ***Naive Bayes Classifiers in Scikit-Learn***
---------------------------

In [3]:
from sklearn import naive_bayes
from sklearn.metrics import f1_score, accuracy_score

In [4]:
# Instantiate a multinomial naive Bayes classifier.
# We use the Multinomial model instead of the Bernoulli model.

multinomial_nbc = naive_bayes.MultinomialNB()
# Then:
# multinomial_nbc.fit(train_x, train_labels)
# prediction = multinomial_nbc.predict(test_x)

# If the test data is labelled,
# one can check how well the classifier has done:
# f1_score(test_labels, prediction, average = "micro")

# ***SVM classifiers in Scikit-Learn***
--------------------------

In [5]:
from sklearn import svm

In [6]:
# Linear classifiers work well with text data.
# C is the soft-margin parameter.
# The default kernel is RBF.
# The default for C is 1.
# Below, both defaults are overridden: a linear kernel with C = 0.1.

svm_classifier = svm.SVC(kernel = "linear", C = 0.1)

# Then:
# svm_classifier.fit(train_x, train_labels)
# svm_classifier.predict(test_x)

# ***Model Selection***
_____________________________

In [7]:
# 2 phases of supervised learning
# Training phase
# Inference phase

# labelled data -> Split into train_data and validation_data (hold out)
# then unlabelled test data -> used for prediction
# one could also use labelled test data (e.g. the hold-out data, with its labels withheld from the model)
# typically the test dataset is not labelled.

In [8]:
# One can use the labelled data to test how well models are performing by comparing their predictions on the hold-out data to the hold-out labels.
# Or to tune model parameters.

# e.g. To decide which value is best for C

In [9]:
# Two ways of model selection.

# One -> having a hold out data to measure performance
# Two -> cross validation

In [10]:
from sklearn import model_selection

# train_x, test_x, train_labs, test_labs = model_selection.train_test_split(train_data, train_labels, test_size = 0.35, random_state = 0)
# test_size is the fraction of the whole data to be used as test data.
# random_state seeds the shuffling that is applied before splitting, so the split is reproducible.

# In doing this, one loses a significant portion of the training data to hold out.
# Including this hold out in the training data might improve the model's performance.
# The hold out is of no use to the model but to us
# It is used exclusively to measure model performance or to tune model parameters.

In [None]:
# Cross validation.