# Interview Text Classification (NLP)

## Imports

In [62]:
import fasttext
import pandas as pd
import numpy as np
# from sklearn.model_selection import train_test_split

## Pre-processing Data
Impute 0 where __Choice__ is NA. Prepend "\_\_label\_\_" to the values of the __Choice__ column so that they can be read by the supervised learner. Randomly sample 300 observations from each class. Split the data into approximately 75% training and 25% validation (each row is assigned at random, so the proportions are not exact). Subset the data to include only __Choice__ and __Question__. Save the subsetted data to the corresponding CSV files.

In [63]:
# Seed for the random train/validation split, so that Restart & Run All
# writes the same two files (and therefore the same metrics) every time.
SPLIT_SEED = 0

# Load the raw question set.
textData = pd.read_csv("questionset2.csv")

# Treat empty questions as missing. The result is assigned back rather than
# calling replace(..., inplace=True) on the column selection: that chained
# in-place form acts on an intermediate object and is not guaranteed to
# update textData (it is a no-op under pandas copy-on-write).
textData["Question"] = textData["Question"].replace("", np.nan)

# Build fastText labels: a missing Choice becomes class 0, Choice == 1 becomes class 1.
# NOTE(review): rows whose Choice is any other value keep that value and are
# therefore picked up by neither sample below - confirm this is intended.
textData["Choice"] = textData["Choice"].fillna("__label__0")
textData.loc[textData.Choice == 1, "Choice"] = "__label__1"

# Rows without a question cannot be used for training or evaluation.
textData = textData.dropna(subset=["Question"])

# Balance the classes: 300 rows of each, drawn without replacement.
label0 = textData[textData["Choice"] == "__label__0"].sample(n=300, random_state=1, replace=False)
label1 = textData[textData["Choice"] == "__label__1"].sample(n=300, random_state=2, replace=False)

sampled = pd.concat([label0, label1])

# Boolean mask that sends each row to the training set with probability 0.75.
index = np.random.default_rng(SPLIT_SEED).random(len(sampled)) < .75

train = sampled[index][["Choice", "Question"]]
test = sampled[~index][["Choice", "Question"]]

# The index column is kept because the shell step that follows drops the first
# field of every line. The header row is omitted so that the column names do
# not end up as a label-less "Choice Question" line in the fastText input.
train.to_csv("question_train.txt", header=False)
test.to_csv("question_test.txt", header=False)

Remove the commas, row indices, and quotation marks from the CSV files, and save the result back to the same .txt files.

In [64]:
%%bash
# Turn the CSV exports into plain "label text" lines, rewriting each file in place:
#   1. replace every comma with a space,
#   2. drop the first space-separated field (the row index column),
#   3. delete all double-quote characters.
# pipefail makes a failure in any stage suppress the final mv, so a file is
# only replaced when the whole transformation succeeded.
set -o pipefail

for csv_file in question_train.txt question_test.txt; do
    tr ',' ' ' < "$csv_file" \
        | cut -d ' ' -f 2- \
        | tr -d '"' > qt.txt \
        && mv qt.txt "$csv_file"
done

## Training
Train the model using the processed training set. 

In [65]:
# Baseline: fit a supervised fastText classifier on the training file,
# passing no hyperparameters so the library defaults are used.
initial_model = fasttext.train_supervised(input="question_train.txt")

#### Sample Prediction

In [66]:
# Spot-check the baseline model on a single example question.
# The commas of the question are written as spaces here, matching the
# comma-to-space replacement applied to the training file.
text = "If there are 10 seats on a plane  and you are the 10th person to enter the plane  what are the chances you sit in the correct seat?"
# Displays the predicted label(s) together with their probabilities.
initial_model.predict(text)

(('__label__0',), array([0.50216097]))

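Classified as 0 (`__label__0`), with a probability of about 0.50, i.e. the initial model is essentially undecided on this question.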

#### Initial Model Evaluation

In [67]:
# Evaluate the baseline model on the held-out file.
# The displayed tuple is (number of samples, precision, recall).
initial_model.test("question_test.txt")

(159, 0.5786163522012578, 0.5786163522012578)

(Samples: 159, Precision: 0.579, Recall: 0.579)

## Parameter Tuning

In [68]:
# Retrain and let fastText's autotune search for hyperparameters that score
# best on the validation file.
# NOTE(review): the validation file used for tuning here is the same file the
# tuned model is evaluated on later in this notebook, so the reported
# precision/recall are optimistic; a separate held-out set would give an
# unbiased estimate.
model = fasttext.train_supervised(
    input="question_train.txt",
    autotuneValidationFile="question_test.txt",
)

#### Sample Prediction

In [69]:
# Spot-check the tuned model on the same example question used for the
# baseline model, so the two predictions can be compared directly.
text = "If there are 10 seats on a plane  and you are the 10th person to enter the plane  what are the chances you sit in the correct seat?"
# Displays the predicted label(s) together with their probabilities.
model.predict(text)

(('__label__1',), array([0.9995327]))

#### Tuned Model Evaluation

In [71]:
# Evaluate the tuned model on the held-out file.
# The displayed tuple is (number of samples, precision, recall).
# NOTE(review): this is the same file that was passed as autotuneValidationFile
# when the model was trained, so these figures are likely optimistic.
model.test("question_test.txt")

(159, 0.8679245283018868, 0.8679245283018868)

(Samples: 159, Precision: 0.868, Recall: 0.868)