# NLP Classifier to predict the intent / audience of a question

The aim of this is a binary classifier. Given an unseen set of statements and correctly decipher if they are movie questions or stackoverflow posts

## Libraries

In [1]:
import pandas as pd
from sklearn.utils import shuffle
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import f1_score

# Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

np.random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Colaboratory Specific Code

This mounts your Google Drive folder so you can access it from Colaboratory

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

We also want to change the working directory so that the file references work. Here I've places the files for this class inside a folder called 'class_1' that itself is inside a folder called 'NLP_class'.

In [3]:
# %cd drive/My\ Drive/NLP_class/0719_question
# !ls

## Pre-process the data

### Inspect Data
First we want to load in the data so that we can see what it looks like

In [4]:
dialogues = pd.read_csv('dialogues.tsv', sep='\t')
tagged = pd.read_csv('tagged_posts.tsv', sep='\t')

In [5]:
dialogues.head(2)

Unnamed: 0,text,tag
0,Okay -- you're gonna need to learn how to lie.,dialogue
1,I'm kidding. You know how sometimes you just ...,dialogue


In [6]:
tagged.head(2)

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#


### Create base dataset

Now we can drop the 'tag' columns of both and assign new labels. 

We set a movie question to label 0 and a stack overflow post to label 1

We also need to drop the 'post_id' column of the stack overflow data and rename 'title' to 'text'

In [7]:
movie = dialogues.drop(columns=['tag'])
movie['label'] = int(0)
movie.head(2)

Unnamed: 0,text,label
0,Okay -- you're gonna need to learn how to lie.,0
1,I'm kidding. You know how sometimes you just ...,0


In [8]:
stack = tagged.drop(columns=['post_id','tag'])
stack['label'] = int(1)
stack = stack.rename(columns={"title": "text"})
stack.head(2)

Unnamed: 0,text,label
0,Calculate age in C#,1
1,Filling a DataSet or DataTable from a LINQ que...,1


We now combine the two different data sets into one single data set.

In [9]:
data = movie.append(stack, ignore_index=True)
data.head()

Unnamed: 0,text,label
0,Okay -- you're gonna need to learn how to lie.,0
1,I'm kidding. You know how sometimes you just ...,0
2,Like my fear of wearing pastels?,0
3,I figured you'd get to the good stuff eventually.,0
4,Thank God! If I had to hear one more story ab...,0


### Clean the text

In [10]:
# There is a problem with this codes as it is possible for all words to be removed leaving a null value
# This is solved in line 25 with the if statement

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = re.sub(REPLACE_BY_SPACE_RE, " ", text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(BAD_SYMBOLS_RE, "", text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = " " + text + " "
    for sw in STOPWORDS:
        text = text.replace(" "+sw+" ", " ") # delete stopwords from text
    text = re.sub('[ ][ ]+', " ", text)

    #     print("text:", text)
#     print(len(text))
    
    if len(text) > 1: # only run if there are words present
      if text[0] == ' ':
          text = text[1:]
      if text[-1] == ' ':
          text = text[:-1]
        
    return text
  

In [11]:
data.shape

(615477, 2)

Here we can see that certain lines are entirely made up of stopwords and symbols. We need to account for this.

In [12]:
print("This line: ", data.loc[82457, 'text'])
print("\nBecomes: ", text_prepare(data.loc[82457, 'text']))

This line:  Do you have to do that?

Becomes:   


Now we run the cleaning function on the whole database.

In [13]:
%time cleaned_text = [text_prepare(x) for x in data['text']]

Wall time: 32.4 s


In [14]:
clean_text = pd.DataFrame({'clean_data': cleaned_text})
clean_text.head()

Unnamed: 0,clean_data
0,okay youre gonna need learn lie
1,im kidding know sometimes become persona dont ...
2,like fear wearing pastels
3,figured youd get good stuff eventually
4,thank god hear one story coiffure


In [15]:
clean = pd.concat([clean_text, data], axis = 1, ignore_index=True)
clean = shuffle(clean)
clean.head()

Unnamed: 0,0,1,2
378098,syntax error insert statement c# oledb,Syntax error in INSERT INTO statement in c# ol...,1
274212,applying methods object private variables java...,applying methods to object and private variabl...,1
527013,xampp openssl errors calling openssl_pkey_new,xampp openssl errors when calling openssl_pkey...,1
25072,let buyer beware,Let the buyer beware.,0
491370,read write locking confusion,Read/Write locking confusion,1


In [16]:
clean = clean.drop(columns=[1])
clean.head(20)

Unnamed: 0,0,2
378098,syntax error insert statement c# oledb,1
274212,applying methods object private variables java...,1
527013,xampp openssl errors calling openssl_pkey_new,1
25072,let buyer beware,0
491370,read write locking confusion,1
176289,im alright,0
323152,get last segment regular expression,1
588685,wcf service completely locked,1
102626,ask theres nothing new coke,0
380088,complex example project java desktopstyle gui,1


Check that there are only 2 labels (binary) and see the amount of each.

In [17]:
unique_labels = clean.groupby(2).nunique()
unique_labels.head()

Unnamed: 0_level_0,0,2
2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,206948,1
1,394520,1


### Train Test Validation Split

In [18]:
clean_data = clean[0]
clean_data.head()

378098               syntax error insert statement c# oledb
274212    applying methods object private variables java...
527013        xampp openssl errors calling openssl_pkey_new
25072                                      let buyer beware
491370                         read write locking confusion
Name: 0, dtype: object

In [19]:
clean_labels = clean[2]
clean_labels.head()

378098    1
274212    1
527013    1
25072     0
491370    1
Name: 2, dtype: int64

First we split off the validation set as 20% of the overall data set.

In [20]:
df_data, X_val, df_labels, y_val = train_test_split(
    clean_data, clean_labels, test_size=0.2, random_state=42, shuffle=False)

Then we split the remaining into training and test data.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.25,
                                                        random_state=42,
                                                        shuffle=False)

The end result is 60% training data, 20% test data and 20% validation data

In [22]:
print("Shape of X_train", X_train.shape)
print("Shape of y_train", X_train.shape)
print("Shape of X_test", X_test.shape)
print("Shape of y_test", y_test.shape)
print("Shape of X_val", X_val.shape)
print("Shape of y_val", y_val.shape)

Shape of X_train (369285,)
Shape of y_train (369285,)
Shape of X_test (123096,)
Shape of y_test (123096,)
Shape of X_val (123096,)
Shape of y_val (123096,)


### Apply TF-IDF Weighting

We first need to learn the words for the dictionary from our training data.

In [23]:
vectorizer = TfidfVectorizer(norm=u'l1', token_pattern='(\S+)', min_df=5, max_df=0.9, ngram_range=(1,2))

%time vectorizer.fit_transform(X_train)
# print(vectorizer.get_feature_names(10))

Wall time: 8.03 s


<369285x65392 sparse matrix of type '<class 'numpy.float64'>'
	with 2693739 stored elements in Compressed Sparse Row format>

Apply this learned TF-IDF transform to our different dataframes

In [24]:
%time train_tf = vectorizer.transform(X_train)
%time test_tf = vectorizer.transform(X_test)
%time val_tf = vectorizer.transform(X_val)

Wall time: 3.98 s
Wall time: 1.3 s
Wall time: 1.3 s


We can see that the different dataframes have been transformed intt Compressed Sparse Row format data. This is very important as sparsity has a big impact on accuracy.

In [25]:
print("Shape of train_tf", train_tf.shape)

print("Shape of test_tf", test_tf.shape)

print("Shape of val_tf", val_tf.shape)

Shape of train_tf (369285, 65392)
Shape of test_tf (123096, 65392)
Shape of val_tf (123096, 65392)


### Reduce Dimensions using SVD

In [56]:
svd = TruncatedSVD(n_components=650)

In [57]:
train_sv = svd.fit_transform(train_tf)
test_sv = svd.fit_transform(test_tf)
val_sv = svd.fit_transform(val_tf)

In [58]:
print("Shape of train_sv", train_sv.shape)

print("Shape of test_sv", test_sv.shape)

print("Shape of val_sv", val_sv.shape)

Shape of train_sv (369285, 650)
Shape of test_sv (123096, 650)
Shape of val_sv (123096, 650)


## Classify CSR

First we will attempt to classify the full CSR dataset

### Naive Bayes
Interestingly this crashes and causes the runtime to reset as the dimensionality is too high.

In [29]:
# nb = GaussianNB()
# nb.fit(train_tf.todense(), y_train.astype(int))

In [30]:
# nb_score_test = nb.score(test_tf.todense(), y_test.astype(int))
# nb_score_test

### Linear Regression

In [31]:
linr = linear_model.LinearRegression(n_jobs = -1)
%time linr.fit(train_tf, y_train.astype(int))

Wall time: 38.3 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [32]:
%time linr_score_test = linr.score(test_tf, y_test.astype(int))
linr_score_test*100

Wall time: 7.51 ms


92.4105510072296

### Logistic Regression

In [33]:
lr = linear_model.LogisticRegression(C=1.0, penalty='l2', solver = 'saga', n_jobs=-1)
%time lr.fit(train_tf, y_train)

Wall time: 4.28 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
%time lr_score_test = lr.score(test_tf, y_test.astype(int))
lr_score_test*100

Wall time: 16 ms


98.28589068694352

### Support Vector Machine
The high dimensionality means that this takes a ridiculous amount of time to run.

In [35]:
# clf = svm.SVC(kernel='linear', C=1, random_state=42)
# %time clf.fit(train_tf, y_train.astype(int))

In [36]:
# %time clf_score = clf.score(test_tf, y_test.astype(int))
# print("SVM accuracy on test:\t %f" % clf_score)

### Random Forest
The high dimensionality means that this takes a ridiculous amount of time to run.

In [37]:
rf = RandomForestClassifier(criterion='gini', max_depth=5, 
                               min_samples_leaf=5, min_samples_split=2, 
                               n_estimators = 220, oob_score=True, 
                               max_features=0.5, n_jobs = -1, random_state=42)

%time rf.fit(train_tf, y_train.astype(int))

Wall time: 1min 7s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [38]:
%time rf_score = rf.score(test_tf, y_test.astype(int))
print("RF accuracy on test:\t %f" % rf_score)

Wall time: 544 ms
RF accuracy on test:	 0.753030


## Classify Reduced
We will now classify the dataset that was reduced to 50 dimensions.

### Naive Bayes
With the reduced dimensionality we can use a Naive Bayes classifier

In [59]:
nb_sv = GaussianNB()
%time nb_sv.fit(train_sv, y_train.astype(int))

Wall time: 4.54 s


GaussianNB(priors=None, var_smoothing=1e-09)

In [60]:
%time nb_score_test_sv = nb_sv.score(test_sv, y_test.astype(int))
nb_score_test_sv*100

Wall time: 1.81 s


49.83590043543251

### Linear Regression

In [61]:
linr_sv = linear_model.LinearRegression(n_jobs = -1)
%time linr_sv.fit(train_sv, y_train.astype(int))

Wall time: 15.3 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [62]:
%time linr_score_test_sv = linr_sv.score(test_sv, y_test.astype(int))
linr_score_test_sv*100

Wall time: 156 ms


-134.48372768948832

### Logistic Regression

In [63]:
lr_sv = linear_model.LogisticRegression(C=1.0, penalty='l2', solver = 'saga', n_jobs=-1)
%time lr_sv.fit(train_sv, y_train)

Wall time: 32.4 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
%time lr_score_test_sv = lr_sv.score(test_sv, y_test.astype(int))
lr_score_test_sv*100

Wall time: 152 ms


36.95083512055631

### Support Vector Machine
Even at low dimension this takes a lot of time to run.

In [45]:
# clf_sv = svm.SVC(kernel='linear', C=1, random_state=42)
# %time clf_sv.fit(train_sv, y_train.astype(int))

In [46]:
# %time clf_score_sv = clf_sv.score(test_sv, y_test.astype(int))
# print("SVM accuracy on test:\t %f" % clf_score_sv)

### Random Forest

In [65]:
rf_sv = RandomForestClassifier(criterion='gini', max_depth=5, 
                               min_samples_leaf=5, min_samples_split=2, 
                               n_estimators = 220, oob_score=True, 
                               max_features=0.5, n_jobs = -1, random_state=42)

%time rf_sv.fit(train_sv, y_train.astype(int))

Wall time: 30min 10s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=220, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [66]:
%time rf_score_sv = rf_sv.score(test_sv, y_test.astype(int))
print("RF accuracy on test:\t %f" % rf_score_sv)

Wall time: 898 ms
RF accuracy on test:	 0.170509


### Multi-layer Perceptron

In [49]:
NUM_CLASSES = 2

def build_mlp(input, NUM_CLASSES):
    model = keras.Sequential([
        keras.layers.Dense(8, activation='relu', input_shape=(input.shape[1],)),
        keras.layers.Dense(5, activation='relu'),
#         keras.layers.Dense(3, activation='relu'),
        keras.layers.Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', 
                  metrics=['mae', 'acc'])

    model.summary()

    return model


def train_mlp(dataframe, labels, model, checkpoint_path, EPOCHS):
    # Limit the training when there are multiple epochs with little change loss
    # The patience parameter is the amount of epochs to check for improvement
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                               patience=4000)

    # Create checkpoint callback
    cp_callback = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                  save_weights_only=True,
                                                  verbose=0)

    # Track the training statistics
    history = model.fit(dataframe, labels, epochs=EPOCHS,
                        validation_split=0.2, verbose=0,
                        callbacks=[early_stop, EpochTrack(), cp_callback])

    print("\nEpochs: {}".format(len(history.epoch)))
    plot_history(history)

    return model

### Check Scores on Validation Set

In [50]:
linr_score_val = linr.score(val_tf, y_val.astype(int))
linr_score_val

0.9262120455079297

In [51]:
lr_score_val = lr.score(val_tf, y_val.astype(int))
lr_score_val

0.9833544550594658

In [52]:
clf_score_val = clf.score(X_val, y_val.astype(int))
clf_score_val

NameError: name 'clf' is not defined