# Step 1: Data PreProcessing

In [1]:
import gensim

In [45]:
#READING A DATA FILE (TAGS range from 0 to len(data))
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def read_corpus(fname, tokens_only=False):
    with open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


#THIS CREATES A TRAINING CORPUS
train_corpus = list(read_corpus("/train.txt"))

In [41]:
len(train_corpus)

963

INSTANTIATE THE GENSIM DOC2VEC MODEL

In [47]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

Create the Vocabulary, Train the model using train_corpus, Store vectors

In [48]:
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
train_vectors = []
for i in range(len(train_corpus)):
    vector = model.infer_vector(train_corpus[i].words)
    train_vectors.append(vector)

Stack vectors, read training labels as an array

In [50]:
import numpy as np
import pandas as pd

X_train = np.vstack(X)
print(X_train.shape)

y_train = pd.read_table('/trainlabels.txt', header = None)
y_train.columns = ['y']
y_train = np.array(y_train['y'])
print(y_train.shape)

(963, 50)
(963,)


Repeat the above steps with the validation and test data (no need to retrain the genism model)

In [51]:
validation_corpus = list(read_corpus("/validation.txt"))
len(validation_corpus)

118

In [52]:
val_vectors = []

for i in range(len(validation_corpus)):
    vector = model.infer_vector(validation_corpus[i].words)
    val_vectors.append(vector)

In [55]:
X_validation = np.vstack(val_vectors)
print(X_validation.shape)

y_validation = pd.read_table('/validationlabels.txt', header = None)
y_validation.columns = ['y']
y_validation = np.array(y_validation['y'])
print(y_validation.shape)

(118, 50)
(118,)


In [56]:
test_corpus = list(read_corpus("/test.txt"))
len(test_corpus)

122

In [57]:
test_vectors = []

for i in range(len(test_corpus)):
    vector = model.infer_vector(test_corpus[i].words)
    test_vectors.append(vector)

In [59]:
X_test = np.vstack(test_vectors)
print(X_test.shape)

y_test = pd.read_table('/testlabels.txt', header = None)
y_test.columns = ['y']
y_test = np.array(y_test['y'])
print(y_test.shape)

(122, 50)
(122,)


# Step 2: Train classifiers and test using validation data

1. Logistic regression model

First do not set any penalty to set a baseline

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(solver = 'saga', max_iter = 5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validation)
print("No Penalty", f1_score(y_pred, y_validation))

No Penalty 0.4893617021276596


Test model with an L1, L2, and Elasticnet penalty

In [85]:
clf = LogisticRegression(solver = 'saga', max_iter = 5000, penalty = 'l1')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validation)
print("L1 penalty",f1_score(y_pred, y_validation))

clf = LogisticRegression(solver = 'saga', max_iter = 5000, penalty = 'l2')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validation)
print("L2 penalty",f1_score(y_pred, y_validation))

clf = LogisticRegression(solver = 'saga', max_iter = 5000, penalty = 'elasticnet', l1_ratio = 0.7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validation)
print("Elasticnet penality",f1_score(y_pred, y_validation))

L1 penalty 0.4946236559139786
L2 penalty 0.4893617021276596
Elasticnet penality 0.4946236559139786


Increase in F1 score for L1 and Elasticnet

2. Random forest model

In [86]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
scores = []
for i in range(1, 15):
    clf = RandomForestClassifier(max_depth=i)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_validation)
    score = f1_score(y_pred, y_validation)
    print(score)
    scores.append(score)
    
best_depth = (scores.index(max(scores))+1)

print('The best F1 score of', max(scores), "with an optimal depth of", best_depth)

0.6870229007633588
0.6607142857142857
0.6846846846846847
0.6666666666666666
0.6964285714285715
0.6542056074766355
0.6371681415929203
0.6407766990291263
0.6481481481481481
0.5686274509803921
0.6226415094339623
0.6846846846846847
0.6728971962616823
0.6336633663366336
The best F1 score of 0.6964285714285715 with an optimal depth of 5


Random Forest classifier is out performing Logistic Regression for this dataset.

3. Neural Network

Constructing a loop to see best number of hidden layers for one layer

In [100]:
from sklearn.neural_network import MLPClassifier
scores = []
for i in range(1,22, 3):
    clf = MLPClassifier(hidden_layer_sizes = (i,), activation = 'tanh', max_iter=2000).fit(X_train, y_train)
    y_pred = clf.predict(X_validation)
    score = f1_score(y_pred, y_validation)
    print(score)
    scores.append(score)
    
best_index = (scores.index(max(scores)) + 1)
print('The optimal index was', best_index, 'and the best F1 score for 1 hidden node within 1 hidden layer was', max(scores))

0.5436893203883496
0.6153846153846154
0.5714285714285715
0.43478260869565216
0.5656565656565657
0.5154639175257731
0.5544554455445545
The optimal index was 2 and the best F1 score for 1 hidden node within 1 hidden layer was 0.6153846153846154


Testing 2 and 3 hidden nodes as well

In [107]:
scores = []
for i in range(2,4):
    clf = MLPClassifier(hidden_layer_sizes = (i,), activation = 'tanh', max_iter=2000).fit(X_train, y_train)
    y_pred = clf.predict(X_validation)
    score = f1_score(y_pred, y_validation)
    print(score)
    scores.append(score)

0.4893617021276596
0.5050505050505051


1 Hidden Node yielded the best F1 score compared to 2 and 3 but the 3rd one was better than the 2nd.

Repeating the same procedure with the number of hidden layers using the 3 hidden nodes from above

In [113]:
scores = []
for i in range(1,22, 3):
    clf = MLPClassifier(hidden_layer_sizes = (3, i), activation = 'tanh', max_iter=2000).fit(X_train, y_train)
    y_pred = clf.predict(X_validation)
    score = f1_score(y_pred, y_validation)
    print(score)
    scores.append(score)
    
best_index = (scores.index(max(scores)) + 1)
print('The optimal index was', best_index, 'With an F1 score of', max(scores),'for 3 hidden nodes')

0.607843137254902
0.5052631578947367
0.46808510638297873
0.54
0.5252525252525252
0.5283018867924527
0.5825242718446603
The optimal index was 1 With an F1 score of 0.607843137254902 for 3 hidden nodes


Testing with 2 and 3 hidden layers using 3 hidden nodes

In [114]:
scores = []
for i in range(2,4):
    clf = MLPClassifier(hidden_layer_sizes = (3,i), activation = 'tanh', max_iter=2000).fit(X_train, y_train)
    y_pred = clf.predict(X_validation)
    score = f1_score(y_pred, y_validation)
    print(score)
    scores.append(score)

0.4545454545454545
0.5263157894736842


The performance descreased. Will use 1 hidden nodes within 1 hidden layer on the testing data since that was the highest F1 score.

# Step 3: Evaluate performance on Test set using best parameters found from Validation set.

1. Test the Logistic Regression Model

In [116]:
from sklearn.metrics import precision_score, recall_score

In [117]:
clf = LogisticRegression(solver = 'saga', max_iter = 5000, penalty = 'l1') 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('F1 score: ', f1_score(y_pred, y_test))
print('Precision: ', precision_score(y_pred, y_test))
print('Recall: ', recall_score(y_pred, y_test))

F1 score:  0.6666666666666666
Precision:  0.5964912280701754
Recall:  0.7555555555555555


Above using l1 penalty, below using elasticnet penalty since both were F1 scores were tied for highest on Validation set.

In [119]:
clf = LogisticRegression(solver = 'saga', max_iter = 5000, penalty = 'elasticnet',l1_ratio = 0.7) 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('F1 score: ', f1_score(y_pred, y_test))
print('Precision: ', precision_score(y_pred, y_test))
print('Recall: ', recall_score(y_pred, y_test))

F1 score:  0.6666666666666666
Precision:  0.5964912280701754
Recall:  0.7555555555555555


Performance on Training data

In [120]:
y_pred = clf.predict(X_train)
print('F1 score =', f1_score(y_pred, y_train))
print('Precision =', precision_score(y_pred, y_train))
print('Recall =', recall_score(y_pred, y_train))

F1 score = 0.8051434223541049
Precision = 0.814
Recall = 0.7964774951076321


Performance on Training set is best, much better than the Test set. However, the performance on the Validation sets were the worst. Slightly overfitting.

2. Test the Random Forest

In [121]:
clf = RandomForestClassifier(max_depth = 5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('F1 score =', f1_score(y_pred, y_test))
print('Precision =', precision_score(y_pred, y_test))
print('Recall =', recall_score(y_pred, y_test))

F1 score = 0.7387387387387387
Precision = 0.7192982456140351
Recall = 0.7592592592592593


Performance on Training data

In [122]:
y_pred = clf.predict(X_train)
print('F1 score =', f1_score(y_pred, y_train))
print('Precision =', precision_score(y_pred, y_train))
print('Recall =', recall_score(y_pred, y_train))

F1 score = 0.9103313840155945
Precision = 0.934
Recall = 0.8878326996197718


This one is also slightly overfitting the data since the performance of the Training set is higher than the Test set. However, the Random Forest classifier gives better performance than Logistic Regression for Test data.

3. Test the Neural Network

In [123]:
clf = MLPClassifier(hidden_layer_sizes = (1, 1), activation = 'tanh', max_iter=2000).fit(X_train, y_train)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('F1 score =', f1_score(y_pred, y_test))
print('Precision =', precision_score(y_pred, y_test))
print('Recall =', recall_score(y_pred, y_test))

F1 score = 0.6846846846846847
Precision = 0.6666666666666666
Recall = 0.7037037037037037


Check Performance on Training data

In [124]:
y_pred = clf.predict(X_train)
print('F1 score =', f1_score(y_pred, y_train))
print('Precision =', precision_score(y_pred, y_train))
print('Recall =', recall_score(y_pred, y_train))

F1 score = 0.8282630029440627
Precision = 0.844
Recall = 0.8131021194605009


The Neural Network is numerically the 2nd best classifier surprisingly. Given we do not have large amounts of data, we expected the test scores to be lowest. Again, the Training data performed better than the Test set but not overfitting as much as Random Forests.

# Conclusion

- The neural network is the best classifier taking into consideration the training and test scores and the amount of potential overfitting.
- Although Random Forest classifier had the largest F1 scores, it overfit the data. 


- To further improve, gathering more data would be more helpful. Testing out more different paremeters with something like gridsearchcv if our computers were more powerful and could more easily handle more combinations of parameters. 
