In [1]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.feature_extraction import text

import warnings
warnings.filterwarnings('ignore')

I first got rid of all special characters using Unix command:
sed "s/[^a-zA-Z]/ /g" training_data.txt > new_training_data.txt.
My new_training_data.txt only contains alphabet letters and numbers.

In [2]:
# Load (new) training data and labels

train_datafile = open("./src/data_files/new_training_data.txt")
train_datafile_lines = [line.rstrip('\n') for line in train_datafile]
train_labels = np.loadtxt("./src/data_files/training_labels.txt",\
                          delimiter='\n', dtype=np.int32)

In [3]:
vectorizer = text.CountVectorizer()

training_data = vectorizer.fit_transform(train_datafile_lines)
words = vectorizer.vocabulary_
print("vocabulary size: ", len(words))
print(training_data.shape)

"""
The size of my feature set: 40924

"""


vocabulary size:  40924
(30000, 40924)


In [4]:
# Load validation data

val_datafile = open("./src/data_files/val_data.txt")
val_datafile_lines = [line.rstrip('\n') for line in val_datafile]

val_vectorizer = text.CountVectorizer(vocabulary=words)

val_data = val_vectorizer.fit_transform(val_datafile_lines)
val_labels = np.loadtxt("./src/data_files/val_labels.txt", delimiter='\n', dtype=np.int32)
# print(len(val_vectorizer.vocabulary_))

40924


## sklearn Logistic Regression

## Question 1.

In [5]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()

In [6]:
logistic_regression.fit(training_data, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Measure the performance on training and validation data 
print("Training accuracy =", logistic_regression.score(training_data, train_labels))
print("Validation accuracy =", logistic_regression.score(val_data, val_labels))

'''
Training accuracy = 0.9754333333333334
Validation accuracy = 0.8802
'''

Training accuracy = 0.9754333333333334
Validation accuracy = 0.8802


The size of the feature set is the number of words in my word set, which is 40924.
Training accuracy = 0.9754333333333334
Validation accuracy = 0.8802.

## Question 2.

Re-train => change ngram_range from (1,1) to (1,2)

In [8]:
train_vectorizer2 = text.CountVectorizer(ngram_range=(1,2))
train_data2 = train_vectorizer2.fit_transform(train_datafile_lines)

In [9]:
print("vocabulary size: ", len(train_vectorizer2.vocabulary_))

"""
The size of my feature set after changing ngram_range from (1,1) to (1,2):  736981
"""

vocabulary size:  736981


In [10]:
logistic_regression2 = LogisticRegression()
logistic_regression2.fit(train_data2, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
val_vectorizer2 = text.CountVectorizer(vocabulary=train_vectorizer2.vocabulary_)
val_data2 = val_vectorizer2.fit_transform(val_datafile_lines)

In [12]:
# Measure the performance on training and validation data 
print("Training accuracy with ngram_range (1,2) =",\
      logistic_regression2.score(train_data2, train_labels))
print("Validation accuracy with ngram_range (1,2) =",\
      logistic_regression2.score(val_data2, val_labels))

"""
Training accuracy with ngram_range (1,2) = 0.9999
Validation accuracy with ngram_range (1,2) = 0.8763

"""

Training accuracy with ngram_range (1,2) = 0.9999
Validation accuracy with ngram_range (1,2) = 0.8763


The size of the feature set is the number of words in my word set, which is 736981.
Training accuracy = 0.9999
Validation accuracy = 0.8763. 

## Question 3.

The default value of C in sklearn LogisticRegression is 1.0, so the lambda = 1/C = 1.0. 

C = 1/lambda. try lambda = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]

In [13]:
lambdas = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000]
for lamb in lambdas:
    c = 1/lamb
    classifier = LogisticRegression(C=c)
    classifier.fit(train_data2, train_labels)
    
    # Measure the performance on training and validation data 
    print("Training accuracy [penalty=L2, ngram_range=(1,2), C={}] =".format(c),\
          classifier.score(train_data2, train_labels))
    print("Validation accuracy [penalty=L2, ngram_range=(1,2), C={}] =".format(c),\
          classifier.score(val_data2, val_labels))
    print("\n")

"""
Training accuracy [penalty=L2, ngram_range=(1,2), C=10000.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=10000.0] = 0.8603


Training accuracy [penalty=L2, ngram_range=(1,2), C=1000.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=1000.0] = 0.8634


Training accuracy [penalty=L2, ngram_range=(1,2), C=100.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=100.0] = 0.8705


Training accuracy [penalty=L2, ngram_range=(1,2), C=10.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=10.0] = 0.8727


Training accuracy [penalty=L2, ngram_range=(1,2), C=1.0] = 0.9999
Validation accuracy [penalty=L2, ngram_range=(1,2), C=1.0] = 0.8763


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.1] = 0.9932
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.1] = 0.8772


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.01] = 0.9431333333333334
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.01] = 0.8696


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.001] = 0.8763
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.001] = 0.8384


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.0001] = 0.8115
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.0001] = 0.7864


"""

Training accuracy [penalty=L2, ngram_range=(1,2), C=10000.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=10000.0] = 0.8603


Training accuracy [penalty=L2, ngram_range=(1,2), C=1000.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=1000.0] = 0.8634


Training accuracy [penalty=L2, ngram_range=(1,2), C=100.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=100.0] = 0.8705


Training accuracy [penalty=L2, ngram_range=(1,2), C=10.0] = 1.0
Validation accuracy [penalty=L2, ngram_range=(1,2), C=10.0] = 0.8727


Training accuracy [penalty=L2, ngram_range=(1,2), C=1.0] = 0.9999
Validation accuracy [penalty=L2, ngram_range=(1,2), C=1.0] = 0.8763


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.1] = 0.9932
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.1] = 0.8772


Training accuracy [penalty=L2, ngram_range=(1,2), C=0.01] = 0.9431333333333334
Validation accuracy [penalty=L2, ngram_range=(1,2), C=0.01] = 0.8696


Training accuracy [penalty

Observation: The best validation accuracy came with C=0.1 with 0.8772.

## Question 4.

Since applying L1-regularization to the weight vector theta is defined by the sum of the absolute value of the elements in the vector, plotting theta in the coordinate system results in a diamond shape in #dimensions where # is the number of elements in theta. Before applying L1-regularization, the optimal value of the objective function sits somewhere in the coordinate. By moving away from that point by following the contour plots, it is highly likely that the contour plots hit the axes on which the diamond's angles sit. This means that those weights get set to 0, resulting in the sparsity of theta.

In [14]:
lambdas = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000]
for lamb in lambdas:
    c = 1/lamb
    classifier2 = LogisticRegression(penalty='l1', C=c)
    classifier2.fit(train_data2, train_labels)
    
    # Measure the performance on training and validation data 
    print("Training accuracy [penalty=L1, ngram_range=(1,2), C={}] =".format(c),\
          classifier2.score(train_data2, train_labels))
    print("Validation accuracy [penalty=L1, ngram_range=(1,2), C={}] =".format(c),\
          classifier2.score(val_data2, val_labels))
    print("\n")


"""
Training accuracy [penalty=L1, ngram_range=(1,2), C=10000.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=10000.0] = 0.8468


Training accuracy [penalty=L1, ngram_range=(1,2), C=1000.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1000.0] = 0.8668


Training accuracy [penalty=L1, ngram_range=(1,2), C=100.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=100.0] = 0.8719


Training accuracy [penalty=L1, ngram_range=(1,2), C=10.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=10.0] = 0.8646


Training accuracy [penalty=L1, ngram_range=(1,2), C=1.0] = 0.9913
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1.0] = 0.8685


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.9111
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.8723


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.8334666666666667
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.8204


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.001] = 0.681
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.001] = 0.6725


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.4987666666666667
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.5013


"""

Training accuracy [penalty=L1, ngram_range=(1,2), C=10000.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=10000.0] = 0.8468


Training accuracy [penalty=L1, ngram_range=(1,2), C=1000.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1000.0] = 0.8668


Training accuracy [penalty=L1, ngram_range=(1,2), C=100.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=100.0] = 0.8719


Training accuracy [penalty=L1, ngram_range=(1,2), C=10.0] = 1.0
Validation accuracy [penalty=L1, ngram_range=(1,2), C=10.0] = 0.8646


Training accuracy [penalty=L1, ngram_range=(1,2), C=1.0] = 0.9913
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1.0] = 0.8685


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.9111
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.8723


Training accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.8334666666666667
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.8204


Training accuracy [penalty

Observation: The best validation accuracy came with C=0.1 with 0.8723.

## Question 5.

The difference in the validation accuracy of ngram_range = (1,1) and ngram_range = (1,2) was very small. Similarly, there is not a clear winner between L1 and L2 regularizations since the validation accuracies are very close. However, in both cases, the best validation accuracy was observed when C=0.1 (lambda=10). SGDClassifier with loss='log' is a logistic regression model. Using this model allows elasticnet penalty, which mixes L2 and L1 regularizations. 

In [15]:
from sklearn.linear_model import SGDClassifier

alphas = [1e-5, 1e-4, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
for al in alphas:
    SGD_classifier = SGDClassifier(loss='log', penalty='elasticnet', alpha=al)
    SGD_classifier.fit(train_data2, train_labels)
    
    # Measure the performance on training and validation data 
    print("Training accuracy [penalty=elasticnet, ngram_range=(1,2), C={}] =".format(al),\
          SGD_classifier.score(train_data2, train_labels))
    print("Validation accuracy [penalty=L1, ngram_range=(1,2), C={}] =".format(al),\
          SGD_classifier.score(val_data2, val_labels))
    print("\n")

    
"""
Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=1e-05] = 0.971
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1e-05] = 0.8524


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.0001] = 0.9707333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.8596


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.0001] = 0.9687333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.8604


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.001] = 0.8767
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.001] = 0.8102


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.01] = 0.8416333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.83


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.1] = 0.7230333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.7194


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=1] = 0.5012333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1] = 0.4987


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=10] = 0.5012333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=10] = 0.4987


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=100] = 0.5012333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=100] = 0.4987



"""

Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=1e-05] = 0.971
Validation accuracy [penalty=L1, ngram_range=(1,2), C=1e-05] = 0.8524


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.0001] = 0.9707333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.8596


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.0001] = 0.9687333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.0001] = 0.8604


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.001] = 0.8767
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.001] = 0.8102


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.01] = 0.8416333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.01] = 0.83


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=0.1] = 0.7230333333333333
Validation accuracy [penalty=L1, ngram_range=(1,2), C=0.1] = 0.7194


Training accuracy [penalty=elasticnet, ngram_range=(1,2), C=1] = 0.50

Observation: The best validation accuracy of (0.8671) is achieved with C=0.001, but this is very close to the above models and their parameters. Since some models have very similar accuracies, let's try Cross-Validation with f1 and accuracy scoring metrics to determine the best model!

In [16]:
# ngram_range(1,1)
cv_1 = cross_validate(logistic_regression, training_data, train_labels,\
                      scoring=('f1', 'accuracy'))
print("cross_validate, [scoring = f1, ngram_range(1,1)] = ", cv_1['test_f1'])
print("cross_validate, [scoring = accuracy, ngram_range(1,1)] = ", cv_1['test_accuracy'])

"""
cross_validate, [scoring = f1, ngram_range(1,1)] =  [0.87653587 0.87782178 0.8803334 ]
cross_validate, [scoring = accuracy, ngram_range(1,1)] =  [0.87541246 0.8766     0.87938794]
"""

cross_validate, [scoring = f1, ngram_range(1,1)] =  [0.87653587 0.87782178 0.8803334 ]
cross_validate, [scoring = accuracy, ngram_range(1,1)] =  [0.87541246 0.8766     0.87938794]


In [17]:
# ngram_range(1,2)

cv_2 = cross_validate(logistic_regression2, training_data, train_labels,\
                      scoring=('f1', 'accuracy'))
print("cross_validate, [scoring = f1, ngram_range(1,2)] = ", cv_2['test_f1'])
print("cross_validate, [scoring = accuracy, ngram_range(1,2)] = ", cv_2['test_accuracy'])

"""
cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87653587 0.87782178 0.8803334 ]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.87541246 0.8766     0.87938794]
"""

cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87653587 0.87782178 0.8803334 ]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.87541246 0.8766     0.87938794]


In [18]:
# L2 regularization, ngram_range(1,2), C=0.1

c3 = LogisticRegression(C=0.1).fit(train_data2, train_labels)
cv_3 = cross_validate(c3, training_data, train_labels, scoring=('f1', 'accuracy'))
print("cross_validate, [scoring = f1, ngram_range(1,2)] = ", cv_3['test_f1'])
print("cross_validate, [scoring = accuracy, ngram_range(1,2)] = ", cv_3['test_accuracy'])

"""
cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.880976   0.8834817  0.88453547]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.880012   0.8822     0.88328833]
"""

cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.880976   0.8834817  0.88453547]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.880012   0.8822     0.88328833]


In [19]:
# L1 regularization, ngram_range(1,2), C=0.1
c4 = LogisticRegression(penalty='l1', C=0.1).fit(train_data2, train_labels)
cv_4 = cross_validate(c4, training_data, train_labels, scoring=('f1', 'accuracy'))
print("cross_validate, [scoring = f1, ngram_range(1,2)] = ", cv_4['test_f1'])
print("cross_validate, [scoring = accuracy, ngram_range(1,2)] = ", cv_4['test_accuracy'])

"""
cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87023656 0.87391304 0.8756917 ]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.86891311 0.8724     0.87418742]
"""

cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87023656 0.87391304 0.8756917 ]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.86891311 0.8724     0.87418742]


In [20]:
# Elastic Net regularization, ngram_range(1,2), C=0.001
c5 = SGDClassifier(loss='log', penalty='elasticnet', alpha=0.001).fit(train_data2, train_labels)
cv_5 = cross_validate(c5, training_data, train_labels, scoring=('f1', 'accuracy'))
print("cross_validate, [scoring = f1, ngram_range(1,2)] = ", cv_5['test_f1'])
print("cross_validate, [scoring = accuracy, ngram_range(1,2)] = ", cv_5['test_accuracy'])

"""
cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87587685 0.87362137 0.87384191]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.87261274 0.8751     0.8719872 ]
"""

cross_validate, [scoring = f1, ngram_range(1,2)] =  [0.87587685 0.87362137 0.87384191]
cross_validate, [scoring = accuracy, ngram_range(1,2)] =  [0.87261274 0.8751     0.8719872 ]


It looks like using the F1 metric yields slightly better scores, and the best cross_validation score achieved was (cv_3) L2 regularization, ngram_range(1,2), C=0.1 with the mean F1 score of 0.883. So, I will use (cv_3) to make predictions for the test data.

As seen above, L2 regularization, ngram_range(1,2), C=0.1 yields training accuracy of 0.9932 and validation accuracy of 0.8772.

In [21]:
test_datafile = open("./src/data_files/test_data.txt")
test_datafile_lines = [line.rstrip('\n') for line in test_datafile]

test_vectorizer = text.CountVectorizer(vocabulary=train_vectorizer2.vocabulary_)
test_data = test_vectorizer.fit_transform(test_datafile_lines)

In [22]:
predictions = c3.predict(test_data)

In [23]:
np.savetxt("hc2kc-lr-test.pred", predictions.astype(int), fmt='%.0f')