# Classification Using the tf-idf Representation

A baseline that uses no word embeddings at all: each tweet is turned into a tf-idf vector and classified with a linear model trained by SGD.
This scores about 0.79 on Kaggle with very little custom tuning.

In [77]:
import os

import numpy as np
from sklearn.linear_model import SGDClassifier

In [2]:
# Input file locations, all relative to the current working directory.
DATA_ROOT = os.path.join('..', 'data')
TRAIN = os.path.join(DATA_ROOT, 'train')
TEST = os.path.join(DATA_ROOT, 'test')

# Labelled training tweets (one file per class) and the unlabelled test tweets.
POS_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
NEG_TWEET_FILE = os.path.join(TRAIN, 'train_neg_full.txt')
TEST_TWEET_FILE = os.path.join(TEST, 'test_data.txt')

# NOTE(review): not referenced by any cell visible in this notebook.
EMBEDDING_SIZE = 300

In [13]:
def read_tweets(fname, encoding='utf-8'):
    """Read the tweets in the given file, one tweet per line.

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    encoding : str, optional
        Encoding used to decode the file. Passed explicitly so the result
        does not depend on the locale's default encoding.

    Returns
    -------
    list of str
        The lines of the file in order. Each line keeps its trailing
        newline, exactly as ``readlines`` returns it.
    """
    with open(fname, 'r', encoding=encoding) as f:
        return f.readlines()

In [14]:
# Positive-class training tweets, as a list of raw lines (newlines included).
pos_tweets = read_tweets(POS_TWEET_FILE)

In [15]:
# Negative-class training tweets, as a list of raw lines (newlines included).
neg_tweets = read_tweets(NEG_TWEET_FILE)

In [16]:
# Unlabelled test tweets, as a list of raw lines (newlines included).
test_tweets = read_tweets(TEST_TWEET_FILE)

In [61]:
# Maximum number of tweets taken from each file.
lim = 1250000

pos_subset = pos_tweets[:lim]
neg_subset = neg_tweets[:lim]

# Layout of `sentences`: positive tweets, then negative tweets, then the
# unlabelled test tweets. Later cells rely on the test tweets coming last.
sentences = pos_subset + neg_subset + test_tweets[:lim]

# Labels cover only the training part. They are sized from the slices that
# were actually taken, so they stay aligned with `sentences` even when a file
# holds fewer than `lim` lines.
y_full = [+1] * len(pos_subset) + [-1] * len(neg_subset)
print(len(sentences))

2510000


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
# tf-idf featurizer. Per the scikit-learn documentation: sublinear_tf=True
# replaces tf with 1 + log(tf), max_df=0.5 ignores terms that appear in more
# than half of the documents, and stop_words='english' drops the built-in
# English stop-word list.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')

In [69]:
# The vocabulary and idf weights are fitted on every tweet in `sentences`,
# i.e. on the labelled training tweets and the unlabelled test tweets together.
X_full = vectorizer.fit_transform(sentences)
print(X_full.shape)

# `sentences` holds the labelled tweets first and the test tweets last, so the
# first len(y_full) rows are the training matrix and the remainder is the test
# set. Splitting at len(y_full) keeps X_full row-aligned with y_full without
# hard-coding the size of the test file.
n_labelled = len(y_full)

# NOTE: despite its name, X_valid is the unlabelled test set used for the
# Kaggle predictions, not a held-out validation split.
X_valid = X_full[n_labelled:]
X_full = X_full[:n_labelled]

print("Shape after removing validation data: {0}".format(X_full.shape))

(2510000, 444159)
Shape after removing validation data: (2500000, 444159)


In [80]:
# Show the non-zero tf-idf entries of two arbitrary training rows, separated
# by a blank line.
print(X_full[220], "", X_full[2555], sep="\n")

  (0, 191032)	0.425446586829
  (0, 278934)	0.56536793347
  (0, 427085)	0.57178154977
  (0, 227970)	0.288475958544
  (0, 385380)	0.229710816262
  (0, 205058)	0.190878817935

  (0, 326023)	0.579803551963
  (0, 395768)	0.666397465816
  (0, 151313)	0.468766742298


In [70]:
from sklearn.grid_search import *
from sklearn.metrics import *

# Candidate SGDClassifier settings explored by the grid search:
# two loss functions crossed with six values of `alpha`.
grid = dict(
    loss=['hinge', 'log'],
    alpha=[1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4],
)

def report(grid_scores, n_top=3):
    """Print the best-scoring grid-search candidates.

    Parameters
    ----------
    grid_scores : iterable
        Score records that can be indexed (position 1 is used as the sort
        key) and that expose ``mean_validation_score``,
        ``cv_validation_scores`` and ``parameters`` attributes.
    n_top : int, optional
        How many of the highest-ranked records to print.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        fold_spread = np.std(entry.cv_validation_scores)
        summary = "{2}: Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, fold_spread, rank)
        print(summary)
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [71]:
def eval_tfidf():
    """Grid-search an SGD-trained linear classifier on the tf-idf features.

    Reads the module-level ``grid``, ``X_full`` and ``y_full``. Runs a 5-fold
    cross-validated grid search over ``grid`` with 4 parallel jobs, prints up
    to 25 ranked candidates through ``report``, then prints the accuracy and
    F1 score of the fitted search object's predictions on the training data.

    Returns
    -------
    The object returned by ``GridSearchCV.fit``; it is used afterwards to
    predict on other data.
    """
    gs = GridSearchCV(SGDClassifier(), grid, cv=5, verbose=True, n_jobs=4)
    print("Starting grid search...")
    res = gs.fit(X_full, y_full)
    report(res.grid_scores_, n_top=25)

    # These are scores on the same rows the model was fitted on, so they are
    # optimistic compared with the cross-validation scores reported above.
    predY = res.predict(X_full)
    acc = accuracy_score(y_full, predY)
    # F1 must come from f1_score; with the +1/-1 labels it scores the +1
    # class (f1_score's default pos_label is 1).
    f1 = f1_score(y_full, predY)

    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))

    return res

In [72]:
# Run the grid search. `res` is kept because a later cell calls res.predict
# to produce the Kaggle predictions.
res = eval_tfidf()

Starting grid search...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:  1.7min finished


1: Mean validation score: 0.801 (std: 0.001)
Parameters: {'alpha': 1e-06, 'loss': 'hinge'}

2: Mean validation score: 0.801 (std: 0.001)
Parameters: {'alpha': 1e-06, 'loss': 'log'}

3: Mean validation score: 0.800 (std: 0.001)
Parameters: {'alpha': 5e-06, 'loss': 'hinge'}

4: Mean validation score: 0.797 (std: 0.001)
Parameters: {'alpha': 1e-05, 'loss': 'hinge'}

5: Mean validation score: 0.795 (std: 0.001)
Parameters: {'alpha': 5e-06, 'loss': 'log'}

6: Mean validation score: 0.791 (std: 0.001)
Parameters: {'alpha': 1e-05, 'loss': 'log'}

7: Mean validation score: 0.784 (std: 0.001)
Parameters: {'alpha': 5e-05, 'loss': 'hinge'}

8: Mean validation score: 0.776 (std: 0.002)
Parameters: {'alpha': 0.0001, 'loss': 'hinge'}

9: Mean validation score: 0.775 (std: 0.002)
Parameters: {'alpha': 5e-05, 'loss': 'log'}

10: Mean validation score: 0.764 (std: 0.002)
Parameters: {'alpha': 0.0001, 'loss': 'log'}

11: Mean validation score: 0.747 (std: 0.002)
Parameters: {'alpha': 0.0005, 'loss': 'hi

## Submitting to Kaggle

In [73]:
# Predict labels for X_valid, the rows split off the end of the tf-idf matrix
# (these correspond to the test tweets appended last to `sentences`).
kaggle_tfidf = res.predict(X_valid)

In [76]:
import time

# Whole seconds since the epoch, embedded in the file name so that successive
# runs write to different submission files.
timestamp = int(time.time())
submission_path = '../data/output/pred-tfidf-{0}.csv'.format(timestamp)

# One "Id,Prediction" row per test tweet; ids start at 1.
with open(submission_path, 'w') as f:
    f.write("Id,Prediction\n")
    f.writelines(
        "{0},{1}\n".format(row_id, label)
        for row_id, label in enumerate(kaggle_tfidf, start=1)
    )