### Import the data

In [38]:
import pandas as pd
# to read
df = pd.read_csv('Twitter_Dataset_cleaned_1.csv', quotechar='"', error_bad_lines=False)
df.head(3)

Unnamed: 0,sentiment,text
0,1,is working on her listography. http://plurk.c...
1,1,@pxieVAMPIREdust why HELLO
2,1,Got a new guitaaar thank you Dad!


### Shuffle the Data

In [39]:
# shuffle the dataset and save it out
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('Twitter_Dataset_cleaned_1.csv', index=False, encoding='utf-8')

### Split the Data into Training and Testing Sets

In [42]:
# go 50/50 to start with
df = df.reset_index(drop=True)
half = int(len(df) / 2)

X_train = df.loc[:half, 'sentiment'].values
y_train = df.loc[:half, 'text'].values
X_test = df.loc[half:, 'sentiment'].values
y_test = df.loc[half:, 'text'].values
print(len(X_train))
print(len(X_test))

789307
789306


### Algorithms for Cleaning and Streaming Training Data into Batches

In [43]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            label = int(line[:1])
            text = line[1:]
            yield text, label
            
            
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

### Vectorize the Data

In [44]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

doc_stream = stream_docs(path='Twitter_Dataset_cleaned_1.csv')

### Train on a Logarithmic Stochastic Gradient Decent Model

In [45]:
import pyprind
pbar = pyprind.ProgBar(45)

clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02


In [46]:
test = ["The food at the cafeteria has gotten worse and worse."]
print(tokenizer(test[0]))

test = vect.transform(test)
print(test)
print(clf.decision_function(test))
print(clf.predict(test))

['food', 'cafeteria', 'gotten', 'worse', 'worse']
  (0, 42885)	0.755928946018
  (0, 361590)	0.377964473009
  (0, 614307)	0.377964473009
  (0, 846025)	-0.377964473009
[-0.58657387]
[0]


### Test the Model with the Testing Set

In [47]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.740
