### Clean the data

In [23]:
import pandas as pd

path = 'Twitter_Dataset_cleaned_2.csv'
df = pd.read_csv(path, encoding='utf-8')

In [24]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [25]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camdeardorff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [27]:
# go 50/50 to start with
df = df.reset_index(drop=True)
divide = int(len(df) / 2)

X_train = df.loc[:divide, 'text'].values
y_train = df.loc[:divide, 'sentiment'].values
X_test = df.loc[divide:, 'text'].values
y_test = df.loc[divide:, 'sentiment'].values
print("train: {}".format(len(X_train)))
print("test: {}".format(len(X_test)))

train: 789307
test: 789306


In [44]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    #Remove html
    text = re.sub('<[^>]*>', '', text)
    #Remove hyperlinks
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)
    #Remove quotes
    text = re.sub(r'&amp;quot;|&amp;amp', '', text)
    #Remove citations
    text = re.sub(r'(@[a-zA-Z0-9])\w*', '', text)
    #Remove hashtags
    text = re.sub(r'(#[a-zA-Z0-9])\w*', '', text)
    #Remove tickers
    text = re.sub(r'\$[a-zA-Z0-9]*', '', text)
    #Remove numbers
    text = re.sub(r'[0-9]*','',text)
    

    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
        
        
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            label = int(line[:1])
            text = line[1:]
            yield text, label
            
print(tokenizer("@hello therefore data mining #hashtag http://google.com http://url.com/bla1/blah1/"))

['therefore', 'data', 'mining']


In [51]:
next(stream_docs(path=path))

(',@ayashcliche goodnight \n', 1)

In [46]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [47]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path=path)

In [48]:
import pyprind
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


In [49]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.738


In [50]:
import coremltools

input_features = ["message"]
output_feature = "sentiment"

model = coremltools.converters.sklearn.convert(clf, input_features, output_feature)
model.save("Sentiment.mlmodel")

ValueError: Transformer 'SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=1, shuffle=True,
       tol=None, verbose=0, warm_start=False)' not supported; supported transformers are coremltools.converters.sklearn._dict_vectorizer,coremltools.converters.sklearn._one_hot_encoder,coremltools.converters.sklearn._normalizer,coremltools.converters.sklearn._standard_scaler,coremltools.converters.sklearn._imputer,coremltools.converters.sklearn._NuSVC,coremltools.converters.sklearn._NuSVR,coremltools.converters.sklearn._SVC,coremltools.converters.sklearn._SVR,coremltools.converters.sklearn._linear_regression,coremltools.converters.sklearn._LinearSVC,coremltools.converters.sklearn._LinearSVR,coremltools.converters.sklearn._logistic_regression,coremltools.converters.sklearn._random_forest_classifier,coremltools.converters.sklearn._random_forest_regressor,coremltools.converters.sklearn._decision_tree_classifier,coremltools.converters.sklearn._decision_tree_regressor,coremltools.converters.sklearn._gradient_boosting_classifier,coremltools.converters.sklearn._gradient_boosting_regressor.