### Clean the data

In [1]:
import pandas as pd

path = 'Twitter_Dataset_cleaned_2.csv'
df = pd.read_csv(path, encoding='utf-8')

In [2]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [3]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camdeardorff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [5]:
# go 50/50 to start with
df = df.reset_index(drop=True)
divide = int(len(df) / 2)

X_train = df.loc[:divide, 'text'].values
y_train = df.loc[:divide, 'sentiment'].values
X_test = df.loc[divide:, 'text'].values
y_test = df.loc[divide:, 'sentiment'].values
# print("train: {}".format(len(X_train)))
# print("test: {}".format(len(X_test)))
print(y_train)

[1 0 1 ... 0 0 0]


In [6]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to new file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-5352186733a55e910ea3f2211a76be9e.pkl
Memmaping (shape=(631444,), dtype=int64) to new file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-11be0b960ec33597e3e703e1d66716d9.pkl
Memmaping (shape=(157863,), dtype=int64) to new file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-c4dcb5c219bb5a4dc1f26e6ca48cb35e.pkl
Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-5352186733a55e910ea3f2211a76be9e.pkl
Memmaping (shape=(631446,), dtype=int64) to new file /var/folders/_2/3

[CV]  clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'othe

Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-5352186733a55e910ea3f2211a76be9e.pkl
Memmaping (shape=(631446,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-a5942b59cd08b253312aa567efb72fae.pkl
Memmaping (shape=(157861,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-39025e4101a47e3d45b3d0ba0d93c2e1.pkl
[CV] clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'thems

[CV] clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other

Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-5352186733a55e910ea3f2211a76be9e.pkl
Memmaping (shape=(631446,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-7f14d8632b162b5ff8f8a0a978769514.pkl
Memmaping (shape=(157861,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-714ef86a875925cfdefcc91118570420.pkl
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 12.4min
[CV] clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x10178b488> 
Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_907

Pickling array (shape=(789307,), dtype=object).
Memmaping (shape=(789307,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-5352186733a55e910ea3f2211a76be9e.pkl
Memmaping (shape=(631446,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-fe0235ad62a6314cd67dc540fde1c10e.pkl
Memmaping (shape=(157861,), dtype=int64) to old file /var/folders/_2/33c2srkx1gb7nsbhylt_vd_40000gn/T/joblib_memmaping_pool_9079_4340007600/9079-4595790792-dc904c30f4a35c5bcf99f272a13a860e.pkl
[CV]  clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'them

In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

In [7]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            label = int(line[:1])
            text = line[1:]
            yield text, label

In [8]:
next(stream_docs(path=path))

(',@ayashcliche goodnight \n', 1)

In [9]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path=path)

In [11]:
import pyprind
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02


In [12]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.739


In [14]:
import coremltools

input_features = ["message"]
output_feature = "sentiment"

model = coremltools.converters.sklearn.convert(clf, input_features, output_feature)
model.save("Sentiment.mlmodel")

ValueError: Transformer 'SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=1, shuffle=True,
       tol=None, verbose=0, warm_start=False)' not supported; supported transformers are coremltools.converters.sklearn._dict_vectorizer,coremltools.converters.sklearn._one_hot_encoder,coremltools.converters.sklearn._normalizer,coremltools.converters.sklearn._standard_scaler,coremltools.converters.sklearn._imputer,coremltools.converters.sklearn._NuSVC,coremltools.converters.sklearn._NuSVR,coremltools.converters.sklearn._SVC,coremltools.converters.sklearn._SVR,coremltools.converters.sklearn._linear_regression,coremltools.converters.sklearn._LinearSVC,coremltools.converters.sklearn._LinearSVR,coremltools.converters.sklearn._logistic_regression,coremltools.converters.sklearn._random_forest_classifier,coremltools.converters.sklearn._random_forest_regressor,coremltools.converters.sklearn._decision_tree_classifier,coremltools.converters.sklearn._decision_tree_regressor,coremltools.converters.sklearn._gradient_boosting_classifier,coremltools.converters.sklearn._gradient_boosting_regressor.