In [6]:
datafile = "a-dev.dist.txt.csv"

In [7]:
import numpy as np
import pandas as pd

df = pd.read_csv(datafile, na_values='', encoding='utf-8')
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.fillna('')
tweets = df.as_matrix()[:, :]

print("number of tweets: ", len(tweets))

number of tweets:  1663


In [8]:
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation


# nltk.download()

# Create features
def features(sentence):
    sentence = re.sub('<[^>]*>', '', sentence)
    #Remove hyperlinks
    sentence = re.sub(r"http\S+", '', sentence, flags=re.MULTILINE)
    #Remove quotes
    sentence = re.sub(r'&amp;quot;|&amp;amp', '', sentence)
    #Remove citations
    sentence = re.sub(r'(@[a-zA-Z0-9])\w*', '', sentence)
    #Remove hashtags
    sentence = re.sub(r'(#[a-zA-Z0-9])\w*', '', sentence)
    #Remove tickers
    sentence = re.sub(r'\$[a-zA-Z0-9]*', '', sentence)
    #Remove numbers
    sentence = re.sub(r'[0-9]*','',sentence)
    

    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', sentence.lower())
    sentence = re.sub('[\W]+', ' ', sentence.lower()) +        ' '.join(emoticons).replace('-', '')
    
    stop_words = stopwords.words('english') + list(punctuation)
    words = word_tokenize(sentence)
    words = [w.lower() for w in words]
    filtered = [w for w in words if w not in stop_words and not w.isdigit()]
    words = {}
    for word in filtered:
        if word in words:
            words[word] += 1.0
        else:
            words[word] = 1.0
    return words

print("checkmark 3")

checkmark 3


In [9]:
# Vectorize the features function
features = np.vectorize(features)
# Extract the features for the whole dataset
X = features(tweets[:, 1])
# Set the targets
y = tweets[:, 0]

print("checkmark 4")

checkmark 4


In [11]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Create grid search
clf = Pipeline([("dct", DictVectorizer()), ("svc", LinearSVC())])
params = {
    "svc__C": [1e15, 1e13, 1e11, 1e9, 1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5]
}
gs = GridSearchCV(clf, params, cv=10, verbose=1, n_jobs=-1, refit=True)
gs.fit(X, y)
model = gs.best_estimator_

# Print results
print(model.score(X, y))
print("Optimized parameters: ", model)
print("Best CV score: ", gs.best_score_)

Fitting 10 folds for each of 11 candidates, totalling 110 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.3s


0.8382441371016236
Optimized parameters:  Pipeline(memory=None,
     steps=[('dct', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('svc', LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])
Best CV score:  0.7372218881539386


[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:    4.4s finished


In [12]:
import coremltools

# Convert to CoreML model
coreml_model = coremltools.converters.sklearn.convert(model)
coreml_model.author = 'Cameron Deardorff'
coreml_model.license = 'MIT'
coreml_model.short_description = 'Sentiment polarity LinearSVC.'
coreml_model.input_description['input'] = 'Features extracted from the text.'
coreml_model.output_description['classLabel'] = 'The most likely polarity (positive/neutral/negative), for the given input.'
coreml_model.output_description['classProbability'] = 'The probabilities for each class label, for the given input.'
coreml_model.save('SentimentPolarity.mlmodel')

In [None]:
# a.dist
# Best CV score:  0.8073365433350228