In [1]:
import json
import re
import pandas as pd
import numpy as np
import random
import nltk
import codecs
import io
import pickle
import joblib
import seaborn as sns

from textblob import TextBlob
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Extracting features from text, define target y and data X
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics


%matplotlib inline

# Let's improve the models.

We have three models that we want to improve. An idea for cleaning the MNB:
- apply the nltk TweetTokenizer function to see if this tokenization method improves model.
- use this method of tokenizing for every model.
- apply gridsearch to each model.
- use the Pipeline feature and repickle each model with the pipeline applied.

THEN I'll feel comfortable applying this to unseen data.

What if we read in all the raw files that I originally used to put together the master file, and then applied the tokenization AFTER combining them?

### Short movie reviews

In [2]:
# Transform

short_pos = io.open("../data/training_data/short_positive_movie_reviews.txt", "r", encoding="latin-1").read()
short_neg = io.open("../data/training_data/short_negative_movie_reviews.txt", "r", encoding="latin-1").read()

all_reviews = []

for i in short_pos.split('\n'):
    all_reviews.append((i, "positive"))
    
for i in short_neg.split('\n'):
    all_reviews.append((i, "negative"))
    
print(len(all_reviews))

smr_df = pd.DataFrame(all_reviews, columns=['text', 'sentiment'])
# shuffle dataframe
smr_df = smr_df.sample(frac=1).reset_index(drop=True)
smr_df['source'] = 'short_movie_reviews'
print(smr_df.shape)
smr_df.sample(10)

10662
(10662, 3)


Unnamed: 0,text,sentiment,source
9145,"williams plays sy , another of his open-faced ...",positive,short_movie_reviews
7422,"this delicately observed story , deeply felt a...",positive,short_movie_reviews
10244,"expect the same-old , lame-old slasher nonsens...",negative,short_movie_reviews
2150,the story that emerges has elements of romance...,positive,short_movie_reviews
8612,tom green and an ivy league college should nev...,negative,short_movie_reviews
1312,despite terrific special effects and funnier g...,negative,short_movie_reviews
8567,let's cut to the consumer-advice bottom line :...,negative,short_movie_reviews
9045,it made me want to wrench my eyes out of my he...,negative,short_movie_reviews
3949,"no , it's not as single-minded as john carpent...",positive,short_movie_reviews
5380,"it's clear why deuces wild , which was shot tw...",negative,short_movie_reviews


In [3]:
smr_df.sentiment.value_counts()

negative    5331
positive    5331
Name: sentiment, dtype: int64

#### Sklearn

In [4]:
# # let's do the damn thang
# # create a function, maybe add a parameter to account for max_features

# def sklearn_naive_bayes(train, cat, n_features):
#     vect = TfidfVectorizer(ngram_range=(1, 2), max_features=n_features)
#     X = vect.fit_transform(train)
#     y = cat

#     #Partitioning the data into test and training set
#     SPLIT_PERC = 0.75
#     split_size = int(len(y)*SPLIT_PERC)

#     X_train = X[:split_size]
#     X_test = X[split_size:]
#     y_train = y[:split_size]
#     y_test = y[split_size:]

#     #Training the model
#     clf = MultinomialNB()
#     clf.fit(X_train, y_train)

#     #Evaluating the results
#     print("Accuracy on training set:")
#     print(clf.score(X_train, y_train))
#     print("Accuracy on testing set:")
#     print(clf.score(X_test, y_test))
#     y_pred = clf.predict(X_test)
#     print("Classification Report:")
#     print(metrics.classification_report(y_test, y_pred))

#### NLTK
Note that I'm only using unigrams for NLTK cuz I haven't figured out how to use bigrams with it

In [5]:
# def nltk_naive_bayes(df, n_features):

#     # split the cleaned_tweets column
#     df['split_tweets'] = df.cleaned_tweets.apply(lambda x: x.split())
#     all_reviews = zip(df.split_tweets, df.sentiment)

#     all_tweets = ' '.join(df['cleaned_tweets']).split()

#     freq_tweets = nltk.FreqDist(all_tweets)
#     print('The top 50 most common words are:')
#     print(freq_tweets.most_common(50))

#     word_features = [w[0] for w in freq_tweets.most_common(n_features)]
#     # as we can see, there are a lot of stop words in this list. we should consider
#     # removing the stop words to get better word features, but for now let's leave them in.
    
#     def find_features(document):
#         words = set(document)
#         features = {}
#         for w in word_features:
#             features[w] = (w in words)

#         return features

#     featuresets = [(find_features(rev), category) for (rev, category) in all_reviews]

#     training_set = featuresets[:int(len(featuresets)*.75)]
#     testing_set = featuresets[int(len(featuresets)*.75):]


#     # This is the algorithm calculation:
#     # posterior = prior occurences x likelihood / evidence
#     # the above gives us the likelihood of something to be positive (or negative). It's not the best algorithm,
#     # but it's scalable and easy to use

#     classifier = nltk.NaiveBayesClassifier.train(training_set)
#     print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
#     classifier.show_most_informative_features(25)

#### GREAT! OK. Let's make some functions so we can reproduce these steps with the other datasets

It might be worth doing some sort of gridsearch to find the point of diminishing returns for number of features to include in your model

### Airline reviews

In [6]:
# let's take a look at another dataset that we might be able to train with
airlines = pd.read_csv('../data/training_data/airline_reviews.csv')
print(airlines.columns)
airlines_df = airlines[['text', 'airline_sentiment']]
airlines_df.columns = ['text', 'sentiment']
airlines_df['source'] = 'airline_reviews'
print(airlines_df.shape)
airlines_df.sample(10)

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')
(14640, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,text,sentiment,source
10849,"@USAirways Thanks. No DC yet, I see. I will ke...",positive,airline_reviews
4782,@SouthwestAir Are flights going into Dallas ...,neutral,airline_reviews
13888,@AmericanAir @USAirways how can you have no fo...,negative,airline_reviews
2871,@united Still waiting on our bag! Never got de...,negative,airline_reviews
5728,@SouthwestAir... I love you. Air travel doesn'...,positive,airline_reviews
11135,@USAirways we bought our tickets months ago. H...,negative,airline_reviews
12448,@AmericanAir I'm flying into DCA my bag is at ...,negative,airline_reviews
189,@VirginAmerica you will match my #AmericanAirl...,positive,airline_reviews
13267,@AmericanAir Can you add my KTN to an existing...,negative,airline_reviews
6448,@SouthwestAir I hope you're happy! You have of...,negative,airline_reviews


In [7]:
airlines_df.sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: sentiment, dtype: int64

In [8]:
# # clean tweet with function i found online
# def clean_tweet(tweet): 
#     ''' 
#     Utility function to clean tweet text by removing links, special characters 
#     using simple regex statements. 
#     '''
#     x = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) | (\w+:\/\/\S+)", " ", tweet).split())
#     cleaned = re.sub(r'[^\x00-\x7f]',r'', re.sub(r'https:\/\/t.co\/[A-z0-9]*', '', x))
#     #take out the hashtags
#     without_hashtags = re.sub(r'[^\x00-\x7f]',r'', re.sub(r'\#[A-z0-9]*', '', cleaned))
#     # or you can remove it by applying: .decode('utf8').encode('ascii', errors='ignore')
#     return without_hashtags

# # tokenize and shit
# def tokenize_and_stem(row):
    
#     # remove punctuations
#     i = re.sub(r'[^\w\s]','',row)    
#     # tokenize the words in the short reviews first
#     words = word_tokenize(i)
#     # remove stop words
#     filtered = [ps.stem(w) for w in words if not w in stop_words]
#     return ' '.join(filtered)



### Yelp Reviews

In [9]:
yelp = pd.read_csv('../data/training_data/yelp_labelled.txt', sep="\t", header=None)
yelp.columns = ['text', 'category']
yelp['source'] = 'yelp'
yelp['sentiment'] = np.where(yelp.category == 1, 'positive', 'negative')
yelp_df = yelp[['text', 'sentiment', 'source']]
yelp_df.sample(10)

Unnamed: 0,text,sentiment,source
356,"Sadly, Gordon Ramsey's Steak is a place we sha...",negative,yelp
842,"I ate there twice on my last visit, and especi...",positive,yelp
884,On three different occasions I asked for well ...,negative,yelp
925,"I could barely stomach the meal, but didn't co...",negative,yelp
965,It's NOT hard to make a decent hamburger.,negative,yelp
502,Cant say enough good things about this place.,positive,yelp
244,"like the other reviewer said ""you couldn't pay...",negative,yelp
899,"Overall, a great experience.",positive,yelp
50,We ordered the duck rare and it was pink and t...,positive,yelp
687,"The decor is nice, and the piano music soundtr...",positive,yelp


In [10]:
yelp_df.sentiment.value_counts()

positive    500
negative    500
Name: sentiment, dtype: int64

### Amazon

In [11]:
amazon = pd.read_csv('../data/training_data/amazon_cells_labelled.txt', 
                   sep="\t", header=None)
amazon.columns = ['text', 'category']
amazon['sentiment'] = np.where(amazon.category == 1, 'positive', 'negative')
amazon['source'] = 'amazon'
amazon_df = amazon[['text', 'sentiment', 'source']]
print(amazon_df.shape)
amazon_df.sample(10)

(1000, 3)


Unnamed: 0,text,sentiment,source
374,Not a good item.. It worked for a while then s...,negative,amazon
652,The earpiece on this is too large or too heavy...,negative,amazon
106,That's a huge design flaw (unless I'm not usin...,negative,amazon
542,"Perhaps my phone is defective, but people cann...",negative,amazon
338,"It was an inexpensive piece, but I would still...",negative,amazon
134,Buyer--Be Very Careful!!!!!.,negative,amazon
941,I have tried these cables with my computer and...,positive,amazon
295,This is hands down the best phone I've ever had.,positive,amazon
862,So far it has worked like a charm.,positive,amazon
292,Excellent Phone.,positive,amazon


In [12]:
amazon_df.sentiment.value_counts()

positive    500
negative    500
Name: sentiment, dtype: int64

### Stanford dataset

- 0: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- 1: the id of the tweet (2087)
- 2: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- 3: the query (lyx). If there is no query, then this value is NO_QUERY.
- 4: the user that tweeted (robotickilldozr)
- 5: the text of the tweet (Lyx is cool)

In [13]:
# and then this random training set from airlines
stanford = pd.read_csv('../data/training_data/stanford_training_labelled.csv', header=None, encoding='ISO-8859-1')
stanford.columns = ['polarity', 'tweet_id', 'date', 'query', 'user', 'text']
stanford['sentiment'] = np.where(stanford.polarity == 0, 'negative', 
                                 np.where(stanford.polarity == 2, 'neutral', 'positive'))
stanford['source'] = 'stanford'
stanford_df = stanford[['text', 'sentiment', 'source']]
print(stanford_df.shape)
stanford_df.sample(10)

(1600000, 3)


Unnamed: 0,text,sentiment,source
736874,Trying to study for the upcoming exams but I a...,negative,stanford
1518222,I think i'm slowly beginning to cheer up. It's...,positive,stanford
923037,TODAY ONLY! - 20% off everything for Mother's ...,positive,stanford
664937,If I'd waited ONE and a HALF minutes to leave ...,negative,stanford
178545,in a very stress mood,negative,stanford
1557273,"&quot;When you came in the air went out, and e...",positive,stanford
1210243,Good night... What a day! What a party!,positive,stanford
328334,is sick of having bad days at work. I don't kn...,negative,stanford
115383,In Alexandria. Almost home So much to do befo...,negative,stanford
281759,"@BrandyWandLover dammit, can't play it on this...",negative,stanford


In [14]:
stanford_df.sentiment.value_counts()

positive    800000
negative    800000
Name: sentiment, dtype: int64

In [15]:
stanford_df.head(25)

Unnamed: 0,text,sentiment,source
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,stanford
1,is upset that he can't update his Facebook by ...,negative,stanford
2,@Kenichan I dived many times for the ball. Man...,negative,stanford
3,my whole body feels itchy and like its on fire,negative,stanford
4,"@nationwideclass no, it's not behaving at all....",negative,stanford
5,@Kwesidei not the whole crew,negative,stanford
6,Need a hug,negative,stanford
7,@LOLTrish hey long time no see! Yes.. Rains a...,negative,stanford
8,@Tatiana_K nope they didn't have it,negative,stanford
9,@twittera que me muera ?,negative,stanford


### Wikipedia dataset

In [16]:
lines = []

with open('../data/wikisent2.txt', encoding='utf-8') as f:
    for line in f:
        lines.append(line[:-2])

lines[:10]

['0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc',
 '000webhost is a free web hosting service, operated by Hostinger',
 '0010x0010 is a Dutch-born audiovisual artist, currently living in Los Angeles',
 '0-0-1-3 is an alcohol abuse prevention program developed in 2004 at Francis E. Warren Air Force Base based on research by the National Institute on Alcohol Abuse and Alcoholism regarding binge drinking in college students',
 '0.01 is the debut studio album of H3llb3nt, released on February 20, 1996 by Fifth Colvmn Records',
 '001 of 3 February 1997, which was signed between the Government of the Republic of Rwanda, and FAPADER',
 '003230 is a South Korean food manufacturer',
 '0.04%Gas molecules in soil are in continuous thermal motion according to the kinetic theory of gasses, there is also collision between molecules - a random walk',
 '0.04% of the votes were invalid',
 '005.1999.06 is the fifth studio album by the South Korean singer and actress Uhm Jung-hwa']

In [17]:
wiki_df = pd.DataFrame({'text':lines,
                        'sentiment':'neutral',
                        'source':'wikipedia'})

print(wiki_df.shape)
wiki_df.head()

(7871825, 3)


Unnamed: 0,text,sentiment,source
0,"0.000123, which corresponds to a distance of 7...",neutral,wikipedia
1,"000webhost is a free web hosting service, oper...",neutral,wikipedia
2,"0010x0010 is a Dutch-born audiovisual artist, ...",neutral,wikipedia
3,0-0-1-3 is an alcohol abuse prevention program...,neutral,wikipedia
4,"0.01 is the debut studio album of H3llb3nt, re...",neutral,wikipedia


In [18]:
np.random.seed(24)
wiki_sample = wiki_df.sample(400000)
wiki_sample.sample(10)

Unnamed: 0,text,sentiment,source
4017383,McGregor's work was rooted in motivation theor...,neutral,wikipedia
3169681,It is the first wuxia television series to fil...,neutral,wikipedia
6168424,The Indiana Business Bulletin provides weekly ...,neutral,wikipedia
4794525,Romania and Moldova are Eastern Orthodox count...,neutral,wikipedia
595353,A true claw is made of hard protein called ker...,neutral,wikipedia
3503116,It was released in July 1965 by Blue Note Records,neutral,wikipedia
5164280,Some protists are related to animals and some ...,neutral,wikipedia
677615,Barry Stuart McDonald (born 9 June 1942) was a...,neutral,wikipedia
1619500,Furth and Sondheim retained the basic structur...,neutral,wikipedia
7519991,"Trachylepis boettgeri, commonly knowm as Boett...",neutral,wikipedia


### Append the all together!

Well, let's maybe append part of it together. I don't think we're gonna need all of the stanford dataset, maybe 1/2 of it. The models seem to do pretty good without a ton of data.

1. smr_df
2. airlines_df
3. yelp_df
5. amazon_df
6. stanford_df (50%)

In [19]:
master = (smr_df.append(airlines_df)
          .append(yelp_df)
          .append(amazon_df)
          .append(stanford_df.sample(frac=1).reset_index(drop=True)[:int(len(stanford_df)/2)])
          .append(wiki_sample)
          .reset_index(drop=True))

print(master.shape)
print(master.source.value_counts())
# shuffle dataset
master = master.sample(frac=1).reset_index(drop=True)
master.sample(10)

(1227302, 3)
stanford               800000
wikipedia              400000
airline_reviews         14640
short_movie_reviews     10662
amazon                   1000
yelp                     1000
Name: source, dtype: int64


Unnamed: 0,text,sentiment,source
447463,is chatting on msn to Jennypoo!! &lt;3 Shes li...,positive,stanford
1062506,bye for now everyone! it's raining so hard out...,positive,stanford
283133,@ALLIEINCREDIBLE got money on the mind :o) Sle...,positive,stanford
688908,Obama ladislavii is a species of Brazilian lan...,neutral,wikipedia
686414,My case came open today and dropped my new iPh...,negative,stanford
986570,"@emilywalkerr @justinlovescolt haha, okay, may...",positive,stanford
899448,"Cousin is gone have had a lovely weekend, bac...",negative,stanford
638663,Doon is also part of Shimla Lok Sabha constitu...,neutral,wikipedia
737824,wishes she had another day to revise for tomor...,negative,stanford
1007587,"Besides his books, he has written for numerous...",neutral,wikipedia


In [20]:
master.sentiment.value_counts()

negative    414895
positive    409308
neutral     403099
Name: sentiment, dtype: int64

# the dataset is imbalanced but let's go ahead and try to clean it all right now using nltk tweet tokenizer and then tune the models

In [21]:
# def remove_punctuations(row):
#     return re.sub(r'[^\w\s]','',row)

# Let's leave the punctuations in there for now.

In [22]:
from nltk.tokenize import TweetTokenizer
tokenizer = nltk.casual.TweetTokenizer(strip_handles=True, 
                                       preserve_case=False,
                                       reduce_len=True) 

from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer # let's not stem the words

stop_words = set(stopwords.words("english"))
# remove stop words
master['cleaned'] = master.text.apply(lambda x: [w for w in tokenizer.tokenize(x) if not w in stop_words])
master.sample(10)                          

Unnamed: 0,text,sentiment,source,cleaned
344154,According to the photographer of this photogra...,neutral,wikipedia,"[according, photographer, photograph, ,, also,..."
307513,will not sleep early tonight.. i LOVE programm...,negative,stanford,"[sleep, early, tonight, .., love, programming,..."
833636,Why can I not sleep in anymore despite going t...,positive,stanford,"[sleep, anymore, despite, going, bed, 4, hours..."
1164366,It is very loosely based on the true life stor...,neutral,wikipedia,"[loosely, based, true, life, story, newton, kn..."
1015286,"@tommcfly aww! poor Harry..! hahaa, did yo...",negative,stanford,"[aww, !, poor, harry, .., !, hahaa, ,, take, n..."
979202,I have a very talented girlfriend,positive,stanford,"[talented, girlfriend]"
538544,He was the only one in his senior class,neutral,wikipedia,"[one, senior, class]"
600205,"@ home, totally pissed off, don`t wanna hear o...",negative,stanford,"[@, home, ,, totally, pissed, ,, `, wanna, hea..."
692659,Development began in 2011 after the studio com...,neutral,wikipedia,"[development, began, 2011, studio, completed, ..."
73199,"Released on June 19, 2000 as a lower-cost offe...",neutral,wikipedia,"[released, june, 19, ,, 2000, lower-cost, offe..."


# rerun models

# Logistic Regression

In [23]:
from sklearn.model_selection import train_test_split

X, y = master.text, master.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
from nltk.tokenize import TweetTokenizer
tokenizer = nltk.casual.TweetTokenizer(strip_handles=True, 
                                       preserve_case=False,
                                       reduce_len=True) 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

count_vect = CountVectorizer(tokenizer=tokenizer.tokenize) 
classifier = LogisticRegression()

In [25]:
sentiment_pipeline = Pipeline([
        ('vectorizer', count_vect),
        ('classifier', classifier)
    ])

In [28]:
from pandas_confusion import ConfusionMatrix
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import re

def genericize_mentions(text):
    return re.sub(r'@[\w_-]+', 'thisisanatmention', text)

def get_tweet_length(text):
    return len(text)

def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

def reshape_a_feature_column(series):
    return np.reshape(np.asarray(series), (len(series), 1))

def pipelinize_feature(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            processed = reshape_a_feature_column(processed)
            return processed
#         This is incredibly stupid and hacky, but we need it to do a grid search.
#         If a feature is deactivated, we're going to just return a column of zeroes.
#         Zeroes shouldn't affect the regression, but other values may.
#         If you really want brownie points, consider pulling out that feature column later in the pipeline.
        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))

    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

def display_null_accuracy(y_test):
    value_counts = pd.value_counts(y_test)
    null_accuracy = max(value_counts) / float(len(y_test))
    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))
    return null_accuracy

def display_accuracy_score(y_test, y_pred_class):
    score = accuracy_score(y_test, y_pred_class)
    print('accuracy score: %s' % '{:.2%}'.format(score))
    return score

def display_accuracy_difference(y_test, y_pred_class):
    null_accuracy = display_null_accuracy(y_test)
    accuracy_score = display_accuracy_score(y_test, y_pred_class)
    difference = accuracy_score - null_accuracy
    if difference > 0:
        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))
    elif difference < 0:
        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))
    elif difference == 0:
        print('model is exactly as accurate as null accuracy')
    return null_accuracy, accuracy_score

def train_test_and_evaluate(pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    y_pred_class = pipeline.predict(X_test)
    confusion_matrix = ConfusionMatrix(list(y_test), list(y_pred_class))
    display_accuracy_difference(y_test, y_pred_class)
    print('-' * 75 + '\nConfusion Matrix\n')
    print(confusion_matrix)
    print('-' * 75 + '\nClassification Report\n')
    print(metrics.classification_report(y_test, y_pred_class))
      
    return pipeline, confusion_matrix

In [29]:
sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, X_train, y_train, X_test, y_test)


null accuracy: 33.78%
accuracy score: 85.48%
model is 51.70% more accurate than null accuracy
---------------------------------------------------------------------------
Confusion Matrix

Predicted  negative  neutral  positive  __all__
Actual                                         
negative      81153      845     21656   103654
neutral         666    98843       989   100498
positive      19247     1142     82285   102674
__all__      101066   100830    104930   306826
---------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

    negative       0.80      0.78      0.79    103654
     neutral       0.98      0.98      0.98    100498
    positive       0.78      0.80      0.79    102674

   micro avg       0.85      0.85      0.85    306826
   macro avg       0.86      0.86      0.86    306826
weighted avg       0.85      0.85      0.85    306826



In [30]:
joblib.dump(sentiment_pipeline, '../pickle_files/vectorizer_and_logreg.pkl')

['../pickle_files/vectorizer_and_logreg.pkl']

# Adding a custom function to a pipeline

TODO: Start here: https://ryan-cranfill.github.io/sentiment-pipeline-sklearn-3/

What features might make sense to add? Do we want to add tweet length? Should we just stick with our text vectorizer for now? I say stick with what we have for now and see how it performs, we can iterate later if we want to. MVP MVP MVP!!

In [31]:
def get_tweet_length(text):
    return len(text)

def reshape_a_feature_column(series):
    return np.reshape(np.asarray(series), (len(series), 1))

def pipelinize_feature(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            processed = reshape_a_feature_column(processed)
            return processed
#         This is incredibly stupid and hacky, but we need it to do a grid search with activation/deactivation.
#         If a feature is deactivated, we're going to just return a column of zeroes.
#         Zeroes shouldn't affect the regression, but other values may.
#         If you really want brownie points, consider pulling out that feature column later in the pipeline.
        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))

In [32]:
from sklearn.pipeline import FeatureUnion, Pipeline


sentiment_pipeline = Pipeline([
        ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
        ('features', FeatureUnion([
                    ('vectorizer', count_vect),
                    ('post_length', pipelinize_feature(get_tweet_length, active=True))
                ])),
        ('classifier', classifier)
    ])

sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, X_train, y_train, X_test, y_test)

null accuracy: 33.78%
accuracy score: 85.61%
model is 51.83% more accurate than null accuracy
---------------------------------------------------------------------------
Confusion Matrix

Predicted  negative  neutral  positive  __all__
Actual                                         
negative      81208      823     21623   103654
neutral         637    99032       829   100498
positive      19217     1017     82440   102674
__all__      101062   100872    104892   306826
---------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

    negative       0.80      0.78      0.79    103654
     neutral       0.98      0.99      0.98    100498
    positive       0.79      0.80      0.79    102674

   micro avg       0.86      0.86      0.86    306826
   macro avg       0.86      0.86      0.86    306826
weighted avg       0.86      0.86      0.86    306826



# GridSearchCV

...is not working. try it on a Google Colab notebook using the GPU. Skip it for now cuz i'm being impatient at the moment.

In [33]:
from sklearn.model_selection import GridSearchCV

tokenizer_lowercase = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=False)
tokenizer_lowercase_reduce_len = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=True)
tokenizer_uppercase = nltk.casual.TweetTokenizer(preserve_case=True, reduce_len=False)
tokenizer_uppercase_reduce_len = nltk.casual.TweetTokenizer(preserve_case=True, reduce_len=True)

# Our parameter dictionary
# You access parameters by giving the dictionary keys of <featurename>__<parameter>
# The values of each keys are a list of values that you want to test

parameters = {
    'genericize_mentions__kw_args': [{'active':False}, {'active':True}], # genericizing mentions on/off
    'features__vectorizer__ngram_range': [(1,1), (1,2)], # ngram range of tokenizer
    'features__vectorizer__tokenizer': [tokenizer_lowercase.tokenize, # differing parameters for the TweetTokenizer
                                        tokenizer_lowercase_reduce_len.tokenize,
                                        tokenizer_uppercase.tokenize,
                                        tokenizer_uppercase_reduce_len.tokenize,
                                        None], # None will use the default tokenizer
    'features__vectorizer__max_df': [0.25, 0.5], # maximum document frequency for the CountVectorizer
    'classifier__C': np.logspace(-1, 0, 1) # C value for the LogisticRegression
}

grid = GridSearchCV(sentiment_pipeline, parameters, verbose=1)

In [34]:
grid, confusion_matrix = train_test_and_evaluate(grid, X_train, y_train, X_test, y_test)



Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

### TRY DOING THE ABOVE IN A GOOGLE NOTEBOOK - IT MIGHT BE FASTER WITH MULTI PROCESSING.

### This is already saved to a pickle file. So let's do the same to the other models.

# Multinomial Naive Bayes

In [35]:
tokenizer = nltk.casual.TweetTokenizer(strip_handles=True, 
                                       preserve_case=False,
                                       reduce_len=True) 

tdidf_vect = TfidfVectorizer(ngram_range=(1, 2),
                             max_features=5000,
                             tokenizer=tokenizer.tokenize)

mnb_classifier = MultinomialNB()

sentiment_mnb_pipeline = Pipeline([
    ('vectorizer', tdidf_vect),
    ('classifier', mnb_classifier)
])

In [36]:
sentiment_mnb_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_mnb_pipeline, X_train, y_train, X_test, y_test)


null accuracy: 33.78%
accuracy score: 82.72%
model is 48.94% more accurate than null accuracy
---------------------------------------------------------------------------
Confusion Matrix

Predicted  negative  neutral  positive  __all__
Actual                                         
negative      80761     1635     21258   103654
neutral        1078    98259      1161   100498
positive      25452     2438     74784   102674
__all__      107291   102332     97203   306826
---------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

    negative       0.75      0.78      0.77    103654
     neutral       0.96      0.98      0.97    100498
    positive       0.77      0.73      0.75    102674

   micro avg       0.83      0.83      0.83    306826
   macro avg       0.83      0.83      0.83    306826
weighted avg       0.83      0.83      0.83    306826



In [37]:
# pickle the pipeline
joblib.dump(sentiment_mnb_pipeline, '../pickle_files/vectorizer_and_mnb.pkl')

['../pickle_files/vectorizer_and_mnb.pkl']

# Linear SGD Classification Model

In [38]:
from sklearn.linear_model import SGDClassifier

tokenizer = nltk.casual.TweetTokenizer(strip_handles=True, 
                                       preserve_case=False,
                                       reduce_len=True) 

tdidf_vect = TfidfVectorizer(ngram_range=(1, 2),
                             max_features=5000,
                             tokenizer=tokenizer.tokenize)

sgd_model = SGDClassifier(max_iter=1000)

sentiment_sgd_pipeline = Pipeline([
    ('vectorizer', tdidf_vect),
    ('classifier', sgd_model)
])

In [39]:
sentiment_sgd_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_sgd_pipeline, X_train, y_train, X_test, y_test)




null accuracy: 33.78%
accuracy score: 83.32%
model is 49.53% more accurate than null accuracy
---------------------------------------------------------------------------
Confusion Matrix

Predicted  negative  neutral  positive  __all__
Actual                                         
negative      80007     2257     21390   103654
neutral         677    99076       745   100498
positive      23041     3079     76554   102674
__all__      103725   104412     98689   306826
---------------------------------------------------------------------------
Classification Report

              precision    recall  f1-score   support

    negative       0.77      0.77      0.77    103654
     neutral       0.95      0.99      0.97    100498
    positive       0.78      0.75      0.76    102674

   micro avg       0.83      0.83      0.83    306826
   macro avg       0.83      0.83      0.83    306826
weighted avg       0.83      0.83      0.83    306826



In [40]:
# pickle the pipeline
joblib.dump(sentiment_sgd_pipeline, '../pickle_files/vectorizer_and_sgd.pkl')

['../pickle_files/vectorizer_and_sgd.pkl']

# Apply to the real world notebook.