In [2]:
import json
import re
import pandas as pd
import numpy as np
import random
import nltk
import codecs
import io
import pickle
import joblib

from pandas_confusion import ConfusionMatrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Extracting features from text, define target y and data X
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics


# Read in the pickle files

In [3]:
#Loading the saved MNB, LOGREG, SGD models with joblib
classifier_f1 = open("../pickle_files/vectorizer_and_mnb.pkl", "rb")
mnb_pipeline = joblib.load(classifier_f1)
classifier_f1.close()

classifier_f2 = open("../pickle_files/vectorizer_and_logreg.pkl", "rb")
log_pipeline = joblib.load(classifier_f2)
classifier_f2.close()

classifier_f3 = open("../pickle_files/vectorizer_and_sgd.pkl", "rb")
sgd_pipeline = joblib.load(classifier_f3)
classifier_f3.close()

# Build a Vote Classifier. We'll want to apply the 3 models to the input text, and if they all agree that the sentiment is positive or negative, we'll take the next step, which is maybe measuring _how_ negative/positive the text is. Then, based on the NLTK sentiment score, we'll determine what our reply should be.


From the NLTK notes:

Now that we have many classifiers, what if we created a new classifier, which combined the votes of all of the classifiers, and then classified the text whatever the majority vote was?

Turns out, doing this is super easy. NLTK has considered this in advance, allowing us to inherit from their ClassifierI class from nltk.classify, which will give us the attributes of a classifier, yet allow us to write our own custom classifier code.

# Helper functions 

(Consider saving these to another file)

In [4]:
def genericize_mentions(text):
    return re.sub(r'@[\w_-]+', 'thisisanatmention', text)

def get_tweet_length(text):
    return len(text)

def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

def reshape_a_feature_column(series):
    return np.reshape(np.asarray(series), (len(series), 1))

def pipelinize_feature(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            processed = reshape_a_feature_column(processed)
            return processed
#         This is incredibly stupid and hacky, but we need it to do a grid search.
#         If a feature is deactivated, we're going to just return a column of zeroes.
#         Zeroes shouldn't affect the regression, but other values may.
#         If you really want brownie points, consider pulling out that feature column later in the pipeline.
        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))

    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

def display_null_accuracy(y_test):
    value_counts = pd.value_counts(y_test)
    null_accuracy = max(value_counts) / float(len(y_test))
    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))
    return null_accuracy

def display_accuracy_score(y_test, y_pred_class):
    score = accuracy_score(y_test, y_pred_class)
    print('accuracy score: %s' % '{:.2%}'.format(score))
    return score

def display_accuracy_difference(y_test, y_pred_class):
    null_accuracy = display_null_accuracy(y_test)
    accuracy_score = display_accuracy_score(y_test, y_pred_class)
    difference = accuracy_score - null_accuracy
    if difference > 0:
        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))
    elif difference < 0:
        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))
    elif difference == 0:
        print('model is exactly as accurate as null accuracy')
    return null_accuracy, accuracy_score

def train_test_and_evaluate(pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    y_pred_class = pipeline.predict(X_test)
    confusion_matrix = ConfusionMatrix(list(y_test), list(y_pred_class))
    display_accuracy_difference(y_test, y_pred_class)
    print('-' * 75 + '\nConfusion Matrix\n')
    print(confusion_matrix)
    print('-' * 75 + '\nClassification Report\n')
    print(metrics.classification_report(y_test, y_pred_class))
      
    return pipeline, confusion_matrix

# Read in unseen data

In [5]:
# this testing set it's gonna work. But why don't we try a testing set
# that we know will work? Let's use an evaluation dataset.
from nltk.tokenize import TweetTokenizer
tokenizer = nltk.casual.TweetTokenizer(strip_handles=True, 
                                       preserve_case=False,
                                       reduce_len=True) 

def remove_punctuations(row):
    return re.sub(r'[^\w\s]','',row)

tweets = []
for line in open('../data/NYCTtweets_feb_forward.json', 'r'):
    tweets.append(line)
    
print("The number of tweets is {}".format(len(tweets)))

def transform_json_to_df(tweets):
    dt = []
    tweet_id = []
    tweet_text = []
    screen_name = []
    number_of_followers = []
    in_reply_to = []

    for counter, tweet in enumerate(tweets):
        # print(counter)
        if counter % 2 == 0:
            if json.loads(tweet).get('delete') == None:
                dt.append(json.loads(tweet).get('created_at'))
                tweet_id.append(json.loads(tweet).get('id_str'))
                tweet_text.append(json.loads(tweet).get('text'))
                screen_name.append(json.loads(tweet)['user'].get('screen_name'))
                in_reply_to.append(json.loads(tweet).get('in_reply_to_screen_name'))
        else:
            continue

    df = pd.DataFrame({'dt': dt,
                      'tweet_id': tweet_id,
                      'tweet_text': tweet_text,
                      'screen_name': screen_name,
                      'in_reply_to': in_reply_to})
    
    return df

stop_words = set(stopwords.words("english"))

eval_df = transform_json_to_df(tweets)
# convert the tweet_text from unicode to str
eval_df['eval_text'] = eval_df.tweet_text.apply(lambda x: [w for w in tokenizer.tokenize(x) if not w in stop_words])
# remove the rt's
eval_df['eval_text'] = eval_df.eval_text.apply(lambda x: [e for e in x if e not in '\n'.join(['rt'])])
eval_df['predict_this'] = eval_df.eval_text.apply(lambda x: str(' '.join(x)))
print(eval_df.shape)
eval_df.head()

The number of tweets is 158382
(78794, 7)


Unnamed: 0,dt,tweet_id,tweet_text,screen_name,in_reply_to,eval_text,predict_this
0,Thu Feb 21 04:14:11 +0000 2019,1098435691292311552,@NYCTSubway @MTA @MTA my train just went out o...,johnny__milani,johnny__milani,"[train, went, service, 168, ..., going, !, !, !]",train went service 168 ... going ! ! !
1,Thu Feb 21 04:16:00 +0000 2019,1098436147292688384,"RT @NYCTSubway: @ajrossnyc Hello, AJ. Which st...",DjShocker08,,"[:, hello, ,, aj, ., station, trying, head, ?, ^]",": hello , aj . station trying head ? ^"
2,Thu Feb 21 04:17:21 +0000 2019,1098436487245225984,RT @NYCTSubway: @NYCSubwayRider Hello. The upt...,DjShocker08,,"[:, hello, ., uptown, e, trains, arriving, f, ...",: hello . uptown e trains arriving f train pla...
3,Thu Feb 21 04:18:40 +0000 2019,1098436820277346304,"@TitaBonita5 Hello, Tita. Currently, due to th...",NYCTSubway,TitaBonita5,"[hello, ,, tita, ., currently, ,, due, planned...","hello , tita . currently , due planned work ef..."
4,Thu Feb 21 04:20:19 +0000 2019,1098437234586472449,@bonnalitaa Please keep in mind there is plann...,NYCTSubway,bonnalitaa,"[please, keep, mind, planned, work, going, hou...",please keep mind planned work going hours day ...


# Logistic Regression

In [6]:
predict_df = eval_df.loc[(eval_df.in_reply_to=='NYCTSubway')][['tweet_text', 'predict_this']]\
                  .reset_index(drop=True)

log_predictions = []

predict_df.apply(lambda x: log_predictions.append(log_pipeline.predict(x)))
# log_predictions = log_pipeline.predict(log_predictions)

print(len(log_predictions[0]))
predict_df['log_predictions'] = log_predictions[0]
predict_df.head()

30219


Unnamed: 0,tweet_text,predict_this,log_predictions
0,@NYCTSubway @nothisisthegame *investigation st...,* investigation starts * “ wow smells like pis...,negative
1,@NYCTSubway I need a local service!! Not expre...,need local service ! ! express train,negative
2,@NYCTSubway There were no announcements or Twi...,announcements twitter posts issue . random tra...,negative
3,@NYCTSubway It was in the 10am hour- that enou...,10am hour - enough time ?,positive
4,@NYCTSubway Almost all of the platforms in the...,almost platforms bronx 2/5 line,neutral


In [7]:
predict_df.head()

Unnamed: 0,tweet_text,predict_this,log_predictions
0,@NYCTSubway @nothisisthegame *investigation st...,* investigation starts * “ wow smells like pis...,negative
1,@NYCTSubway I need a local service!! Not expre...,need local service ! ! express train,negative
2,@NYCTSubway There were no announcements or Twi...,announcements twitter posts issue . random tra...,negative
3,@NYCTSubway It was in the 10am hour- that enou...,10am hour - enough time ?,positive
4,@NYCTSubway Almost all of the platforms in the...,almost platforms bronx 2/5 line,neutral


In [30]:
# predict_df.predict_this[0]
log_pipeline.predict([predict_df.predict_this[0]])[0]
# predict_df.iloc[1].apply(lambda x: print(x))

'negative'

# Multinomial Naive Bayes

In [149]:
mnb_predictions = []

predict_df.apply(lambda x: mnb_predictions.append(mnb_pipeline.predict(x)))
# log_predictions = log_pipeline.predict(log_predictions)

print(len(mnb_predictions[0]))
predict_df['mnb_predictions'] = mnb_predictions[0]
predict_df.head()

30219


Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions
0,@NYCTSubway @nothisisthegame *investigation st...,* investigation starts * “ wow smells like pis...,negative,positive
1,@NYCTSubway I need a local service!! Not expre...,need local service ! ! express train,negative,negative
2,@NYCTSubway There were no announcements or Twi...,announcements twitter posts issue . random tra...,negative,negative
3,@NYCTSubway It was in the 10am hour- that enou...,10am hour - enough time ?,positive,negative
4,@NYCTSubway Almost all of the platforms in the...,almost platforms bronx 2/5 line,neutral,neutral


# SGD Model

In [150]:
sgd_predictions = []

predict_df.apply(lambda x: sgd_predictions.append(sgd_pipeline.predict(x)))
# log_predictions = log_pipeline.predict(log_predictions)

print(len(sgd_predictions[0]))
predict_df['sgd_predictions'] = sgd_predictions[0]
predict_df.head()

30219


Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions
0,@NYCTSubway @nothisisthegame *investigation st...,* investigation starts * “ wow smells like pis...,negative,positive,positive
1,@NYCTSubway I need a local service!! Not expre...,need local service ! ! express train,negative,negative,negative
2,@NYCTSubway There were no announcements or Twi...,announcements twitter posts issue . random tra...,negative,negative,negative
3,@NYCTSubway It was in the 10am hour- that enou...,10am hour - enough time ?,positive,negative,positive
4,@NYCTSubway Almost all of the platforms in the...,almost platforms bronx 2/5 line,neutral,neutral,neutral


# Now apply the vote classifier. If they all have the same prediction, we will apply the NLTK Vader score to see just how positive/negative they are.

In [151]:
# # Take the models from above
# from nltk.classify import ClassifierI
# from statistics import mode

# class VoteClassifier(ClassifierI):
#     def __init__(self, *classifiers):
#         self._classifiers = classifiers
        
#     def classify(self, features):
#         votes = []
#         for c in self._classifiers:
#             v = c.classify(features)
#             votes.append(v)
#         return mode(votes)
    
#     def confidence(self, features):
#         votes = []
#         for c in self._classifiers:
#             v = c.classify(features)
#             votes.append(v)
            
#         choice_votes = votes.count(mode(votes))
#         conf = float(choice_votes) / len(votes)
#         return conf

In [152]:
def do_they_match(row):
    if row['mnb_predictions'] == row['log_predictions'] == row['sgd_predictions']:
        if row['mnb_predictions'] == 'negative':
            return 'all_negative'
        if row['mnb_predictions'] == 'positive':
            return 'all_positive'
    else:
        return 'no'

predict_df['do_they_agree'] = predict_df.apply(do_they_match, axis=1)
predict_df.head()

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree
0,@NYCTSubway @nothisisthegame *investigation st...,* investigation starts * “ wow smells like pis...,negative,positive,positive,no
1,@NYCTSubway I need a local service!! Not expre...,need local service ! ! express train,negative,negative,negative,all_negative
2,@NYCTSubway There were no announcements or Twi...,announcements twitter posts issue . random tra...,negative,negative,negative,all_negative
3,@NYCTSubway It was in the 10am hour- that enou...,10am hour - enough time ?,positive,negative,positive,no
4,@NYCTSubway Almost all of the platforms in the...,almost platforms bronx 2/5 line,neutral,neutral,neutral,


In [153]:
# ok, we have a column that labels if they agree. Let's see which ones these are:
predict_df.loc[predict_df.do_they_agree=='all_negative'].values

array([['@NYCTSubway I need a local service!! Not express train',
        'need local service ! ! express train', 'negative', 'negative',
        'negative', 'all_negative'],
       ['@NYCTSubway There were no announcements or Twitter posts about this issue. So there was just a random train hanging… https://t.co/A71Tr2SRKU',
        'announcements twitter posts issue . random train hanging … https://t.co/A71Tr2SRKU',
        'negative', 'negative', 'negative', 'all_negative'],
       ['@NYCTSubway why does the employees take over the seats on the train when it should seat three people not two  ?   T… https://t.co/TnnLrSXPIQ',
        'employees take seats train seat three people two ? … https://t.co/TnnLrSXPIQ',
        'negative', 'negative', 'negative', 'all_negative'],
       ...,
       ['@NYCTSubway Unfortunately I didn’t think to grab the car number. Instead, I sat crying on the 145th st platform wai… https://t.co/PdQkrWA8Ml',
        'unfortunately ’ think grab car number . inst

In [154]:
# lets apply the NLTK vader to get a sentiment score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

sid.polarity_scores(pred_col.predict_this[30169])

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [155]:
# create  columns that contain compound, neg, neu, and pos kv's in the master dataframe
# then, let's plot a histogram to see the distrubution of scores across each
# negative and positive sentment that our naive bayes model has predicted.

predict_df['compound_score'] = predict_df.predict_this.apply(lambda x: sid.polarity_scores(x)['compound'])
predict_df['neg_score'] = predict_df.predict_this.apply(lambda x: sid.polarity_scores(x)['neg'])
predict_df['neu_score'] = predict_df.predict_this.apply(lambda x: sid.polarity_scores(x)['neu'])
predict_df['pos_score'] = predict_df.predict_this.apply(lambda x: sid.polarity_scores(x)['pos'])

predict_df.sample(10)

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree,compound_score,neg_score,neu_score,pos_score
9965,@NYCTSubway There always seems to be a brakes ...,always seems brakes emergency issue 36 st . mo...,negative,positive,positive,no,-0.3818,0.178,0.822,0.0
8791,@NYCTSubway And R and F and M and E and the wh...,f e whole thing,neutral,negative,neutral,no,0.0,0.0,1.0,0.0
28122,@NYCTSubway Yes I am😎,yes 😎,positive,positive,positive,all_positive,0.4019,0.0,0.0,1.0
18582,@NYCTSubway you're absolute trash and I can't ...,absolute trash can't wait til run town pitchforks,positive,positive,positive,all_positive,0.0,0.0,1.0,0.0
11717,@NYCTSubway RIP to that person. \n\nReally wis...,rip person . really wish fixing l old shutdown...,negative,negative,negative,all_negative,0.4576,0.0,0.7,0.3
26352,@NYCTSubway Are northbound trains running on l...,northbound trains running local express tracks...,neutral,negative,neutral,no,-0.0516,0.107,0.893,0.0
17901,@NYCTSubway Have you been able to fix and adju...,able fix adjust q morning commute trains ? ’ r...,neutral,negative,negative,no,0.0,0.0,1.0,0.0
20506,@NYCTSubway One arrived.\nBut the scheduling n...,one arrived . scheduling needs updated . 20 + ...,positive,negative,negative,no,0.0,0.0,1.0,0.0
29916,@NYCTSubway #NYCTSubway. Let us KNOW if Q trai...,#nyctsubway . let us know q train line cool co...,positive,positive,positive,all_positive,0.6808,0.0,0.663,0.337
28793,@NYCTSubway Any person anywhere: Rush hour fee...,person anywhere : rush hour feels like time wo...,negative,negative,negative,all_negative,0.4215,0.0,0.743,0.257


In [156]:
# let's check the scores for the all positive and all negative
predict_df.loc[predict_df.do_they_agree=='all_positive'].sample(10)

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree,compound_score,neg_score,neu_score,pos_score
27863,@NYCTSubway Yea yea yea thanks,yea yea yea thanks,positive,positive,positive,all_positive,0.4404,0.0,0.508,0.492
6096,"@NYCTSubway Thank you. But, what an embarrassm...","thank . , embarrassment . good lord . better w...",positive,positive,positive,all_positive,0.6597,0.159,0.385,0.456
27794,@NYCTSubway MyMTA. Thanks for looking in to it!,mymta . thanks looking !,positive,positive,positive,all_positive,0.4926,0.0,0.385,0.615
26868,@NYCTSubway Just in time for rush hour! Thanks...,time rush hour ! thanks guys !,positive,positive,positive,all_positive,0.5399,0.0,0.534,0.466
4433,@NYCTSubway We should strike against this nons...,strike nonsense .,positive,positive,positive,all_positive,-0.4939,1.0,0.0,0.0
20224,@NYCTSubway Hey. Thanks for the consideration,hey . thanks consideration,positive,positive,positive,all_positive,0.4404,0.0,0.408,0.592
24043,@NYCTSubway Good Afternoon. And I was at 14th ...,good afternoon . 14th street took transferred ...,positive,positive,positive,all_positive,0.7964,0.0,0.382,0.618
13919,@NYCTSubway Turning the AC off while we sit he...,turning ac sit nice touch .,positive,positive,positive,all_positive,0.4215,0.0,0.588,0.412
22662,@NYCTSubway Yeah well at least it’s a brand sp...,yeah well least ’ brand spanking new train . 😎,positive,positive,positive,all_positive,0.5106,0.0,0.538,0.462
9678,@NYCTSubway Yes I caught the 955pm train,yes caught 955pm train,positive,positive,positive,all_positive,0.4019,0.0,0.526,0.474


In [157]:
# let's check the scores for the all positive and all negative
predict_df.loc[predict_df.do_they_agree=='all_negative'].sample(10)

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree,compound_score,neg_score,neu_score,pos_score
24038,"@NYCTSubway #2045 on 6 line , no ac!","#2045 6 line , ac !",negative,negative,negative,all_negative,0.0,0.0,1.0,0.0
5814,@NYCTSubway It is now Monday and the countdown...,monday countdown clock lorimer st still fixed ...,negative,negative,negative,all_negative,0.0,0.0,1.0,0.0
9194,@NYCTSubway If I miss my R train cause of your...,miss train cause borderline retarded dispatche...,negative,negative,negative,all_negative,-0.8658,0.548,0.452,0.0
25982,@NYCTSubway whats this crap posted all over R ...,whats crap posted train ( 5725 ) https://t.co/...,negative,negative,negative,all_negative,-0.3818,0.342,0.658,0.0
224,@NYCTSubway This account is always saying the ...,account always saying wrong thing,negative,negative,negative,all_negative,-0.4767,0.437,0.563,0.0
13757,"@NYCTSubway Bullshit, so there’s hundreds of p...","bullshit , ’ hundreds people get fucked miss s...",negative,negative,negative,all_negative,-0.9001,0.706,0.294,0.0
28328,@NYCTSubway 47-50 Rockefeller Center. I've bee...,47-50 rockefeller center . i've waiting 5 minu...,negative,negative,negative,all_negative,0.0,0.0,1.0,0.0
20419,@NYCTSubway what’s going on with the n/b 1 tra...,’ going n / b 1 trains ? ? ? waiting 50th stre...,negative,negative,negative,all_negative,-0.5632,0.267,0.733,0.0
8765,@NYCTSubway July - seriously!? I mean people I...,july - seriously ! ? mean people . city need j...,negative,negative,negative,all_negative,-0.2481,0.166,0.834,0.0
13108,@NYCTSubway what's wrong with Astoria-Ditmars ...,what's wrong astoria-ditmars n train ? waiting...,negative,negative,negative,all_negative,-0.4767,0.279,0.721,0.0


In [158]:
predict_df.loc[(predict_df.compound_score>=0.75)&(predict_df.do_they_agree=='all_negative')].sample(10)

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree,compound_score,neg_score,neu_score,pos_score
16212,@NYCTSubway not really sure why the website sa...,really sure website says 7 trains running good...,negative,negative,negative,all_negative,0.8016,0.0,0.423,0.577
24207,@NYCTSubway Hope I don’t miss the transfer at ...,hope ’ miss transfer church since ’ running 10...,negative,negative,negative,all_negative,0.7506,0.091,0.511,0.398
23732,@NYCTSubway Please make the early Q morning co...,please make early q morning commute cars comfo...,negative,negative,negative,all_negative,0.7579,0.0,0.545,0.455
9681,@NYCTSubway All clear. Please keep an eye out ...,clear . please keep eye coordinate precinct . ...,negative,negative,negative,all_negative,0.7717,0.0,0.476,0.524
16226,@NYCTSubway Seems like your “overnight planned...,seems like “ overnight planned work ” seeped o...,negative,negative,negative,all_negative,0.7717,0.0,0.539,0.461
26245,@NYCTSubway @JimmyVanBramer I mean okay but li...,mean okay like someday someone . that's really...,negative,negative,negative,all_negative,0.7645,0.0,0.441,0.559
25510,@NYCTSubway @KFILE Amazing that the 'left' doe...,amazing ' left ' understand ' right ' live lik...,negative,negative,negative,all_negative,0.765,0.0,0.397,0.603
18093,@NYCTSubway Great F'ing job! No announcements...,"great f'ing job ! announcements , one platform...",negative,negative,negative,all_negative,0.8016,0.0,0.556,0.444
7376,@NYCTSubway Amazing how you guys keep wanting ...,amazing guys keep wanting raise fare still can...,negative,negative,negative,all_negative,0.7581,0.0,0.606,0.394
27819,@NYCTSubway why not permanently extend the M t...,permanently extend train 96th ? seems pretty p...,negative,negative,negative,all_negative,0.7717,0.0,0.342,0.658


In [200]:
predict_df.loc[(predict_df.compound_score<=-0.75)&(predict_df.do_they_agree=='all_negative')].sample(10)

Unnamed: 0,tweet_text,predict_this,log_predictions,mnb_predictions,sgd_predictions,do_they_agree,compound_score,neg_score,neu_score,pos_score
1077,@NYCTSubway You should fix these shit on the w...,fix shit weekend . u ’ fuck ur clients .,negative,negative,negative,all_negative,-0.7964,0.64,0.36,0.0
26580,@NYCTSubway You offer us such crappy service o...,"offer us crappy service , outdated trains , wo...",negative,negative,negative,all_negative,-0.8271,0.435,0.565,0.0
3279,@NYCTSubway wtf is up with this C train tho 😭 ...,wtf c train tho 😭 moving damn slow,negative,negative,negative,all_negative,-0.7579,0.619,0.381,0.0
26974,@NYCTSubway Stop pawning 7 customers off on th...,stop pawning 7 customers fucking lirr assholes...,negative,negative,negative,all_negative,-0.9052,0.622,0.28,0.098
28175,@NYCTSubway you stupid jackasses are making me...,stupid jackasses making late jury duty,negative,negative,negative,all_negative,-0.802,0.643,0.357,0.0
17865,@NYCTSubway Any update on person hit by a t...,update person hit train ? injured / dead ?,negative,negative,negative,all_negative,-0.8105,0.648,0.352,0.0
4748,@NYCTSubway why did a uptown 2 train just bypa...,uptown 2 train bypass wall street stop passeng...,negative,negative,negative,all_negative,-0.8453,0.537,0.463,0.0
30048,@NYCTSubway Q 8971 awful air conditioning. War...,q 8971 awful air conditioning . warm uncomfort...,negative,negative,negative,all_negative,-0.829,0.448,0.464,0.088
21443,@NYCTSubway @TheRealBSKay This is incredibly d...,"incredibly dangerous , what's wrong people ?",negative,negative,negative,all_negative,-0.7755,0.693,0.307,0.0
27282,@NYCTSubway Unbelievable! I missed 2 whole G t...,unbelievable ! missed 2 whole g train connecti...,negative,negative,negative,all_negative,-0.7825,0.526,0.364,0.109


In [205]:
predict_df.tweet_text[17865]

'@NYCTSubway    Any update on person hit by a train? Injured/dead?'

# So here's what I'm thinking...

It's clear that if the models agree that there are times when a tweet is postive or negative, but the NLTK VADER score doesn't actually score it as too pos/neg.

I think if we've determined is a response is positive/negative and exceeds a certain threshold (under -.75 to over .5 ... maybe we want to be more liberal with the compliments rather than the insults), then we will decide to send a reply. Either thank the user for their kindness, or tell them to stop being mean.

Let's apply the model to the entire evaluation dataset

In [207]:
print(predict_df.shape)
predict_df.to_csv('../data/predict_df_oct6.csv', index=False)

(30219, 10)
