In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import time
import glob
import datetime
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score



In [2]:
# Load the @NotifyBoston tweet export (March 2014 - March 2015) from the parent directory.
data = pd.read_csv(
    '../Tweets_sent_from_any_user_tagging_the__NotifyBoston_account_in_the_tweet_from_March_2014_-_March_2015.csv'
)

In [3]:
# Preview the first five rows of the raw export.
data.head(n=5)

Unnamed: 0,Account Mentioned,User Name,Name,Update,Link,Location,Time,City,State/region,Country,Metro,Latitude,Longitude
0,@notifyboston,NotifyBoston,City of Boston,Thanks! RT @WMcAuliffe13: Hydrants have sussec...,http://twitter.com/NotifyBoston/statuses/56015...,"Boston, MA",01/27/2015 11:04:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155
1,@notifyboston,NotifyBoston,City of Boston,Looking good! Thank you. RT @courtwilliams92: ...,http://twitter.com/NotifyBoston/statuses/56015...,"Boston, MA",01/27/2015 11:19:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155
2,@notifyboston,Osmanbiyik,Osman biyik,I'm at City of Boston - @notifyboston w/ @dogu...,http://twitter.com/Osmanbiyik/statuses/4396788...,,03/01/2014 12:29:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155
3,@notifyboston,JuliaLamar_,Julia Lamar,My view of #Boston last night from dinner at t...,http://twitter.com/JuliaLamar_/statuses/439762...,Pittsburgh PA - Providence RI,03/01/2014 06:01:00 AM,,,,,,
4,@notifyboston,designmuseumbos,Design Museum Boston,Do you want to see your poster around the City...,http://twitter.com/designmuseumbos/statuses/43...,"Boston, MA",03/01/2014 08:05:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155


In [4]:
# `Time` is formatted as MM/DD/YYYY hh:mm:ss AM/PM (see the preview above),
# so the month sits in characters 0-1 and the year in characters 6-9.
time_text = data['Time'].str
data['year'] = time_text.slice(6, 10).astype(str)
data['month'] = time_text.slice(0, 2).astype(str)

In [7]:
# Keep only the tweets whose extracted year string is '2015'.
is_2015 = data['year'] == '2015'
year_2015 = data.loc[is_2015]

In [8]:
# Preview the first five 2015 rows, including the new year/month columns.
year_2015.head(n=5)

Unnamed: 0,Account Mentioned,User Name,Name,Update,Link,Location,Time,City,State/region,Country,Metro,Latitude,Longitude,year,month
0,@notifyboston,NotifyBoston,City of Boston,Thanks! RT @WMcAuliffe13: Hydrants have sussec...,http://twitter.com/NotifyBoston/statuses/56015...,"Boston, MA",01/27/2015 11:04:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155,2015,1
1,@notifyboston,NotifyBoston,City of Boston,Looking good! Thank you. RT @courtwilliams92: ...,http://twitter.com/NotifyBoston/statuses/56015...,"Boston, MA",01/27/2015 11:19:00 AM,Boston,MA,UNITED STATES,Boston-Cambridge-Quincy MA-NH,42.3733,-71.0155,2015,1
706,@notifyboston,drblalock,Hawkeye,@twright55 @NYC @NotifyBoston You too!,http://twitter.com/drblalock/statuses/55977243...,Eastern Virginia,01/26/2015 09:58:00 AM,,VA,UNITED STATES,,,,2015,1
871,@notifyboston,muchsmallfails,mw,@NotifyBoston @marty_walsh thank you DPW & tha...,http://twitter.com/muchsmallfails/statuses/561...,,01/30/2015 08:37:00 AM,,,,,,,2015,1
1015,@notifyboston,allcorgis,Allyson Baughman,@marty_walsh @NotifyBoston Thank you!,http://twitter.com/allcorgis/statuses/56703970...,,02/15/2015 11:16:00 AM,,,,,,,2015,2


In [9]:
# Which months are present in the 2015 subset?
year_2015['month'].unique()

array(['01', '02', '03'], dtype=object)

In [10]:
# Restrict the 2015 subset to January and February.
in_jan_or_feb = year_2015['month'].isin(['01', '02'])
year_2015 = year_2015[in_jan_or_feb]

## Random Forest: What differentiates February 2015 tweets from January 2015 tweets? (Bigrams!)

In [11]:
# Binary target: True for February tweets, False otherwise.
# `assign` builds a new frame instead of writing a column into a filtered
# slice of `data`, which avoids pandas' SettingWithCopyWarning. Exact
# equality replaces the substring match `str.contains('02')`, since `month`
# holds two-character month strings.
year_2015 = year_2015.assign(FEB=year_2015['month'] == '02')

In [13]:
# Tweet text for the filtered rows; missing tweets become empty strings so
# the vectorizer only ever receives str inputs.
texts = year_2015['Update'].fillna('')

# Count bigrams only (ngram_range is a (min_n, max_n) tuple), drop English
# stop words, and cap the vocabulary at 1000 features.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 2),
                             stop_words='english',
                             binary=False)

# Fixed seed so the CV scores and feature importances are reproducible
# across kernel restarts.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Use `fit` to learn the n-gram vocabulary of the tweet text
vectorizer.fit(texts)

# Use `transform` to generate the sample x n-gram count matrix -
# one column per feature. `toarray()` returns a plain ndarray; `todense()`
# returns an np.matrix, which recent scikit-learn versions reject.
X = vectorizer.transform(texts).toarray()
y = year_2015['FEB']

# Cross-validated ROC AUC for separating February from January tweets.
scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.86959564  0.94081793  0.66511658], Average AUC 0.825176716277


In [14]:
# Refit on the full feature matrix (cross_val_score only fits clones), then
# pair each vocabulary entry with the forest's importance score for it.
model.fit(X, y)
feature_importances = pd.DataFrame({'Features': vectorizer.get_feature_names(),
                                    'Importance Score': model.feature_importances_})
# Show the 50 most important features. print() is called as a function so
# the cell runs on Python 3 as well as Python 2, matching the other cells.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

                      Features  Importance Score
797                  snow home          0.030735
202           cityofboston gov          0.025200
348                   gov snow          0.023675
895                tomorrow 29          0.016412
21                   29 bosnow          0.015397
359                   home asp          0.014437
385                  http park          0.013462
59             911 emergencies          0.013323
799                  snow http          0.012733
743            rt notifyboston          0.012625
379          http cityofboston          0.012391
391                   http www          0.012160
645                park boston          0.009691
556        notifyboston boston          0.007399
663                pay parking          0.006980
603          notifyboston snow          0.006979
807               snow related          0.006635
138                bosnow http          0.006290
314             expected later          0.006220
808               sn

## How about trigrams?

In [15]:
# Same setup as the bigram run, but counting trigrams only
# (ngram_range is a (min_n, max_n) tuple).
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(3, 3),
                             stop_words='english',
                             binary=False)

# Use `fit` to learn the n-gram vocabulary of the tweet text
vectorizer.fit(texts)

# Use `transform` to generate the sample x n-gram count matrix -
# one column per feature. `toarray()` returns a plain ndarray; `todense()`
# returns an np.matrix, which recent scikit-learn versions reject.
X = vectorizer.transform(texts).toarray()
y = year_2015['FEB']

# Cross-validated ROC AUC using the existing `model` estimator on trigram features.
scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.85330684  0.91212291  0.64004568], Average AUC 0.801825142804


In [16]:
# Refit on the full feature matrix (cross_val_score only fits clones), then
# pair each vocabulary entry with the forest's importance score for it.
model.fit(X, y)
feature_importances = pd.DataFrame({'Features': vectorizer.get_feature_names(),
                                    'Importance Score': model.feature_importances_})
# Show the 50 most important features. print() is called as a function so
# the cell runs on Python 3 as well as Python 2, matching the other cells.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

                            Features  Importance Score
313                    gov snow home          0.038456
815                    snow home asp          0.029605
186            cityofboston gov snow          0.025428
353                 http park boston          0.019254
348            http cityofboston gov          0.018239
314                    gov snow http          0.017595
19                   29 bosnow alert          0.017436
993             www cityofboston gov          0.017283
614                  park boston gov          0.013670
195               closed tomorrow 29          0.012663
713           rt notifyboston boston          0.011995
329         homeless 911 emergencies          0.011100
716             rt notifyboston city          0.010939
362            http www cityofboston          0.010705
893               tomorrow 29 bosnow          0.009490
818                    snow http www          0.008049
755         rt notifyboston tomorrow          0.008036
945       