## Applying Machine Learning To a Facebook Sentiment Analysis

### Obtaining the NH Brewery Facebook post data set

In [27]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt

#### Reading the CVS file

In [28]:
df = pd.read_csv('data.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,message,sentiment,interactionRate
0,Nantucket Dubbed ‘Best Island In The World’ By...,1,9.8888
1,We will be pouring at this fine event this Sat...,1,0.5203
2,We will be pouring at this fine event this Sat...,0,0.2081
3,We will be pouring at this fine event this Sat...,0,0.0
4,We're brewing 155 gallons of coffee for our co...,0,0.1635


The attributes have the following meaning:
* **message**: the Facebook post posted by a business in the NH beer industry.
* **sentiment**: looking at if a post is of high engagement or low engagement, 0 is low, 1 is high.
* **interactionRate**: uses the equation ((likes+comments+shares)/(# of posts))*(1/(# of fans))
* **pageName**: business name

#### Understanding at the data, looking for null values and value counts

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7737 entries, 0 to 7736
Data columns (total 3 columns):
message            6736 non-null object
sentiment          7737 non-null int64
interactionRate    7733 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 181.4+ KB


In [30]:
df.describe()

Unnamed: 0,sentiment,interactionRate
count,7737.0,7733.0
mean,0.327776,0.692752
std,0.469433,1.404854
min,0.0,0.0
25%,0.0,0.072
50%,0.0,0.2415
75%,1.0,0.702
max,1.0,40.4624


In [7]:
df["sentiment"].value_counts()

0    5201
1    2536
Name: sentiment, dtype: int64

### bag-of-words model

Transforming documents into feature vectors and assessing word relevancy via term frequency-inverse document frequency

In [8]:
vectorizer = CountVectorizer()
np.set_printoptions(precision=2)

In [9]:
vectorizer.fit_transform(df['message'].values.astype('U'))

<7737x14756 sparse matrix of type '<class 'numpy.int64'>'
	with 183162 stored elements in Compressed Sparse Row format>

In [10]:
tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)

### Processing documents into tokens by whitespace

In [11]:
porter = PorterStemmer()
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#### Removing stop words

In [12]:
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bridgetfranciscovich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Breaking the data into test and train data sets

In [13]:
X_train = df.iloc[:6000, 0].values
y_train = df.iloc[:6000, 1].values
X_test = df.iloc[6000:, 0].values
y_test = df.iloc[6000:, 1].values

### Training the model and creating a pipeline using Logistic Regression

In [12]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

In [13]:
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__stop_words': [stop, None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

In [14]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

In [15]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1)

#### Filling in the blank pieces training of data
Fixes ValueError: np.nan is an invalid document, expected byte or unicode string.

In [16]:
X_train, y_train = df.message.fillna(' '), df.sentiment

#### Fitting the model

In [17]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 10.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x111e047b8>, <function tokenizer_porter at 0x111e04730>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yoursel... "won't", 'wouldn', "wouldn't"], None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [18]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',

In [19]:
clf = gs_lr_tfidf.best_estimator_

#### Filling in the blank pieces training of data

In [20]:
X_train, y_train = df.message.fillna(' '), df.sentiment

### Testing the model's accuracy

In [1]:
#clf.predict(X_test)

In [None]:
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
example = ['Nantucket', 'We are donating to the troops for every sale we make!', 'Try our new Sassion beer with an ABV 3.4%', 'Our new IPA is hoppy and on tap!']
clf.predict(example)

### Training the model and creating a pipeline using Stochastic Gradient Descent

In [14]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

In [15]:
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__stop_words': [stop, None]},
              ]

In [16]:
lr_tfidf = Pipeline([('vect', tfidf),
                    ('sgd', SGDClassifier(random_state=0))]) #, loss='log'

In [17]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1)

#### Filling in the blank pieces training of data
Fixes ValueError: np.nan is an invalid document, expected byte or unicode string.

In [24]:
X_train, y_train = df.message.fillna(' '), df.sentiment

#### Fitting the model

In [19]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...lty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x10a641a60>, <function tokenizer_porter at 0x10a738378>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yoursel...', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"], None]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [20]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x10a641a60>} 
CV Accuracy: 0.708


In [21]:
clf = gs_lr_tfidf.best_estimator_

In [22]:
clf.predict(X_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.

#### Filling in the blank pieces testing of data

In [None]:
X_test, y_test = df.message.fillna(' '), df.sentiment

### Testing the model's accuracy

In [None]:
clf.predict(X_test)

In [None]:
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
example = ['Nantucket', 'We are donating to the troops for every sale we make!', 'Try our new Sassion beer with an ABV 3.4%', 'Our new IPA is hoppy and on tap!']
clf.predict(example)

## Pickle the model

In [None]:
list_pickle_path = 'fb_pickle.pkl'

In [None]:
pickle.dump(clf, open(list_pickle_path, 'wb'))