In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

In [2]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

## IMBD

In [3]:
imdb_reviews =  pd.read_table(IMDB_DATA_FILE, names=COLUMN_NAMES)

In [6]:
imdb_reviews.head(10)

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1
5,"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.",0
6,Wasted two hours.,0
7,"Saw the movie today and thought it was a good effort, good messages for kids.",1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the science teacher.,1


In [8]:
imdb_reviews['Sentiment'].value_counts()

1    386
0    362
Name: Sentiment, dtype: int64

In [15]:
imdb_counts = pd.DataFrame(imdb_reviews['Sentiment'].value_counts())
imdb_counts.index = ['Postive','Negative']
imdb_counts

Unnamed: 0,Sentiment
Postive,386
Negative,362


## yelp

In [16]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)

yelp_reviews.head(10)

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.,0
8,The fries were great too.,1
9,A great touch.,1


In [18]:
yelp_counts = pd.DataFrame(yelp_reviews['Sentiment'].value_counts())
yelp_counts.index = ['Postive','Negative']
yelp_counts

Unnamed: 0,Sentiment
Postive,500
Negative,500


## Amazon 

In [20]:
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)

amazon_reviews.head(10)

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up right to get decent volume.,0
6,"If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.",0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [22]:
amazon_counts = pd.DataFrame(amazon_reviews['Sentiment'].value_counts())
amazon_counts.index = ['Positive','Negative']
amazon_counts

Unnamed: 0,Sentiment
Positive,500
Negative,500


## Exercise 65

In [23]:
#combining the three data frames
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [24]:
print(review_data.shape)
review_data.head()

(2748, 2)


Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1


In [25]:
review_data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [26]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [27]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [29]:
review_model_data.head()

Unnamed: 0,Review,Sentiment
0,so there is no way for me to plug it in here in the us unless i go by a converter,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more than 45 minutes major problems,0
4,the mic is great,1


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [31]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [33]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(steps=[('vect', TfidfVectorizer(lowercase=False)),
                ('clf', LogisticRegression(random_state=0))])

In [34]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 81%'

In [35]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])

In [38]:
log_tfidf.predict(['I loved this place better than before', 'I hated this place but love it sometimes','I used to love this place, but really hate it now'])

array([1, 1, 1])