# Scoring Opinions and Sentiments

## Understanding How Machines Read

In [1]:
# Three toy sentences that share part of their vocabulary; they form the
# mini-corpus used by the vectorizer examples that follow.
text_1, text_2, text_3 = (
    'The quick brown fox jumps over the lazy dog.',
    'My dog is quick and can jump over fences.',
    'Your dog is so lazy that it sleeps all the day.',
)
corpus = [text_1, text_2, text_3]

In [2]:
from sklearn.feature_extraction import text
# binary=True records only presence/absence of each word (1/0), not its count.
vectorizer = text.CountVectorizer(binary=True)
# Learn the vocabulary and encode the corpus in a single pass.
vectorized_text = vectorizer.fit_transform(corpus)
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


In [3]:
# Show the fitted vocabulary: each word mapped to its column index in the count matrix.
print(vectorizer.vocabulary_)

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


## Processing and Enhancing Text

In [4]:
text_4 = 'A black dog just passed by but my dog is brown.'
# Append only once: an unconditional append would add the sentence again every
# time this cell is re-run, silently changing the counts computed below.
if text_4 not in corpus:
    corpus.append(text_4)
# binary is left at its default here, so repeated words are counted
# (the printed row contains a 2).
vectorizer = text.CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Only the row of the newly added sentence is printed.
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [5]:
# Re-weight the raw counts with TF-IDF; norm='l1' is requested so that the
# weights of one sentence can be summed and checked at the end of the cell.
TfidF = text.TfidfTransformer(norm='l1')
tfidf = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3
# Densify the matrix once and keep only the selected row; doing this inside
# the loop would rebuild the whole dense array for every vocabulary word.
phrase_weights = tfidf.toarray()[phrase]
total = 0
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_weights[pos]
    # Words that do not occur in the selected sentence have weight 0: skip them.
    if value != 0:
        print ("%10s: %0.3f" % (word, value))
        total += value
print ('\nSummed values of a phrase: %0.1f' % total)

     brown: 0.095
       dog: 0.126
        my: 0.095
        is: 0.077
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121

Summed values of a phrase: 1.0


In [6]:
# ngram_range=(2, 2) keeps only pairs of adjacent words (bigrams) as features.
bigrams = text.CountVectorizer(ngram_range=(2, 2))
bigrams.fit(corpus)
print (bigrams.vocabulary_)

{'the quick': 30, 'quick brown': 24, 'brown fox': 3, 'fox jumps': 9, 'jumps over': 15, 'over the': 21, 'the lazy': 29, 'lazy dog': 17, 'my dog': 19, 'dog is': 7, 'is quick': 11, 'quick and': 23, 'and can': 1, 'can jump': 6, 'jump over': 14, 'over fences': 20, 'your dog': 31, 'is so': 12, 'so lazy': 26, 'lazy that': 18, 'that it': 27, 'it sleeps': 13, 'sleeps all': 25, 'all the': 0, 'the day': 28, 'black dog': 2, 'dog just': 8, 'just passed': 16, 'passed by': 22, 'by but': 5, 'but my': 4, 'is brown': 10}


## Stemming and Removing Stop Words

In [7]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
# Download the 'punkt' NLTK package (the recorded output shows it is left
# untouched when already up to date); presumably it is the tokenizer data
# that word_tokenize relies on.
# NOTE(review): recent NLTK releases may additionally require the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
nltk.download('punkt')

# One shared stemmer instance, used by tokenize() further down in this cell.
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem(token)`` for each token, in order.

    ``tokens`` is any iterable of tokens; ``stemmer`` is any object that
    exposes a ``stem(token)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Tokenize ``text`` with ``word_tokenize`` and return the stemmed tokens.

    Stemming is delegated to ``stem_tokens`` using the module-level
    ``stemmer``. Inside this function the parameter ``text`` shadows the
    ``sklearn.feature_extraction.text`` module imported under the same name.
    """
    return stem_tokens(word_tokenize(text), stemmer)

# Fit on a single sentence, using the stemming tokenizer defined above and
# scikit-learn's built-in English stop-word list.
vocab = ['Sam loves swimming so he swims all the time']
vect = text.CountVectorizer(tokenizer=tokenize, 
                           stop_words='english')
vec = vect.fit(vocab)

# Encode an unseen sentence against the vocabulary learned from `vocab`.
sentence1 = vec.transform(['George loves swimming too!'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when present and fall back on older installations.
# .tolist() keeps the printed value a plain Python list, as before.
if hasattr(vec, 'get_feature_names_out'):
    stem_features = vec.get_feature_names_out().tolist()
else:
    stem_features = vec.get_feature_names()
print (stem_features)
print (sentence1.toarray())

[nltk_data] Downloading package punkt to /Users/Amigo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['love', 'sam', 'swim', 'time']
[[1 0 1 0]]


## Scraping Textual Datasets from the Web

In [8]:
from bs4 import BeautifulSoup
import pandas as pd
# Python 2/3 compatible import. Only ImportError is caught, so unrelated
# failures (e.g. KeyboardInterrupt) are no longer silently swallowed.
try:
    import urllib2 # Python 2.7.x
except ImportError:
    import urllib.request as urllib2 # Python 3.x

wiki = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
# The request is sent with an explicit browser-like User-Agent header.
header = {'User-Agent': 'Mozilla/5.0'} 
query = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(query)
# Parse the response, then always release the HTTP connection, even when
# parsing fails (try/finally also works with Python 2's urllib2 responses).
try:
    soup = BeautifulSoup(page, "lxml")
finally:
    page.close()

In [9]:
# Locate the table to scrape and collect selected cells of each data row.
table = soup.find("table", { "class" : "wikitable sortable" })
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('No table with class "wikitable sortable" found in the page')
final_table = list()
for row in table.find_all('tr'):
    cells = row.find_all("td")
    # cells[6] is read below, so a row needs at least seven <td> cells
    # (the previous ">= 6" guard raised IndexError on six-cell rows).
    # Rows without enough <td> cells are skipped.
    if len(cells) >= 7:
        v1 = cells[1].find(text=True)
        v2 = cells[2].find(text=True)
        v3 = cells[3].find(text=True)
        v4 = cells[4].find(text=True)
        # All text nodes of the cell are kept, so this column holds lists.
        v5 = cells[6].findAll(text=True)
        # NOTE(review): a disabled post-processing step used to live here:
        #v5 = v5[2].split()[0]
        # The recorded output shows an empty State column and list-valued land
        # areas, which suggests the indices no longer match the live page
        # layout — confirm against the current table.
        final_table.append([v1, v2, v3, v4, v5])
cols = ['City','State','Population_2014','Census_2010'
        ,'Land_Area_Mile2']
df = pd.DataFrame(final_table, columns=cols)

In [10]:
# Display the DataFrame built from the scraped table rows.
df

Unnamed: 0,City,State,Population_2014,Census_2010,Land_Area_Mile2
0,New York,,8622698,8175133,[301.5 sq mi ]
1,Los Angeles,,3999759,3792621,[468.7 sq mi ]
2,Chicago,,2716450,2695598,[227.3 sq mi ]
3,Houston,,2312717,2100263,[637.5 sq mi ]
4,Phoenix,,1626078,1445632,[517.6 sq mi ]
5,Philadelphia,,1580863,1526006,[134.2 sq mi ]
6,San Antonio,,1511946,1327407,[461.0 sq mi ]
7,San Diego,,1419516,1307402,[325.2 sq mi ]
8,Dallas,,1341075,1197816,[340.9 sq mi ]
9,San Jose,,1035317,945942,[177.5 sq mi ]


## Using Scoring and Classification

In [11]:
import warnings
# Every warning is silenced from this point on in the notebook.
warnings.filterwarnings("ignore")
from sklearn.datasets import fetch_20newsgroups

# Load the posts of the 'misc.forsale' newsgroup only, with headers, footers
# and quoted replies removed; shuffling is seeded for reproducibility.
dataset = fetch_20newsgroups(
    categories=['misc.forsale'],
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=101,
)
print ('Posts: %i' % len(dataset.data))

Posts: 585


In [12]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

n_topics = 5

# TF-IDF features: ignore terms found in more than 95% of the posts or in
# fewer than two posts, and remove English stop words.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf = vectorizer.fit_transform(dataset.data)

# Factorize the TF-IDF matrix into n_topics components (seeded).
nmf = NMF(n_components=n_topics, random_state=101)
nmf.fit(tfidf)

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when present and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
n_top_words = 15
for topic_idx, topic in enumerate(nmf.components_):
    # The stray trailing comma was dropped: under Python 2 it made this
    # statement print a one-element tuple instead of the label.
    print ("Topic #%d:" % (topic_idx+1))
    # argsort() is ascending, so the reversed tail slice yields the indices
    # of the n_top_words largest weights, heaviest first.
    print (" ".join([feature_names[i] for i in 
                    topic.argsort()[:-n_top_words - 1:-1]]))

Topic #1:
condition excellent asking offer best car old new sale 10 miles 000 tape cd power
Topic #2:
00 50 dos 20 10 15 cover 1st new 25 price man 40 shipping comics
Topic #3:
drive hard card floppy monitor meg ram disk motherboard vga modem brand scsi color internal
Topic #4:
email looking game games send interested mail thanks like edu good want package price list
Topic #5:
shipping vcr works stereo obo included amp plus great volume unc mathes gibbs radley remotes


In [14]:
# Vocabulary indices of the n_top_words heaviest terms of the first topic
# (row 0 of the components matrix), largest weight first.
top_word_indices = nmf.components_[0].argsort()[::-1][:n_top_words]
print (top_word_indices)

[1075 1459  632 2463  740  888 2476 2415 2987   10 2305    1 3349  923
 2680]


In [15]:
# Transforms index 1075 back to text.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when present and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    index_to_word = vectorizer.get_feature_names_out()
else:
    index_to_word = vectorizer.get_feature_names()
print (index_to_word[1075])

condition


## Analyzing Reviews from E-commerce

In [16]:
# Python 2/3 compatible import, kept for compatibility with the rest of the
# notebook. Only ImportError is caught so unrelated failures are not hidden.
try:
    import urllib2 # Python 2.7.x
except ImportError:
    import urllib.request as urllib2 # Python 3.x
import requests, io, os, zipfile

UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
# A timeout avoids hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting an
# error page reach zipfile and fail there with a confusing message.
response = requests.get(UCI_url, timeout=60)
response.raise_for_status()
compressed_file = io.BytesIO(response.content)
target_dir = os.getcwd()
print ('Extracting in %s' %  target_dir)
# The context manager closes the archive once extraction is finished.
with zipfile.ZipFile(compressed_file) as z:
    for name in z.namelist():
        # Keep only the final path component, so every file is written flat
        # into the working directory whatever folders the archive contains.
        filename = os.path.basename(name.split('/')[-1])
        # Skip macOS metadata entries (__MACOSX folders, .DS_Store files).
        nameOK = ('MACOSX' not in name and '.DS' not in name)
        # An empty filename means the entry is a directory.
        if filename and nameOK:
            newfile = os.path.join(target_dir, filename)
            with open(newfile, 'wb') as f:
                f.write(z.read(name))
            print ('\tunzipping %s' % newfile)

Extracting in /Users/Amigo/anaconda3/anyaconda3/myGitrepo/ML_lib/myML4D
	unzipping /Users/Amigo/anaconda3/anyaconda3/myGitrepo/ML_lib/myML4D/amazon_cells_labelled.txt
	unzipping /Users/Amigo/anaconda3/anyaconda3/myGitrepo/ML_lib/myML4D/imdb_labelled.txt
	unzipping /Users/Amigo/anaconda3/anyaconda3/myGitrepo/ML_lib/myML4D/readme.txt
	unzipping /Users/Amigo/anaconda3/anyaconda3/myGitrepo/ML_lib/myML4D/yelp_labelled.txt


In [17]:
import numpy as np
import pandas as pd
# File name of the IMDB reviews; the path is relative, so the file is looked
# up in the current working directory.
dataset = 'imdb_labelled.txt'
# The file has no header row; fields are split on a tab, given as a regex.
# NOTE(review): the regex separator with the python engine is presumably
# chosen so that quote characters inside reviews are not treated as field
# quoting — confirm before switching separator or engine.
data = pd.read_csv(dataset, header=None, sep=r"\t", engine='python')
# Name the two columns: the review text and its sentiment label.
data.columns = ['review','sentiment']

In [18]:
# Preview the first rows to check the review/sentiment columns.
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the function
# lives in sklearn.model_selection since 0.18. Fall back for older releases.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# DataFrame.ix was removed in pandas 1.0; .iloc selects the same columns by
# position (0 = review text, 1 = sentiment label).
# 25% of the reviews are held out for testing; the split is seeded.
corpus, test_corpus, y, yt = train_test_split(data.iloc[:,0], data.iloc[:,1], test_size=0.25, random_state=101)

In [20]:
from sklearn.feature_extraction import text
# Count unigrams and bigrams, without English stop words; the vocabulary is
# learned from the training reviews only.
vectorizer = text.CountVectorizer(ngram_range=(1, 2), stop_words='english')
vectorizer.fit(corpus)

# The TF-IDF weighting is fitted on the training counts and then reused,
# unchanged, on the test counts.
TfidF = text.TfidfTransformer()
X = TfidF.fit_transform(vectorizer.transform(corpus))
Xt = TfidF.transform(vectorizer.transform(test_corpus))

In [21]:
from sklearn.svm import LinearSVC
# sklearn.grid_search was removed in scikit-learn 0.20; the class lives in
# sklearn.model_selection since 0.18. Fall back for older releases.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
# Candidate values for the regularization parameter C.
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
# cv=3 pins the 3-fold default this notebook was written against
# (scikit-learn 0.22 changed the default to 5 folds), so the search stays
# comparable across library versions.
clf = GridSearchCV(LinearSVC(loss='hinge', 
                    random_state=101), param_grid, cv=3)
clf = clf.fit(X, y)
print ("Best parameters: %s" % clf.best_params_)

Best parameters: {'C': 1.0}


In [22]:
from sklearn.metrics import accuracy_score
# Predict the held-out reviews with the fitted grid search and report the
# share of correctly classified ones.
solution = clf.predict(Xt)
test_accuracy = accuracy_score(yt, solution)
print("Achieved accuracy: %0.3f" % test_accuracy)

Achieved accuracy: 0.816


In [23]:
# Show the test reviews whose predicted label differs from the true label.
print(test_corpus[yt!=solution])

601    There is simply no excuse for something this p...
32     This is the kind of money that is wasted prope...
887    At any rate this film stinks, its not funny, a...
668    Speaking of the music, it is unbearably predic...
408         It really created a unique feeling though.  
413         The camera really likes her in this movie.  
138    I saw "Mirrormask" last night and it was an un...
132    This was a poor remake of "My Best Friends Wed...
291                               Rating: 1 out of 10.  
904    I'm so sorry but I really can't recommend it t...
410    A world better than 95% of the garbage in the ...
55     But I recommend waiting for their future effor...
826    The film deserves strong kudos for taking this...
100            I don't think you will be disappointed.  
352                                    It is shameful.  
171    This movie now joins Revenge of the Boogeyman ...
814    You share General Loewenhielm's exquisite joy ...
218    It's this pandering to t