# Sentiment Analysis with Python

# What is sentiment analysis ?

Briefly: recognize the "feeling" of some text, happy, sad, pos, neg, etc.

In [None]:
# Download data
# NLTK Corpora Twitter Samples
# http://www.nltk.org/nltk_data/
import requests, zipfile, io

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/twitter_samples.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [1]:
import os
import json
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle


curr_dir = os.getcwd()

classes = ['pos', 'neg']

_data = []
_labels = []
for line in open (os.path.join(curr_dir ,"twitter_samples/") + r'positive_tweets.json', 'r'):
    _data.append(json.loads(line)['text'])
    _labels.append("pos")


for line in open (os.path.join(curr_dir ,"twitter_samples/") + r'negative_tweets.json', 'r'):
    _data.append(json.loads(line)['text'])
    _labels.append("neg")

In [2]:
#### Create feature vectors
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, stop_words='english', use_idf=True)

data_vectors = vectorizer.fit_transform(_data)

cv=KFold(data_vectors.shape[0], n_folds=10, shuffle=True, random_state=1)

# Perform classification with MultinomialNB
clf = MultinomialNB()

scores = cross_validation.cross_val_score(clf, data_vectors, _labels, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(data_vectors, _labels)

Accuracy: 0.75 (+/- 0.01)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [3]:
with open('models/clf.pkl', 'wb') as fmodel:
    pickle.dump(clf, fmodel)
with open('models/vocabulary.pkl', 'wb') as fvocabulary:
    pickle.dump(vectorizer.vocabulary_, fvocabulary)

In [4]:
with open('models/clf.pkl', 'rb') as fmodel:
    clf = pickle.load(fmodel)
with open('models/vocabulary.pkl', 'rb') as fvocabulary:
    vocabulary = pickle.load(fvocabulary)

Get your keys here:
https://apps.twitter.com/

In [5]:
import tweepy
from tweepy import OAuthHandler
 
consumer_key = 'hcBCfwYRHQScLlBjxoIl0cnvV'
consumer_secret = 'MoxZcW1x52Fu03W5Z54sI4FsVQiR9zoO4aPCwQvUjbqVBnM9uk'
access_token = '29218210-l2zElQml2kQOywcDphTKNL8Q83N3yMXotrij6G9sx'
access_secret = 'C2qIK4axjucElWaoo67g5Dke7AXelbkgcS5SsfaZyDxfk'
 
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
api = tweepy.API(auth)

In [6]:
with open('tweets.csv', 'w') as csvfile:
    tweet_writer = csv.writer(csvfile)
    for tweet in tweepy.Cursor(api.search, q='trump', languages=["en"]).items(50):
        tweet_writer.writerow([tweet.text.encode('utf-8')])

In [7]:
tweets = []
with open('tweets.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        tweets.append(row[0])

In [16]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, stop_words='english', use_idf=True, vocabulary=vocabulary)
tweet_vectors = vectorizer.fit_transform(tweets)

In [17]:
pred_multinomial = clf.predict(tweet_vectors)
prob_multinomial = clf.predict_proba(tweet_vectors)

In [10]:
sentiment_tweets = []
for index, prob in enumerate(prob_multinomial):
    sentiment_tweets.append({"Tweet": tweets[index], "p_neg": prob[0], "p_pos": prob[1], "target": pred_multinomial[index]})

In [11]:
with open('tweet_sentiments.json', 'w') as jsonfile:
    json.dump(sentiment_tweets, jsonfile, sort_keys=True, indent=4, ensure_ascii=True)

In [12]:
sentiment_tweets[:5]

[{'Tweet': 'https://t.co/u5Fdl5h59i',
  'p_neg': 0.38730505494998374,
  'p_pos': 0.61269494505001587,
  'target': 'pos'},
 {'Tweet': '@AboveUp I wondered about that one time Trump said something about international banks having caused problems and he was called anti-semitic',
  'p_neg': 0.47827424887042869,
  'p_pos': 0.52172575112957176,
  'target': 'pos'},
 {'Tweet': "RT @tyleroakley: imagine being so stubborn that even with this many DAILY offensive &amp; appalling trump stories, you still won't jump ship. y\xe2\x80\xa6",
  'p_neg': 0.43561675645564979,
  'p_pos': 0.56438324354435132,
  'target': 'pos'},
 {'Tweet': "RT @LRB: 'I will absolutely apologise sometime in the hopefully distant future if I am ever wrong\xe2\x80\x99 &amp; other Trump sayings https://t.co/hIhfC\xe2\x80\xa6",
  'p_neg': 0.45190655740122976,
  'p_pos': 0.54809344259876891,
  'target': 'pos'},
 {'Tweet': 'RT @CNN: Arianne Zucker on Trump: "Be careful what you say, because the repercussions will come back" https: