# Twitter API

## Lecture Notes

##### Why Twitter ?


"While physics and math may tell us how the universe began, they are not much use in predicting Human Behavior because there are far too many Equations to Solve" 
    
    -Stephan Hawking
    

#### Rest API vs Streaming API: 


REST:  
    - Query user accounts using OAuth
    - Allows you to access 'historical' tweets

STREAM: 
    - Essentially long-running request (left Open) using OAuth
    - Access realtime stream of data
       

### Twitter API Documentation

https://dev.twitter.com/

#### Rest API

In [None]:
from __future__ import print_function, division

In [None]:
#pip install requests_oauthlib

In [346]:
import requests
from requests_oauthlib import OAuth1
import pandas as pd

#OAuth ~ simple way to to publish & interact with data

In [347]:
# Retrieve the home directory so we can find our .twitter_config
from os.path import expanduser
home = expanduser("~")
home

'/Users/ash'

In [348]:
# Importing our Config

#pip install cnfg

import cnfg
config = cnfg.load(home + "/.twitter_config")

oauth = OAuth1(config["consumer_key"],
               config["consumer_secret"],
               config["access_token"],
               config["access_token_secret"])

In [None]:
response = requests.get("https://api.twitter.com/1.1/statuses/user_timeline.json",
                        auth=oauth)

tweets = response.json()

In [None]:
for tweet in tweets:
    print(tweet['text'], "\n")

In [None]:
for key in tweets[0].keys():
    print(key)

In [None]:
parameters = {"q": "oscars", "count":20}
response = requests.get("https://api.twitter.com/1.1/search/tweets.json",
                        params = parameters,
                        auth=oauth)

from pprint import pprint
pprint(response.json()['search_metadata'])

In [None]:
tweets = response.json()['statuses']

print('PAGE 1')
for tweet in tweets:
    print(tweet['id'], tweet['text'], '\n')

#### STREAMING API ~ [TWEEPY](http://www.tweepy.org/)

In [349]:
# pip install tweepy
import tweepy

auth = tweepy.OAuthHandler(config["consumer_key"],
                           config["consumer_secret"])
auth.set_access_token(config["access_token"],
                      config["access_token_secret"])

api=tweepy.API(auth)

In [350]:
print(api)

<tweepy.api.API object at 0x11cebb910>


In [336]:
tweepyCurs=tweepy.Cursor(api.search, q='#tsla, $tsla, tsla')

In [351]:
from pprint import pprint

In [338]:

max_tweets=1

#Tweepy Cursor handles pagination .. 

for tweet in tweepyCurs.items(max_tweets):
    pprint(tweet)

TweepError: Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/search/tweets.json?q=%23tsla%2C+%24tsla%2C+tsla (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x116d0b8d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

In [352]:
results=[]


for tweet in tweepy.Cursor(api.search,q="#aapl,$aapl,apple").items(100):
    results.append(tweet)

In [394]:
results[0]._json['user']['friends_count']

1505

In [367]:
print results[0]

Status(contributors=None, truncated=False, text=u'SPY ETF/Apple Shares: A Tale of Two Wedges? $SPY, $AAPL  #Trading #investing #aapl #SP500 https://t.co/ggzsxwZGRy https://t.co/WcPuhek7NM', is_quote_status=False, in_reply_to_status_id=None, id=737295278907084801, favorite_count=0, _api=<tweepy.api.API object at 0x11cebb910>, author=User(follow_request_sent=False, has_extended_profile=False, profile_use_background_image=True, _json={u'follow_request_sent': False, u'has_extended_profile': False, u'profile_use_background_image': True, u'default_profile_image': False, u'id': 950517078, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'verified': False, u'profile_text_color': u'666666', u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/3203793810/b1e8bf3f4ac0c7c2da3ca5f8a90c605d_normal.jpeg', u'profile_sidebar_fill_color': u'252429', u'entities': {u'url': {u'urls': [{u'url': u'http://t.co/G5nJlgLoQ4', u'indices': [0, 22], u'expan

In [405]:
results[0]._json

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Mon May 30 14:51:07 +0000 2016',
 u'entities': {u'hashtags': [{u'indices': [57, 65], u'text': u'Trading'},
   {u'indices': [66, 76], u'text': u'investing'},
   {u'indices': [77, 82], u'text': u'aapl'},
   {u'indices': [83, 89], u'text': u'SP500'}],
  u'media': [{u'display_url': u'pic.twitter.com/WcPuhek7NM',
    u'expanded_url': u'http://twitter.com/JPDesloges/status/737295278907084801/photo/1',
    u'id': 737280316096847873,
    u'id_str': u'737280316096847873',
    u'indices': [114, 137],
    u'media_url': u'http://pbs.twimg.com/media/CjtYiozXIAErLXd.jpg',
    u'media_url_https': u'https://pbs.twimg.com/media/CjtYiozXIAErLXd.jpg',
    u'sizes': {u'large': {u'h': 499, u'resize': u'fit', u'w': 998},
     u'medium': {u'h': 300, u'resize': u'fit', u'w': 600},
     u'small': {u'h': 170, u'resize': u'fit', u'w': 340},
     u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},
    u'type': u'photo',
    u'url': u'https://t.co

#### Import tweets into Pandas

In [None]:
import pandas as pd

In [None]:
def structure_results(results):
    id_list=[tweet.id for tweet in results]
    data=pd.DataFrame(id_list,columns=['id'])
    
    data["text"]= [tweet.text.encode('utf-8') for tweet in results]
    data["datetime"]=[tweet.created_at for tweet in results]
    data["Location"]=[tweet.place for tweet in results]
    
    return data

In [None]:
data=structure_results(results)
data.head()

#### Import Tweets into MongoDB

In [327]:
# conda install pymongo
import json
from pymongo import MongoClient


client = MongoClient()
db = client.example
tweets = db.tweets

In [343]:
for tweet in results:
    data={}
    data['q_string']='query string'
    data['tweet']=tweet.text.encode('utf-8') 
    data['datetime']=tweet.created_at
    data['is_favorite']=tweet._json['favorited']
    data['favorite_count']=tweet._json['favorite_count']
    data['lang']=tweet._json['lang']
    data['is_retweet']=tweet._json['retweeted']
    data['retweet_count']=tweet._json['retweet_count']
    tweets.insert_one(data)

In [341]:
tweets.find_one()

KeyboardInterrupt: 

In [4]:
import sourceData

In [5]:
api=sourceData.connect_twitter()

In [118]:
df = pd.DataFrame()

In [119]:
df=sourceData.get_tweets(api,'$aapl,#aapl,apple,aapl')

In [120]:
df.text[1]

"Apple's outperform rating reiterated at RBC Capital. $120.00 PT. https://t.co/qAGCm2AW3p $AAPL #AAPL"

In [18]:
df.text[5]

'RT @RatingsNetwork: Apple given $120.00 PT by Morgan Stanley. buy rating. https://t.co/19JtEb809i $AAPL #AAPL'

In [26]:
df.text[35]

'$AAPL - Apple Inc. #AAPL Stake Increased by BB&amp;T Securities LLC https://t.co/FtK2eciszL'

In [112]:
data=df
import unicodedata as uc

In [38]:
string='this ˆøis ª•∆˙ funky string'
string1='normal string'

In [43]:
unicode(string,'utf-8')

u'this \u02c6\xf8is \xaa\u2022\u2206\u02d9 funky string'

In [58]:
uc.normalize('NFKD',unicode(data.text[60],'utf-8'))

u'Apple Inc. Reiterated Rating by Canaccord Genuity (Ranked 58th) to Buy with 130 PT $AAPL #AAPL https://t.co/DWfFaFCwq5'

In [113]:
data.text.map(lambda val: uc.normalize('NFD',unicode(val,'utf-8')))

0      inc reiterated rating by rbc capital mkts ran...
1      bulls still pushing their luck as above daily...
2      bulls pushing their luck as above the daily r...
3     rt ratingsnetwork  given pt by morgan stanley ...
4     rt ratingsnetwork  given pt by morgan stanley ...
5     rt ratingsnetwork  given pt by morgan stanley ...
6      inc set price target by morgan stanley ranked...
7      given pt by morgan stanley buy rating https c...
8      given pt by morgan stanley buy rating https c...
9       short term elliott wave analysis https co mm...
10     buy rating reiterated at bmo capital markets ...
11     inc reiterated rating by bmo capital markets ...
12     inc upgrade by pacific crest securities ranke...
13     buy rating reiterated at bmo capital markets ...
14       inc is below its upper bollinger band of ht...
15       inc is at below its chandelier exit of http...
16    rt ratingsnetwork  buy rating reiterated at br...
17    rt ratingsnetwork  buy rating reiterated a

In [151]:
temp=df.text

In [152]:
temp

0     RT @RatingsNetwork: Apple's outperform rating ...
1     Apple's outperform rating reiterated at RBC Ca...
2     Apple Inc. Reiterated Rating by RBC Capital Mk...
3     Apple: Bulls still pushing their luck as above...
4     Apple: Bulls pushing their luck as above the D...
5     RT @RatingsNetwork: Apple given $120.00 PT by ...
6     RT @RatingsNetwork: Apple given $120.00 PT by ...
7     RT @RatingsNetwork: Apple given $120.00 PT by ...
8     Apple Inc. Set Price Target by Morgan Stanley ...
9     Apple given $120.00 PT by Morgan Stanley. buy ...
10    Apple $AAPL Short-term Elliott Wave Analysis 5...
11    Apple's buy rating reiterated at BMO Capital M...
12    Apple Inc. Reiterated Rating by BMO Capital Ma...
13    Apple Inc. Upgrade by Pacific Crest Securities...
14    $AAPL (Apple Inc) is below its upper Bollinger...
15    $AAPL (Apple Inc) is at $97.90, below its Chan...
16    RT @RatingsNetwork: Apple's buy rating reitera...
17    RT @RatingsNetwork: Apple's buy rating rei

In [158]:
temp=temp.replace({'http.+\s':''},regex=True)

In [159]:
temp[7]

' given pt by morgan stanley buy rating '

In [161]:
temp=temp.replace({'[^A-Za-z#]':' '}, regex=True)

In [162]:
temp[7]

' given pt by morgan stanley buy rating '

In [163]:
temp=temp.replace({'\s+':' '}, regex=True)

In [164]:
temp=temp.replace({'\s.\s':' '}, regex=True)

In [165]:
temp=temp.map(lambda val: val.lower())

In [166]:
temp=temp.replace({'apple|#aapl|#apple|aapl':''}, regex=True)

In [168]:
#temp.replace({'http.+\s':'%%'}, regex=True)[7]
data['text']=temp

In [169]:
data.drop_duplicates(subset='text', keep='first')

Unnamed: 0,id,text,datetime,Location
0,735629055853137921,inc reiterated rating by rbc capital mkts ran...,2016-05-26 00:30:08,
1,735526940812140544,bulls still pushing their luck as above daily...,2016-05-25 17:44:22,
2,735480630973235201,bulls pushing their luck as above the daily r...,2016-05-25 14:40:21,
3,735470726589906946,rt ratingsnetwork given pt by morgan stanley b...,2016-05-25 14:01:00,
6,735447850268528640,inc set price target by morgan stanley ranked...,2016-05-25 12:30:06,
7,735446938204667905,given pt by morgan stanley buy rating,2016-05-25 12:26:28,
9,735383719498911746,short term elliott wave analysis dby,2016-05-25 08:15:16,
10,735351288179888128,buy rating reiterated at bmo capital markets,2016-05-25 06:06:23,
11,735349914889289729,inc reiterated rating by bmo capital markets ...,2016-05-25 06:00:56,
12,735349793598377985,inc upgrade by pacific crest securities ranke...,2016-05-25 06:00:27,


In [172]:
from textblob import TextBlob

In [177]:
#try pulling out the main concepts in all these tweets and then 
# run sentiment on them
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [262]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2), 
                             stop_words=['the','it','as','a','s','inc',
                                        'RT','its','llc','at','by','with'])

In [263]:
X = vectorizer.fit_transform(data.text)

In [264]:
lsa =TruncatedSVD(n_components=3, n_iter=100)

In [265]:
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=100,
       random_state=None, tol=0.0)

In [266]:
import numpy as np

In [267]:
terms = vectorizer.get_feature_names()

In [277]:
pd.DataFrame(zip(np.repeat([1],len(lsa.components_[0])),terms,lsa.components_[1]), columns=['concept','terms','LSAscore'])

Unnamed: 0,concept,terms,LSAscore
0,1,above,2.844627e-02
1,1,above daily,2.937689e-02
2,1,above lower,8.467557e-04
3,1,act,4.772098e-04
4,1,act gvk,4.772098e-04
5,1,active,1.330517e-03
6,1,active expecting,1.330517e-03
7,1,agricole,-4.924377e-03
8,1,agricole pt,-4.024260e-03
9,1,agricole ranked,-1.768307e-03


In [286]:
# concept_df = pd.DataFrame(index=range(lsa.components_.shape[0]*lsa.components_.shape[1]),
#                           columns=['component','term','LSAscore'])

concept_df = pd.DataFrame(columns=['component','term','LSAscore'])
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(np.repeat([i],len(comp)),terms,comp)
    concept_df = pd.concat([concept_df, 
                           pd.DataFrame(sorted(termsInComp, key=lambda x: x[2], reverse=True),
                                       columns=['component','term','LSAscore'])],
                           axis=0)
    
    
#     for j in range(concept_df.shape[0]):
#         concept_df['component'][j] = i
#         concept_df['term'][j]=sortedTerms[j][0]
#         concept_df['LSAscore'][j]=sortedTerms[j][1]

In [291]:
#concept_df has k concepts with terms that make each concept up, each concept sorted by the LSA score

Unnamed: 0,component,term,LSAscore
0,1.0,investing,0.252507
1,1.0,trading,0.252507
2,1.0,trading investing,0.252507
3,1.0,channel,0.243332
4,1.0,daily,0.241373
5,1.0,daily uptrend,0.239382
6,1.0,uptrend,0.239382
7,1.0,uptrend channel,0.239382
8,1.0,on shares,0.228424
9,1.0,shares,0.228424


In [325]:
#polarity, subjectivity of most important concept in sample AAPL tweets
senti=concept_df.loc[concept_df['component']==0,'term'].apply(lambda term: TextBlob(term).sentiment)[:50]

In [326]:
sentisum=0.0
for i in range(len(senti)):
    sentisum+=senti[i][0]
    
print sentisum

-0.15


In [302]:
senti[0][1]

0.0