# Playing with Tweepy

In [37]:
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import re
from textblob import TextBlob
import string
import preprocessor as p
import os
import time

## Set credentials and server configurations

In [38]:
# Import keys from a saved file instead of inputting it directly into the script

key_location = "/home/andrew/twitter.keys"
apikeys = []
with open(key_location) as keys:
    for i in keys:
        apikeys.append(i.split("=")[1].strip(" ").strip("\n"))
keys.close()

## Initialize server and authentication configurations

In [39]:
# Set elasticsearch server
es = elasticsearch.Elasticsearch([{"host":"localhost","port":9200}])

# Initialize dictionary
twitter_cred = dict()

# Enter API keys
twitter_cred["CONSUMER_KEY"] = apikeys[0]
twitter_cred["CONSUMER_SECRET"] = apikeys[1]

# Access Tokens
twitter_cred["ACCESS_KEY"] = apikeys[2]
twitter_cred["ACCESS_SECRET"] = apikeys[3]

auth = tw.OAuthHandler(twitter_cred["CONSUMER_KEY"], twitter_cred["CONSUMER_SECRET"])
auth.set_access_token(twitter_cred["ACCESS_KEY"], twitter_cred["ACCESS_SECRET"])
api = tw.API(auth, wait_on_rate_limit=True)

Initially, I searched for 1 keyword, but the more I research, it looks like ~900 calls are permitted every 15 minutes is the norm for free-tier developer accounts.  In a blog post I was reading, I see this guy created a function that extracted 2500 tweets per 15 minutes based on a list of keywords.  I like the list of keywords ideas.  I can probably put a cronjob on the script for data acquisition.  Possibly use this for my stock analysis.

According to the blog post, I can only extract 7 days worth of information

1. Search parameter
2. Start date (todays_date - 7)
3. number of tweets to pull (2500)
4. number of runs that happen (once every 15 minutes)

In [40]:
def scraptweets(search_words, date_since, numTweets, numRuns):
    
    # Define a for-loop to generate tweets at regular intervals
    # We cannot make large API call in one go. Hence, let's try T times
    
    # Define a pandas dataframe to store the date:
    db_tweets = pd.DataFrame(columns = ['username', 'acctdesc', 'location', 'following',
                                        'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts',
                                        'retweetcount', 'text', 'hashtags']
                                )
    program_start = time.time()
    for i in range(0, numRuns):
        # We will time how long it takes to scrape tweets for each run:
        start_run = time.time()
        
        # Collect tweets using the Cursor object
        # .Cursor() returns an object that you can iterate or loop over to access the data collected.
        # Each item in the iterator has various attributes that you can access to get information about each tweet
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets)
# Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
# Obtain the following info (methods to call them out):
        # user.screen_name - twitter handle
        # user.description - description of account
        # user.location - where is he tweeting from
        # user.friends_count - no. of other users that user is following (following)
        # user.followers_count - no. of other users who are following this user (followers)
        # user.statuses_count - total tweets by user
        # user.created_at - when the user account was created
        # created_at - when the tweet was created
        # retweet_count - no. of retweets
        # (deprecated) user.favourites_count - probably total no. of tweets that is favourited by user
        # retweeted_status.full_text - full text of the tweet
        # tweet.entities['hashtags'] - hashtags in the tweet
# Begin scraping the tweets individually:
        noTweets = 0
    for tweet in tweet_list:
# Pull the values
        username = tweet.user.screen_name
        acctdesc = tweet.user.description
        location = tweet.user.location
        following = tweet.user.friends_count
        followers = tweet.user.followers_count
        totaltweets = tweet.user.statuses_count
        usercreatedts = tweet.user.created_at
        tweetcreatedts = tweet.created_at
        retweetcount = tweet.retweet_count
        hashtags = tweet.entities['hashtags']
    try:
        text = tweet.retweeted_status.full_text
    except AttributeError:  # Not a Retweet
        text = tweet.full_text
# Add the 11 variables to the empty list - ith_tweet:
        ith_tweet = [username, acctdesc, location, following, followers, totaltweets,
                     usercreatedts, tweetcreatedts, retweetcount, text, hashtags]
# Append to dataframe - db_tweets
        db_tweets.loc[len(db_tweets)] = ith_tweet
# increase counter - noTweets  
        noTweets += 1
        
        # Run ended:
        end_run = time.time()
        duration_run = round((end_run-start_run)/60, 2)
        
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('time take for {} run to complete is {} mins'.format(i+1, duration_run))
        
        time.sleep(920) #15 minute sleep time
# Once all runs have completed, save them to a single csv file:
    from datetime import datetime
    
    # Obtain timestamp in a readable format
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')

# Define working path and filename
    path = os.getcwd()
    filename = to_csv_timestamp + '_tweets.csv'
# Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, index = False)
    
    program_end = time.time()
    print('Scraping has completed!')
    print('Total time taken to scrap is {} minutes.'.format(round(program_end - program_start)/60, 2))

In [41]:
# Initialise these variables:
search_words = "#palintir OR #PLTR"
date_since = "2020-10-01"
numTweets = 100
numRuns = 1
# Call the function scraptweets
scraptweets(search_words, date_since, numTweets, numRuns)

Scraping has completed!
Total time taken to scrap is 0.8 minutes.


# Reference
The base code (all of it except for the authentication piece) was sourced from: 

**Griffin Leow**, [Scraping tweets with Tweepy Python](https://medium.com/@leowgriffin/scraping-tweets-with-tweepy-python-59413046e788)

In [42]:
import tweepy

In [43]:
# Set elasticsearch server
es = elasticsearch.Elasticsearch([{"host":"localhost","port":9200}])

# Initialize dictionary
twitter_cred = dict()

# Enter API keys
twitter_cred["CONSUMER_KEY"] = apikeys[0]
twitter_cred["CONSUMER_SECRET"] = apikeys[1]

# Access Tokens
twitter_cred["ACCESS_KEY"] = apikeys[2]
twitter_cred["ACCESS_SECRET"] = apikeys[3]

auth = tw.OAuthHandler(twitter_cred["CONSUMER_KEY"], twitter_cred["CONSUMER_SECRET"])
auth.set_access_token(twitter_cred["ACCESS_KEY"], twitter_cred["ACCESS_SECRET"])
api = tw.API(auth, wait_on_rate_limit=True)

In [45]:
public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)

RT @TrustlessState: Ethereum vs Moloch

Listen in to @BanklessHQ Pod tomorrow https://t.co/EYd6siXGCC
Cognitive/Artificial Intelligence Systems Market 2020 | Know the Latest COVID19 Impact Analysis .... #industry40… https://t.co/XfleZNyqbi
Follow our @CertifyGIAC blog for news, career advice and insights!

Keep your career on the right track during the… https://t.co/g8sv0UmTr0
[Course Video] 64-bit Assembly Language &amp; Shellcoding: HelloWorld Shellcode JMP-CALL-POP Technique… https://t.co/EcylCvIJA3
Part 2 of the fireside chat between @omgnetworkhq and @curvegrid that’s for the ages! Kick back and learn everythin… https://t.co/230t6aNzsf
It’s technical, but it’s worth it! Learn what Javascripts were used to build the @reddit Community Points Engine an… https://t.co/Qn5fA87bkP
"The growth of the #Bitcoin network, meaning the number of active users and transactions, has stalled in the near t… https://t.co/kodFzWlXaO
Times of India @timesofindia: AI: A force for social empowerment. #AI

In [46]:
user = api.get_user("watch717")

In [47]:
user

User(_api=<tweepy.api.API object at 0x7f34f08fb310>, _json={'id': 903449640506245122, 'id_str': '903449640506245122', 'name': 'CryptoWatch717', 'screen_name': 'watch717', 'location': '', 'profile_location': None, 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 4, 'friends_count': 111, 'listed_count': 0, 'created_at': 'Fri Sep 01 02:49:14 +0000 2017', 'favourites_count': 6, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 4, 'lang': None, 'status': {'created_at': 'Wed Apr 11 15:35:44 +0000 2018', 'id': 984092645117935616, 'id_str': '984092645117935616', 'text': "@rickymagnussen @tenxwallet @LTCFoundation @BlockfolioApp I've been waiting 8 months.  Still nothing.", 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'rickymagnussen', 'name': 'Ricky Magnussen', 'id': 777107774543134721, 'id_str': '777107774543134721', 'indices': [

In [48]:
type(user)

tweepy.models.User

In [49]:
dir(user)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_api',
 '_json',
 'contributors_enabled',
 'created_at',
 'default_profile',
 'default_profile_image',
 'description',
 'entities',
 'favourites_count',
 'follow',
 'follow_request_sent',
 'followers',
 'followers_count',
 'followers_ids',
 'following',
 'friends',
 'friends_count',
 'geo_enabled',
 'has_extended_profile',
 'id',
 'id_str',
 'is_translation_enabled',
 'is_translator',
 'lang',
 'listed_count',
 'lists',
 'lists_memberships',
 'lists_subscriptions',
 'location',
 'name',
 'needs_phone_verification',
 'notifications',
 'parse',
 'parse_list',
 'profile_background_color',
 'profile_backgrou

In [50]:
user.lists


<bound method User.lists of User(_api=<tweepy.api.API object at 0x7f34f08fb310>, _json={'id': 903449640506245122, 'id_str': '903449640506245122', 'name': 'CryptoWatch717', 'screen_name': 'watch717', 'location': '', 'profile_location': None, 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 4, 'friends_count': 111, 'listed_count': 0, 'created_at': 'Fri Sep 01 02:49:14 +0000 2017', 'favourites_count': 6, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 4, 'lang': None, 'status': {'created_at': 'Wed Apr 11 15:35:44 +0000 2018', 'id': 984092645117935616, 'id_str': '984092645117935616', 'text': "@rickymagnussen @tenxwallet @LTCFoundation @BlockfolioApp I've been waiting 8 months.  Still nothing.", 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'rickymagnussen', 'name': 'Ricky Magnussen', 'id': 777107774543134721, 'id_str': '77710

In [51]:
user.screen_name

'watch717'

In [52]:
user.followers_count

4

In [53]:
for friend in user.friends():
    print(friend.screen_name)

Princeofspa
ClubbingPattaya
mikeiacovacci
ArtIntelligenc
welcomeai
mycroft_ai
brendantierney
nordicinst
MsftSecIntel
threatpost
SecurityWeek
CSOonline
SCMagazine
kaspersky
TheHackersNews
PimpBangkok
drericcole
netresec
SANSAPAC
BarackObama
