In [17]:
import os
import pandas as pd
import tweepy
from tweepy import OAuthHandler
import json

In [18]:
# Twitter API credentials are taken from the process environment so that no
# secret is stored in the notebook. A variable that is not set yields None.
CONSUMER_KEY = os.environ.get('CONSUMER_KEY')
CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.environ.get('ACCESS_TOKEN_SECRET')

In [19]:
class fetchTweets():
    """Small wrapper around a tweepy client for downloading a user's timeline.

    Credentials are read from the module-level ``CONSUMER_KEY``,
    ``CONSUMER_SECRET``, ``ACCESS_TOKEN`` and ``ACCESS_TOKEN_SECRET`` constants.
    """

    def __init__(self):
        """Create the tweepy API client and store it on ``self.api``.

        A ``tweepy.TweepError`` raised while setting up the client is printed
        instead of re-raised; in that case ``self.api`` is left unset, so a
        later ``get_tweets`` call fails with ``AttributeError``.
        """
        try:
            auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
            auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

            self.api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

        except tweepy.TweepError as e:
            print(f'Error: Twitter Authentication Failed - {str(e)}')

    def get_tweets(self, screen_name):
        """Download the timeline of ``screen_name`` page by page.

        The first request fetches the newest page (up to 100 tweets); every
        following request passes ``max_id`` set to one less than the id of the
        last tweet collected so far, and paging stops when a request returns
        an empty page.

        Args:
            screen_name: Value forwarded as ``screen_name`` to
                ``self.api.user_timeline``.

        Returns:
            A list with every status object returned by the API, in the order
            received. The list is empty when the first request returns nothing.
        """
        new_tweets = self.api.user_timeline(screen_name=screen_name, count=100, tweet_mode='extended')
        all_tweets = list(new_tweets)

        # An empty first page means there is nothing to page through; return
        # before all_tweets[-1] below would raise IndexError.
        if not all_tweets:
            return all_tweets

        # Subtract 1 so the next page does not repeat the last tweet we hold.
        oldest = all_tweets[-1].id - 1

        while len(new_tweets) > 0:
            print(f"Getting tweets before {oldest}")

            new_tweets = self.api.user_timeline(screen_name=screen_name, count=100, max_id=oldest, tweet_mode='extended')

            all_tweets.extend(new_tweets)

            oldest = all_tweets[-1].id - 1

            print(f"{len(all_tweets)} tweets have been scraped")
        return all_tweets
    

In [20]:
# Build the client; fetchTweets.__init__ authenticates with the module-level credential constants.
twitter = fetchTweets()

In [21]:
# Page through the 'midasIIITD' timeline; get_tweets prints its progress while it runs.
tweets = twitter.get_tweets('midasIIITD')

Getting tweets before 1087386718485925887
200 tweets have been scraped
Getting tweets before 1037043189843091456
294 tweets have been scraped
Getting tweets before 1021377705084739583
294 tweets have been scraped


In [22]:
def save_jsonl(tweets, filename='tweets.jsonl'):
    """Write tweets to a JSON-lines file, one JSON document per line.

    Args:
        tweets: Iterable of objects exposing their raw payload as a
            JSON-serialisable ``_json`` attribute.
        filename: Destination path. Defaults to ``'tweets.jsonl'``, the path
            this function always wrote to before the parameter existed. An
            existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for tweet in tweets:
            json.dump(tweet._json, f)
            f.write('\n')

In [23]:
# Persist the raw tweet payloads to tweets.jsonl so later cells can work from the file.
save_jsonl(tweets)

In [39]:
def parse_jsonl(filename):
    """Read a JSON-lines file of tweets into a column-oriented dict.

    Args:
        filename: Path to a file holding one tweet JSON document per line.
            Blank lines are skipped.

    Returns:
        A dict mapping ``'text'``, ``'datetime'``, ``'favorite_count'``,
        ``'retweet_count'`` and ``'media'`` to parallel lists with one entry
        per tweet. ``'media'`` is the number of attached media items. The dict
        is empty when the file contains no tweets.

    Raises:
        KeyError: If a tweet lacks one of the fields read below.
    """
    tweets_dict = {}

    with open(filename, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            tweet = json.loads(line)

            # entities.media lists only the first attachment of a multi-photo
            # tweet; the complete list is in extended_entities.media, so prefer
            # that and fall back to entities when it is absent.
            extended = tweet.get('extended_entities') or {}
            images = extended.get('media') or tweet['entities'].get('media', [])

            row = {
                'text': tweet['full_text'],
                'datetime': tweet['created_at'],
                'favorite_count': tweet['favorite_count'],
                'retweet_count': tweet['retweet_count'],
                'media': len(images),
            }
            for column, value in row.items():
                tweets_dict.setdefault(column, []).append(value)

    return tweets_dict

In [40]:
def display_table(tweets_dict):
    """Turn the column dict produced by ``parse_jsonl`` into a tidy DataFrame.

    Args:
        tweets_dict: Mapping of column name to list of values.

    Returns:
        A new DataFrame with columns ``text``, ``datetime``,
        ``favorite_count``, ``retweet_count`` and ``media`` (in that order),
        where newlines inside string cells are replaced by spaces, a ``media``
        count of 0 is shown as the string ``'None'``, and ``datetime`` is
        parsed with ``pd.to_datetime``.
    """
    ordered_columns = ['text', 'datetime', 'favorite_count', 'retweet_count', 'media']

    df = (
        pd.DataFrame.from_dict(tweets_dict)
        .reindex(columns=ordered_columns)
        .replace('\n', ' ', regex=True)
    )

    # Show "no media" as the string 'None'; cast to object first so the column
    # can hold both the sentinel string and the integer counts.
    df['media'] = df['media'].astype(object).where(df['media'] != 0, 'None')

    # Parse this frame's own column. The previous code read the notebook
    # global `table`, which does not exist on a fresh kernel and is stale on
    # a re-run.
    df['datetime'] = pd.to_datetime(df['datetime'])

    return df

In [41]:
# Load the saved tweets back from disk as a dict of column lists.
parsed_tweets = parse_jsonl('tweets.jsonl')

In [42]:
# Build the cleaned DataFrame from the parsed columns.
table = display_table(parsed_tweets)

In [43]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
table

Unnamed: 0,text,datetime,favorite_count,retweet_count,media
0,BigMM 2019 : IEEE BigMM 2019 – Call for Worksh...,2019-03-18 02:27:47+00:00,2,0,
1,"Congratulations @midasIIITD team, Rohan, Prady...",2019-03-17 14:22:04+00:00,12,4,
2,We have emailed the task details to all shortl...,2019-03-16 14:06:56+00:00,6,0,
3,IEEE BigMM 2019 - Call for Workshop Proposals....,2019-03-16 09:20:29+00:00,1,1,
4,"Congratulations! Arijit, Ramit, @debanjanbhucs...",2019-03-16 09:14:58+00:00,6,2,
5,We will be releasing a very interesting task t...,2019-03-16 05:13:14+00:00,7,2,
6,RT @hcdiiitd: Last day to register for #Portfo...,2019-03-13 17:09:44+00:00,0,2,
7,@ACMMM19 @sigmm @TheOfficialACM @acmmmsys @ACM...,2019-03-13 04:11:24+00:00,1,0,1
8,RT @ACMMM19: The paper deadline is approaching...,2019-03-13 04:06:04+00:00,0,13,
9,RT @kaggle: Bookmark this amazing library of i...,2019-03-12 17:43:44+00:00,0,69,
