## Scrape Twitter Data with Pandas

#### Using the `snscrape` library

In [2]:
# install snscrape library
# install tqdm for progressbar 
%pip install snscrape tqdm  --quiet # surpress the output

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import necessary packages

import pandas as pd
import snscrape.modules.twitter as sntwitter
from tqdm.notebook import tqdm


In [4]:
# search query arg to be passed into the scraper object
search_query = "#python"

# create a scraper object 
scraper = sntwitter.TwitterSearchScraper(search_query)

scraper 

<snscrape.modules.twitter.TwitterSearchScraper at 0x166b97160>

In [5]:
# using get_items() method, explore the contents of a sample scraper object
for tweet in scraper.get_items():
    break 

In [6]:
tweet

Tweet(url='https://twitter.com/shapewaze/status/1618100475923947520', date=datetime.datetime(2023, 1, 25, 4, 16, 30, tzinfo=datetime.timezone.utc), rawContent='No. MMDCCLXXXIX\n#colorway #shapeways #drawing #processing #python #creativecoding #generativeart #jimbo https://t.co/IHzVubHDtM', renderedContent='No. MMDCCLXXXIX\n#colorway #shapeways #drawing #processing #python #creativecoding #generativeart #jimbo https://t.co/IHzVubHDtM', id=1618100475923947520, user=User(username='shapewaze', id=892212856510349312, displayname='Shapeways', rawDescription='Casually tweeting drawings of randomly composed shapes; arranged and colored at random.\n\n©2014-2022 JHubbell (All rights reserved)', renderedDescription='Casually tweeting drawings of randomly composed shapes; arranged and colored at random.\n\n©2014-2022 JHubbell (All rights reserved)', descriptionLinks=None, verified=False, created=datetime.datetime(2017, 8, 1, 2, 38, 16, tzinfo=datetime.timezone.utc), followersCount=57, friendsCount

In [7]:
type(tweet)

snscrape.modules.twitter.Tweet

### Accessing Tweet object attributes

In [8]:
# access the date
tweet.date

datetime.datetime(2023, 1, 25, 4, 16, 30, tzinfo=datetime.timezone.utc)

## Extract Tweets

In [9]:
scraper = sntwitter.TwitterSearchScraper(search_query)

tweets = []

for i,tweet in enumerate(scraper.get_items()):

    # create a data list 
    data = [
        tweet.date,
        tweet.id, 
        tweet.rawContent, 
        tweet.user.username, 
        tweet.likeCount, 
        tweet.retweetCount
    ]

    tweets.append(data)

    # add only the first 100 tweets
    if i > 100:
        break

In [10]:
len(tweets)

102

In [11]:
# convert the tweets list into a pandas DF
col = ['date', 'id', 'content', 'username', 'like_count', 'retweet_count']

tweet_df = pd.DataFrame(tweets, columns = col)

tweet_df


Unnamed: 0,date,id,content,username,like_count,retweet_count
0,2023-01-25 04:16:30+00:00,1618100475923947520,No. MMDCCLXXXIX\n#colorway #shapeways #drawing...,shapewaze,0,0
1,2023-01-25 04:16:27+00:00,1618100462812598272,No. MMDCCLXXXVIII\n#colorway #shapeways #drawi...,shapewaze,0,0
2,2023-01-25 04:16:24+00:00,1618100447184666624,No. MMCMLXVII\n#grayway #shapeways #drawing #p...,shapewaze,0,0
3,2023-01-25 04:16:20+00:00,1618100432181792770,"""Easy steps to build a LAMP stack in Linux!\n\...",linuxteck,1,1
4,2023-01-25 04:16:03+00:00,1618100359003529218,"Programming is 10% science, 20% ingenuity, and...",PadmashreeJha,2,1
...,...,...,...,...,...,...
97,2023-01-25 03:15:34+00:00,1618085138159800320,#datascience #MachineLearning #DataAnalytics #...,HalderNilimesh,1,0
98,2023-01-25 03:15:01+00:00,1618085001270071297,Adventures with Ai - Age of Discovery. Availa...,AmbassadorRico,1,3
99,2023-01-25 03:15:00+00:00,1618084998707572736,@s_gruppetta_ct @SaveToBookmarks #thread #pyth...,codygal005,0,0
100,2023-01-25 03:14:53+00:00,1618084969540194306,Download Minicoders and get started with codin...,TheNewStat1,1,2


In [12]:
# save df as csv file
tweet_df.to_csv("./data/raw/python_tweets.csv", index=False)

## Add a Progress Bar

In [20]:
search_query = "#chatgpt"

scraper = sntwitter.TwitterSearchScraper(search_query)

# number of tweets to be extracted

N_TWEETS = 1_000

tweets = []

# wrap the enumerate in tqdm and set the total to N_TWEETS
for i,tweet in tqdm(enumerate(scraper.get_items()), total=N_TWEETS):

    # create a data list 
    data = [
        tweet.date,
        tweet.id, 
        tweet.rawContent, 
        tweet.user.username, 
        tweet.likeCount, 
        tweet.retweetCount
    ]
    
    
    tweets.append(data)

    # add only the first 100 tweets
    if i > N_TWEETS:
        break

col = ['date', 'id', 'content', 'username', 'like_count', 'retweet_count']

tweet_df = pd.DataFrame(tweets, columns = col)

tweet_df.to_csv("./data/raw/chatgpt_tweets.csv", index=False)

  0%|          | 0/1000 [00:00<?, ?it/s]