# Scraping Twitter with twint

https://github.com/twintproject/twint

In [None]:
# Check whether twint is already installed: list the installed packages and
# keep only the lines that mention "twint" (no output means no match).
!pip list | grep twint

In [None]:
# Background for this install command:
# https://github.com/twintproject/twint/issues/915

# Installs twint from the GitHub repository (per-user, upgrading any existing copy).
# Run this cell first, then restart the runtime; twint can be imported after the restart.
!pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint

Restart the runtime, then clone twint from GitHub and install its requirements.

In [None]:
# Clone the twint repository into a `twint` folder inside the current working directory.
!git clone https://github.com/twintproject/twint.git

In [None]:
!cd /content/twint && pip3 install . -r requirements.txt

In [11]:
import twint

# Notebook compatibility: apply nest_asyncio before running any twint search,
# otherwise the search can fail with a RuntimeError inside the notebook.
import nest_asyncio
nest_asyncio.apply()

import pandas as pd

# Extract tweets for years 2019, 2020, 2021 and save them to .csv, named by search terms

In [4]:
# Search terms related to gender-based violence.
# Each entry is scraped separately and becomes the name of its own CSV file,
# so the order here is the order in which the terms are processed.
glossary = [
    'sexual abuse',
    'sexual exploitation',
    'rape',
    'sexual assault',
    'exploitative relationship',
    'human rights violation',
    'sexual violence',
    'gender based violence',
    'violence against women',
    'sexual harassment',
    'domestic_abuse',
    'domestic violence',
    'domesticviolence',
    'genderbasedviolence',
    'GBV',
    'violenceagainstwomen',
    'sexualabuse',
    'humanrightsviolation',
    'cyberbullying',
    'genderequality',
    'endgbv',
    'metoo',
    'covid',
    'coronavirus',
    'survivors',
    'feminism',
    'stopgbv',
    'enoughisenough',
    'femicide',
    'arewametoo',
    'consent',
    'mentalhealth',
    'safety',
    'domesticabuse',
]

Twint configuration options:

https://github.com/twintproject/twint/wiki/Configuration

In [8]:
# Columns available when twint stores search results in pandas.
# NOTE(review): the frame appears to be populated only after a search has run
# with c.Pandas = True, so guard against it still being unset on a fresh kernel.
# Uses the same module path (twint.storage.panda) that get_tweets reads from.
tweets_df = twint.storage.panda.Tweets_df
tweets_df.columns if tweets_df is not None else 'No tweets stored yet - run a search first'

Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
       'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

Nigeria occupies an area of 923,768 sq. km, extending 1,127 km E–W and 1,046 km N–S.

*    Geographic center of Nigeria: 9.081999, 8.675277
*    Radius: ~ 550 km


In [27]:
def get_tweets(search, since, until, geo='9.081999, 8.675277, 550km'):
    """Scrape tweets for one search term and save them to '<search>.csv'.

    Parameters
    ----------
    search : str
        Term to search for. It is also used verbatim as the CSV file name.
    since : str
        Start of the search window, e.g. '2021-05-01'.
    until : str
        End of the search window, e.g. '2021-06-01'.
    geo : str, optional
        Geographic filter given as 'latitude, longitude, radius'. The default
        is the geographic center of Nigeria with a radius of 550 km.

    Returns
    -------
    None
        The function works by side effect: it prints the search term and
        writes the scraped tweets to f'{search}.csv' in the working directory.
    """
    # Progress marker: shows which term is being scraped when called in a loop.
    print(search)

    c = twint.Config()
    c.Search = search
    c.Since = since
    c.Until = until
    c.Geo = geo
    c.Show_hashtags = True
    c.Count = True
    c.Lowercase = True
    c.Filter_retweets = True
    # Ask twint to keep the results in pandas so they can be read back below.
    c.Pandas = True
    twint.run.Search(c)

    # Read the module attribute only after the search has run, so the frame
    # picked up here is the one belonging to this search.
    Tweets_df = twint.storage.panda.Tweets_df
    Tweets_df.to_csv(f'{search}.csv')

## Scrape one keyword

In [None]:
# Scrape a single term over one date window; the result is written to 'GBV.csv'.
get_tweets(search='GBV', since='2021-05-01', until='2021-06-01')

In [None]:
# Verify that the scraped tweets were written to disk: load the CSV back
# and display it ordered by the 'date' column.
(
    pd.read_csv('GBV.csv')
      .sort_values(by='date')
)

## Scrape multiple keywords

Twint only allows searching for one term or one Twitter user at a time, so we need a loop that goes through all the keywords.

In [None]:
# Scrape every glossary term over the same date window.
# get_tweets works purely by side effect (it writes '<term>.csv' and returns
# nothing), so use a plain loop instead of building a throwaway list of None.
for term in glossary:
    get_tweets(term, '2021-05-01', '2021-06-01')

In [None]:
# Read every per-term CSV back to confirm it exists and is parseable.
# The frames are kept in a dict keyed by search term, and only a row count per
# term is displayed instead of dumping every DataFrame into the cell output.
tweets_by_term = {term: pd.read_csv(f'{term}.csv') for term in glossary}
pd.Series({term: len(frame) for term, frame in tweets_by_term.items()}, name='n_tweets')

# Save everything to Google Drive

In [42]:
# Mount Google Drive at the relative path 'drive' so the CSV files can be
# copied into 'drive/My Drive/' by the cells below.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [114]:
# One keyword
GBV = pd.read_csv('GBV.csv').sort_values('date')

GBV.to_csv('GBV.csv')
!cp GBV.csv "drive/My Drive/"

In [113]:
# Multiple keywords

for i in glossary:
    try:
        f = pd.read_csv(f'{i}.csv').sort_values('date')
        f.to_csv(f'{i}.csv')
        !cp {i}.csv 'drive/My Drive/'
        
    except:
        f = pd.read_csv(f'{i}.csv')
        f.to_csv(f'{i}.csv')
        !cp {i}.csv 'drive/My Drive/'