In [26]:
import requests
import pandas as pd
import time
from langdetect import detect

# --- Twitter API Setup ---
# SECURITY: a bearer token used to be hardcoded on this line. Anything that was
# committed or shared with the notebook must be treated as leaked, so revoke
# and regenerate that credential. The token is now read from the
# TWITTER_BEARER_TOKEN environment variable and never stored in the notebook.
import os
import warnings

BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not BEARER_TOKEN:
    # Warn instead of raising so the remaining definitions still load; without
    # a token the search request itself is expected to be rejected.
    warnings.warn(
        "TWITTER_BEARER_TOKEN is not set; requests to the Twitter API will be unauthenticated."
    )

# Authorization header sent with every search request.
headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
search_url = 'https://api.twitter.com/2/tweets/search/recent'

# --- Search request definition ---
# Search expression sent to the endpoint as the `query` parameter.
query = '(transport OR fare OR public transport) Rwanda'

# Request parameters, assembled key by key (insertion order matches the
# order in which they are listed here).
params = {}
params['query'] = query
params['max_results'] = 100
params['tweet.fields'] = 'created_at,text,author_id,lang'

# Accumulator for fetched tweet objects, and the pagination cursor, which
# stays None until a response supplies one.
all_tweets, next_token = [], None

# --- Fetch Tweets with Pagination and Retry ---
# Appends each page's `data` list to `all_tweets` and follows the `next_token`
# cursor from the response's `meta` object until it runs out, no data comes
# back, or MAX_PAGES pages have been read.
MAX_PAGES = 10                     # up to ~1000 tweets
MAX_RATE_LIMIT_WAITS = 3           # stop instead of sleeping indefinitely
RATE_LIMIT_WAIT_SECONDS = 15 * 60
REQUEST_TIMEOUT_SECONDS = 30       # never hang forever on a stalled connection

pages_fetched = 0
rate_limit_waits = 0

while pages_fetched < MAX_PAGES:
    if next_token:
        params['next_token'] = next_token

    response = requests.get(
        search_url,
        headers=headers,
        params=params,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code == 429:
        # A 429 is also returned when the usage cap is exhausted (message such
        # as "Usage cap exceeded: Monthly product cap"); a 15-minute wait
        # cannot fix that, so stop right away.
        body_lower = response.text.lower()
        if 'usage cap' in body_lower or 'usagecapexceeded' in body_lower:
            print("Usage cap exceeded; not retrying:", response.text)
            break
        if rate_limit_waits >= MAX_RATE_LIMIT_WAITS:
            print(f"Still rate limited after {rate_limit_waits} waits. Giving up.")
            break
        rate_limit_waits += 1
        print("Rate limit hit. Waiting 15 minutes...")
        time.sleep(RATE_LIMIT_WAIT_SECONDS)
        continue  # retry the same page; waits do not consume the page budget

    pages_fetched += 1

    try:
        result = response.json()
    except ValueError:
        # Error pages are not guaranteed to be JSON; report and stop cleanly.
        print(f"Non-JSON response (HTTP {response.status_code}):", response.text[:200])
        break

    if 'data' in result:
        all_tweets.extend(result['data'])
    else:
        print("No data returned:", result)
        break

    next_token = result.get('meta', {}).get('next_token')
    if not next_token:
        break

    time.sleep(1)

# --- Convert to DataFrame ---
# Builds a frame from `all_tweets`, keeps only tweets detected as English, and
# writes date/user/text to data/expanded_tweets.csv.
import os

from langdetect.lang_detect_exception import LangDetectException


def detect_language(text):
    """Return langdetect's language code for `text`, or "unknown".

    "unknown" is returned for non-string values and when langdetect raises
    LangDetectException (it does so when it cannot extract any features,
    e.g. from text made only of URLs, digits or emoji). Without this guard a
    single such tweet aborts the whole cell.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


df = pd.DataFrame(all_tweets)

if df.empty:
    print("No tweets collected.")
else:
    # --- Language Filtering ---
    # Note: this overwrites the `lang` value returned by the API with
    # langdetect's own guess, as the original cell did.
    df["lang"] = df["text"].apply(detect_language)
    df = df[df["lang"] == "en"]

    # --- Cleanup and Save ---
    # rename() returns a new frame, so nothing is mutated on a filtered slice.
    df = df.rename(columns={'created_at': 'date', 'author_id': 'user'})[["date", "user", "text"]]
    os.makedirs("data", exist_ok=True)  # to_csv does not create missing directories
    df.to_csv("data/expanded_tweets.csv", index=False)
    print(f"✅ Saved {len(df)} English tweets to data/expanded_tweets.csv")

    print(df.head())


Rate limit hit. Waiting 15 minutes...


KeyboardInterrupt: 

In [25]:
import requests
import pandas as pd
import time
import re
from langdetect import detect
from deep_translator import GoogleTranslator

# --- Twitter API Setup ---
# SECURITY: a bearer token used to be hardcoded on this line. Anything that was
# committed or shared with the notebook must be treated as leaked, so revoke
# and regenerate that credential. The token is now read from the
# TWITTER_BEARER_TOKEN environment variable and never stored in the notebook.
import os
import warnings

BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not BEARER_TOKEN:
    # Warn instead of raising so the remaining definitions still load; without
    # a token the search request itself is expected to be rejected.
    warnings.warn(
        "TWITTER_BEARER_TOKEN is not set; requests to the Twitter API will be unauthenticated."
    )

# Authorization header sent with every search request.
headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
search_url = 'https://api.twitter.com/2/tweets/search/recent'

# --- Search request definition ---
# Search expression sent to the endpoint as the `query` parameter.
query = '("transport fare" OR "bus fare" OR "fare increase" OR "distance based fare") Rwanda'

# Request parameters, assembled key by key (insertion order matches the
# order in which they are listed here).
params = {}
params['query'] = query
params['max_results'] = 100
params['tweet.fields'] = 'created_at,text,author_id,lang'

# Collection target checked by the fetch loop, the accumulator for fetched
# tweet objects, and the pagination cursor (None until a response supplies one).
MAX_TWEETS = 30
all_tweets, next_token = [], None

# --- Country Filter: Drop tweets mentioning other countries ---
EXCLUDE_COUNTRIES = [
    'kenya', 'uganda', 'tanzania', 'nigeria', 'ghana', 'ethiopia', 'congo',
    'burundi', 'zimbabwe', 'south africa', 'zambia', 'somalia', 'mali', 'senegal',
    'morocco', 'algeria', 'libya', 'cameroon', 'angola', 'botswana', 'sudan',
    'egypt', 'tunisia', 'malawi', 'namibia'
]

# Whole-word, case-insensitive matcher built once from EXCLUDE_COUNTRIES.
# Plain substring tests misfire on ordinary words ('mali' is inside
# "normalizes" and "formalities"), so a country name must start at a word
# boundary. The optional suffix keeps demonyms and plurals that the substring
# test used to catch (Kenyan, Ghanaians, Congolese, Sudanese, ...).
_EXCLUDE_COUNTRIES_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(name) for name in EXCLUDE_COUNTRIES) + r")"
    r"(?:n|an|ian|ese|lese)?s?\b",
    re.IGNORECASE,
)


def mentions_other_country(text):
    """Return True if `text` names a country listed in EXCLUDE_COUNTRIES.

    Matching is case-insensitive and anchored on word boundaries. Non-string
    values (for example NaN in a DataFrame column) return False instead of
    raising.
    """
    if not isinstance(text, str):
        return False
    return _EXCLUDE_COUNTRIES_RE.search(text) is not None

# --- Fetch Tweets with Pagination and Retry ---
# Appends each page's `data` list to `all_tweets` and follows the `next_token`
# cursor from the response's `meta` object until MAX_TWEETS tweets have been
# collected, the cursor runs out, or no data comes back.
MAX_RATE_LIMIT_WAITS = 3           # stop instead of sleeping indefinitely
RATE_LIMIT_WAIT_SECONDS = 15 * 60
REQUEST_TIMEOUT_SECONDS = 30       # never hang forever on a stalled connection

rate_limit_waits = 0

while len(all_tweets) < MAX_TWEETS:
    if next_token:
        params['next_token'] = next_token

    # Ask only for what is still missing, to avoid spending quota on tweets
    # that would be discarded. The recent-search endpoint documents
    # max_results as a value between 10 and 100.
    params['max_results'] = max(10, min(100, MAX_TWEETS - len(all_tweets)))

    response = requests.get(
        search_url,
        headers=headers,
        params=params,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code == 429:
        # A 429 is also returned when the usage cap is exhausted (message such
        # as "Usage cap exceeded: Monthly product cap"); a 15-minute wait
        # cannot fix that, so stop right away.
        body_lower = response.text.lower()
        if 'usage cap' in body_lower or 'usagecapexceeded' in body_lower:
            print("Usage cap exceeded; not retrying:", response.text)
            break
        if rate_limit_waits >= MAX_RATE_LIMIT_WAITS:
            print(f"Still rate limited after {rate_limit_waits} waits. Giving up.")
            break
        rate_limit_waits += 1
        print("Rate limit hit. Waiting 15 minutes...")
        time.sleep(RATE_LIMIT_WAIT_SECONDS)
        continue

    try:
        result = response.json()
    except ValueError:
        # Error pages are not guaranteed to be JSON; report and stop cleanly.
        print(f"Non-JSON response (HTTP {response.status_code}):", response.text[:200])
        break

    if 'data' in result:
        batch = result['data']
        all_tweets.extend(batch)
        if len(all_tweets) >= MAX_TWEETS:
            break
    else:
        print("No data returned:", result)
        break

    next_token = result.get('meta', {}).get('next_token')
    if not next_token:
        break

    time.sleep(1)

# Enforce the cap: the last page may have pushed the list past MAX_TWEETS.
del all_tweets[MAX_TWEETS:]

# --- Convert to DataFrame ---
# Builds a frame from `all_tweets`, drops tweets that mention other countries,
# detects each tweet's language, translates the non-English ones to English
# and writes the result to data/expanded_tweets.csv.
import os

from langdetect.lang_detect_exception import LangDetectException


def detect_language(text):
    """Return langdetect's language code for `text`, or "unknown".

    "unknown" is returned for non-string values and when langdetect raises
    LangDetectException (it does so when it cannot extract any features,
    e.g. from text made only of URLs, digits or emoji).
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


def safe_translate(text):
    """Translate `text` to English; return "" if it is blank or translation fails.

    Best effort by design: one failed translation must not abort the batch.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        print("Translation error:", e)
        return ""


df = pd.DataFrame(all_tweets)

if df.empty:
    print("No tweets collected.")
else:
    # Remove tweets that mention other countries.
    # .copy() so the column assignment below works on an independent frame.
    df = df[~df["text"].apply(mentions_other_country)].copy()

    # Language Detection (overwrites the API-provided `lang`, as before)
    df["lang"] = df["text"].apply(detect_language)

    # Separate by language.
    # langdetect ships no Kinyarwanda profile, so it never returns "rw"; the
    # previous `lang == "rw"` filter was therefore always empty and every
    # Kinyarwanda tweet was dropped. All detectable non-English tweets are now
    # sent to the translator, which auto-detects the source language.
    # `df_rw` keeps its name for any later cell that refers to it.
    df_en = df[df["lang"] == "en"].copy()
    df_rw = df[(df["lang"] != "en") & (df["lang"] != "unknown")].copy()

    if not df_rw.empty:
        print(f"Translating {len(df_rw)} non-English tweets...")
        df_rw["translated_text"] = df_rw["text"].apply(safe_translate)
    else:
        df_rw["translated_text"] = []

    # English tweets don't need translation
    df_en["translated_text"] = df_en["text"]

    # Merge and clean. Chained calls each return a new frame, so no column is
    # assigned on a slice.
    final_df = (
        pd.concat([df_en, df_rw], ignore_index=True)
        .rename(columns={'created_at': 'date', 'author_id': 'user'})
        .loc[:, ["date", "user", "text", "translated_text", "lang"]]
        .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.tz_localize(None))
    )

    os.makedirs("data", exist_ok=True)  # to_csv does not create missing directories
    final_df.to_csv("data/expanded_tweets.csv", index=False)
    print(f"✅ Saved {len(final_df)} tweets after filtering.")
    print(final_df.head())


Rate limit hit. Waiting 15 minutes...


KeyboardInterrupt: 

In [19]:
!pip install deep_translator


Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [30]:
import tweepy
import pandas as pd

# Credentials come from the TWITTER_BEARER_TOKEN environment variable.
# SECURITY: a bearer token used to be hardcoded here; treat it as leaked and
# revoke/regenerate it.
import os

bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not bearer_token:
    print("Warning: TWITTER_BEARER_TOKEN is not set; the search request is expected to fail.")

client = tweepy.Client(bearer_token=bearer_token)

query = '("transport fare" OR "bus fare" OR "fare increase" OR "distance based fare") Rwanda -is:retweet'
tweets = client.search_recent_tweets(query=query, tweet_fields=['created_at', 'text', 'author_id', 'lang'], max_results=100)

# `tweets.data` is None (not an empty list) when nothing matched, so fall back
# to an empty list instead of raising TypeError in the loop.
data = [
    {
        'date': tweet.created_at,
        'user': tweet.author_id,
        'text': tweet.text,
        'lang': tweet.lang,
    }
    for tweet in (tweets.data or [])
]

# Explicit columns keep the CSV header stable even when there are no rows.
df = pd.DataFrame(data, columns=['date', 'user', 'text', 'lang'])
df.to_csv('transport_fare_tweets.csv', index=False)


TooManyRequests: 429 Too Many Requests
Usage cap exceeded: Monthly product cap

In [28]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.15.0-py3-none-any.whl.metadata (4.1 kB)
Collecting oauthlib<4,>=3.2.0 (from tweepy)
  Downloading oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)
Collecting requests-oauthlib<3,>=1.2.0 (from tweepy)
  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Downloading tweepy-4.15.0-py3-none-any.whl (99 kB)
Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: oauthlib, requests-oauthlib, tweepy
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [tweepy]
[1A[2KSuccessfully installed oauthlib-3.2.2 requests-oauthlib-2.0.0 tweepy-4.15.0
