In [1]:
# !pip install twikit

In [1]:
import pandas as pd
import asyncio
import random
import os
import nest_asyncio 
from dotenv import load_dotenv
from twikit import Client

In [2]:
# Let asyncio event loops nest so coroutines can be awaited from inside the
# notebook's already-running loop.
nest_asyncio.apply()

# Read the X/Twitter credentials from the local .env file.
# override=True matters here: by default python-dotenv keeps any value that is
# already present in the process environment, and USERNAME is pre-populated by
# the operating system on Windows (and some desktop sessions). Without the
# override the OS login name would silently be used instead of the .env value.
load_dotenv(override=True)
USERNAME = os.getenv("USERNAME")
EMAIL = os.getenv("EMAIL")
PASSWORD = os.getenv("PASSWORD")

# Shared twikit client used by the scraping coroutine below.
client = Client('en-US')

In [3]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Build the Sastrawi helpers once at module level so preprocessText can reuse
# the same instances for every text it cleans.
_stemmer_factory = StemmerFactory()
_stopword_factory = StopWordRemoverFactory()
stemmer = _stemmer_factory.create_stemmer()
stopword = _stopword_factory.create_stop_word_remover()
def preprocessText(text):
    """Normalise a raw tweet into a lowercase, stemmed string.

    The input is coerced with ``str()``, lowercased, and passed through a
    fixed sequence of regex substitutions (URLs, mentions/hashtags,
    non-letters, newlines, tabs). The module-level Sastrawi ``stopword``
    remover and ``stemmer`` are then applied, and finally any run of two or
    more whitespace characters is collapsed to a single space.

    Args:
        text: Any object; it is converted with ``str()`` before processing.

    Returns:
        The cleaned text as a ``str``.
    """
    cleaned = str(text).lower()

    # (pattern, replacement) pairs; the order is significant because later
    # passes operate on the output of earlier ones.
    regex_passes = (
        (r'http\S+', ''),          # drop http/https links
        (r'(@\w+|#\w+)', ''),      # drop @usernames and #hashtags
        (r'[^a-zA-Z\s]', ' '),     # anything that is not a letter or whitespace
        (r'\n', ' '),              # newlines become spaces
        (r'\t', ' '),              # tabs become spaces
    )
    for pattern, replacement in regex_passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Stop words are removed first, then the remainder is stemmed.
    cleaned = stemmer.stem(stopword.remove(cleaned))

    # Squash runs of whitespace left behind by the substitutions above.
    return re.sub(r'\s{2,}', ' ', cleaned)

In [4]:
async def _ensure_session(cookie_file):
    """Authenticate the shared ``client``; return True on success, False on failure."""
    try:
        if os.path.exists(cookie_file):
            # Reuse a saved session instead of logging in again.
            client.load_cookies(cookie_file)
            print("Loaded existing cookies")
        else:
            await client.login(
                auth_info_1=USERNAME,
                auth_info_2=EMAIL,
                password=PASSWORD
            )
            client.save_cookies(cookie_file)
            print("Saved new cookies")
    except Exception as e:
        print(f"Login failed: {str(e)}")
        return False
    return True


def _tweet_to_record(tweet, title):
    """Convert one tweet object into the flat dict stored as a DataFrame row."""
    return {
        'title': title,
        'source': 'twitter',
        'url': f"x.com/{tweet.user.screen_name}/status/{tweet.id}",
        'date': tweet.created_at,
        'content': preprocessText(tweet.text),
        'image': tweet.thumbnail_url,
    }


async def search_and_collect_tweets(query, title='Pantai Sanur Bali Tweets',
                                    max_pages=30, max_tweets=130,
                                    cookie_file='cookies.json'):
    """Search for ``query`` with the shared ``client`` and collect the results.

    Pages of the 'Latest' search results are fetched one after another, with a
    random 10-20 second pause between requests. Every tweet is converted to a
    row with the columns ``title``, ``source``, ``url``, ``date``, ``content``
    (text cleaned by ``preprocessText``) and ``image``.

    Args:
        query: Search string passed to ``client.search_tweet``.
        title: Value stored in the ``title`` column of every row.
        max_pages: Stop after this many pages have been processed.
        max_tweets: Stop once at least this many tweets are collected. The
            check runs after a whole page has been appended, so the result
            may contain more than ``max_tweets`` rows.
        cookie_file: Path of the cookie file to load, or to create after a
            fresh login.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet. An empty DataFrame is
        returned when login or the initial search fails. If a later page
        fails, the tweets gathered so far are returned.
    """
    if not await _ensure_session(cookie_file):
        return pd.DataFrame()

    # Guard the first request the same way as login and pagination: stale
    # cookies or a block would otherwise raise straight out of the coroutine.
    try:
        page = await client.search_tweet(query=query, product='Latest', count=20)
    except Exception as e:
        print(f"Search failed: {e}")
        return pd.DataFrame()

    tweet_data = []
    page_count = 0

    while page:
        page_count += 1
        print(f"Processing page {page_count} (current total tweets: {len(tweet_data)})")

        # Append this page's tweets
        tweet_data.extend(_tweet_to_record(tweet, title) for tweet in page)

        # Check stopping conditions BEFORE fetching (and sleeping for) the next page
        if len(tweet_data) >= max_tweets:
            print(f"Reached {len(tweet_data)} tweets; stopping.")
            break
        if page_count >= max_pages:
            print(f"Fetched {page_count} pages; stopping.")
            break

        # Throttle between requests
        sleep_secs = random.uniform(10, 20)
        print(f"Sleeping for {sleep_secs:.1f}s before next page...")
        await asyncio.sleep(sleep_secs)

        # Try to fetch the next page; keep what we have if that fails
        try:
            page = await page.next()
        except Exception as e:
            print(f"Pagination stopped (blocked or error): {e}")
            break
        if not page:
            print("No more pages; stopping.")
            break

    return pd.DataFrame(tweet_data)

In [5]:
# Base search phrase; every variant below appends date operators to it.
query = "pantai sanur bali"

# Date bounds in YYYY-MM-DD form, as used by the since:/until: operators.
sincedate = '2025-05-08'
untildate = '2025-05-08'
today = pd.to_datetime('today').strftime('%Y-%m-%d')

query_today = f"{query} since:{today}"
query_sincedate = f"{query} since:{sincedate}"
query_untildate = f"{query} until:{untildate}"
# NOTE(review): the upper bound here is `today`, not `untildate` - confirm
# that this is the intended range.
query_sinceuntildate = f"{query} since:{sincedate} until:{today}"
# Example of the since/until format: pantai sanur until:2025-07-14 since:2025-05-08

In [6]:
async def main():
    """Run the tweet search for the module-level ``query`` and return its result."""
    return await search_and_collect_tweets(query)

In [7]:
# Run the scrape and keep its result for the cells below. This relies on the
# notebook kernel accepting `await` at the top level of a cell.
df_tweets = await main()

Login failed: Kami memblokir upaya untuk mengakses akun Anda karena kami tidak sepenuhnya yakin itu benar-benar Anda.

Ini terjadi ketika kami mendapati aktivitas masuk yang tidak biasa, seperti upaya masuk yang terlalu banyak, atau berasal dari lokasi atau perangkat yang berbeda.

Anda perlu menunggu untuk mencoba masuk lagi. Sebagian pemblokiran dicabut secara otomatis.

Dapatkan bantuan terkait masalah masuk.


In [9]:
# Number of rows currently held in df_tweets.
len(df_tweets)

147

In [10]:
# Display the collected tweets.
# NOTE(review): the stored output for this cell shows the columns
# `url, created_at, text, image`, while search_and_collect_tweets builds rows
# with `title, source, url, date, content, image`, and the execution counter
# skips In [8]. Presumably df_tweets came from an earlier run or a removed
# cell - re-run from a fresh kernel to confirm.
df_tweets

Unnamed: 0,url,created_at,text,image
0,x.com/nusabalicom/status/1920689716120240244,Fri May 09 03:58:07 +0000 2025,banyak lima negara ikut juara horseback archer...,https://pbs.twimg.com/card_img/192068951878259...
1,x.com/SejarahBali/status/1920382344390672614,Thu May 08 07:36:44 +0000 2025,suasana pantai sanur tahun luna maya kamasutra...,
2,x.com/Ditpolairudbali/status/1920297413496934562,Thu May 08 01:59:15 +0000 2025,patroli pagi personil bgp atv airud bal pesisi...,
3,x.com/officialinews_/status/1920132117771477296,Wed May 07 15:02:25 +0000 2025,juara kuda manah gelar pantai sanur bal ikut n...,https://pbs.twimg.com/card_img/192013211876973...
4,x.com/NgurahSuryaKu/status/1919577023409951208,Tue May 06 02:16:41 +0000 2025,ngintip sunset pantai sanur bal,
...,...,...,...,...
142,x.com/teman_bus/status/1865214649215651894,Sat Dec 07 02:00:01 +0000 2024,bal satu halte pas banget nemenin kamu siang j...,
143,x.com/hyrupskuyliving/status/1864181793207013688,Wed Dec 04 05:35:48 +0000 2024,bal ituuu pantai sanur,
144,x.com/WiraTourTravel/status/1863479463109145036,Mon Dec 02 07:05:00 +0000 2024,libur keluarga pantai sanur pilih sempurna omb...,https://pbs.twimg.com/card_img/191829378905320...
145,x.com/WiraTourTravel/status/1863478959670755550,Mon Dec 02 07:03:00 +0000 2024,pantai sanur kenal pantai matahari terbit mula...,


In [11]:
csv_file = 'tweets_data.csv'

# Load whatever earlier runs saved. A file without a header row (for example
# one written from an empty DataFrame) makes read_csv raise EmptyDataError,
# so treat that case as "no previous data" instead of failing the cell.
old_df = pd.DataFrame()
if os.path.exists(csv_file):
    try:
        old_df = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        print(f"{csv_file} has no columns; ignoring its contents.")

# New tweets go first so that they win over older copies when de-duplicating.
# Empty frames are skipped so they cannot influence the merged result.
frames = [frame for frame in (df_tweets, old_df) if not frame.empty]
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Re-running the notebook would otherwise append the same tweets again.
# The tweet URL identifies a tweet; rows without a URL are left untouched.
if 'url' in combined_df.columns:
    is_repeat = combined_df['url'].duplicated(keep='first') & combined_df['url'].notna()
    combined_df = combined_df[~is_repeat].reset_index(drop=True)

if combined_df.empty:
    # Nothing scraped and nothing stored: do not create or clobber the file.
    print(f"No tweets to save; {csv_file} left untouched.")
else:
    combined_df.to_csv(csv_file, index=False)