In [None]:
import csv
import os

import pandas as pd
import tweepy
from langdetect import LangDetectException, detect

In [None]:
# Access API credentials from the environment
with open('D:/gpt/text_suicide_detection/text_suicide_detection/api_access.env') as file:
    for line in file:
        if line.startswith('#') or not line.strip():
            continue
        key, value = line.strip().split('=', 1)
        os.environ[key] = value

consumer_key        = os.getenv('consumer_key')
consumer_secret     = os.getenv('consumer_secret')
access_token        = os.getenv('access_token')
access_token_secret = os.getenv('access_token_secret')

# Authenticate Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth,  wait_on_rate_limit=True)

In [None]:
def fetch_tweets(query, count):
    """Search tweets matching ``query`` and return their text as a DataFrame.

    The search goes through the module-level ``api`` client with
    ``lang="id"``, ``tweet_mode="extended"`` and ``result_type="mixed"``.

    Args:
        query: Search query string forwarded as ``q``.
        count: Item limit passed to ``Cursor.items``.

    Returns:
        A DataFrame with a single ``tweets`` column holding the
        ``full_text`` field of every fetched tweet.
    """
    search_cursor = tweepy.Cursor(
        api.search_tweets,
        q=query,
        lang="id",
        tweet_mode="extended",
        result_type="mixed",
    )

    # Collect the raw JSON payload of every status first ...
    payloads = [status._json for status in search_cursor.items(count)]

    # ... then keep only the full text of each one
    texts = [payload["full_text"] for payload in payloads]

    return pd.DataFrame(data=texts, columns=['tweets'])

# Example usage
# "putus asa" is quoted so it is matched as one phrase and acts as a single
# OR alternative; unquoted, the trailing word "asa" would be AND-ed with the
# whole OR group and every result would have to contain it.
query = 'shock OR cape OR lelah OR "putus asa" -filter:retweets'
count = 100  # Specify the number of tweets you want to fetch
df = fetch_tweets(query, count)

In [None]:
def remove_non_indonesian(df, column_name):
    """Keep only the rows whose text is detected as Indonesian (``'id'``).

    Args:
        df: DataFrame holding the text to classify.
        column_name: Name of the column containing the text.

    Returns:
        The rows of ``df`` for which ``detect`` returns ``'id'``, with all
        original columns and index labels preserved.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Ensure that the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    def _is_indonesian(text):
        # langdetect raises LangDetectException for text with no detectable
        # features (e.g. empty or symbol-only strings); treat such rows as
        # non-Indonesian instead of aborting the whole filter.
        try:
            return detect(text) == 'id'
        except LangDetectException:
            return False

    # astype(bool) keeps the mask boolean even when the column is empty
    is_indonesian = df[column_name].apply(_is_indonesian).astype(bool)

    # Return the filtered DataFrame, keeping only the original columns
    return df[is_indonesian]

# Cast the tweet column to str before language detection
df['tweets'] = df['tweets'].astype(str)
# Rebind df to the rows kept by the language filter
df = remove_non_indonesian(df=df, column_name='tweets')

In [None]:
# Write the filtered tweets to CSV: every non-numeric field is quoted and
# the DataFrame index is left out of the file.
df.to_csv(
    "crawled_tweets.csv",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)