In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import snscrape.modules.twitter as sntwitter

In [3]:
def get_data_twitter(query, limits):
    """Scrape tweets matching ``query`` and return them as a DataFrame.

    One ``sntwitter.TwitterSearchScraper`` is created for the query and its
    ``get_items()`` iterator is consumed until ``limits`` tweets have been
    collected or the iterator runs out, whichever happens first.

    Parameters
    ----------
    query : str
        Search string passed unchanged to ``TwitterSearchScraper``.
    limits : int
        Maximum number of tweets to collect. A value <= 0 returns an empty
        frame without starting a search.

    Returns
    -------
    pandas.DataFrame
        One row per collected tweet, in the order the scraper yielded them,
        with the columns ``date``, ``content``, ``username``, ``tweet_url``,
        ``reply_count``, ``retweet_count``, ``like_count``, ``verified`` and
        ``followers``. The frame has these columns even when it is empty.
    """
    columns = [
        "date",
        "content",
        "username",
        "tweet_url",
        "reply_count",
        "retweet_count",
        "like_count",
        "verified",
        "followers",
    ]
    tweets = []

    if limits > 0:
        # Consume a single iterator only: building another scraper for the
        # same query restarts the search at its first result, which would
        # append the same tweets a second time.
        scraper = sntwitter.TwitterSearchScraper(query)
        for tweet in scraper.get_items():
            tweets.append(
                {
                    "date": tweet.date,
                    "content": tweet.rawContent,
                    "username": tweet.user.username,
                    "tweet_url": tweet.url,
                    "reply_count": tweet.replyCount,
                    "retweet_count": tweet.retweetCount,
                    "like_count": tweet.likeCount,
                    "verified": tweet.user.verified,
                    "followers": tweet.user.followersCount,
                }
            )
            # Stop right after the last wanted tweet so no extra item is
            # pulled from the scraper.
            if len(tweets) >= limits:
                break

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(tweets, columns=columns)


In [4]:
# Upper bound on the number of tweets to collect.
limits = 200
# Search string handed to the scraper unchanged. The operators in it
# (min_replies, min_faves, min_retweets, lang, since, -filter:links) are
# presumably interpreted by Twitter search itself -- nothing in this
# notebook parses them.
query = "THR min_replies:1 min_faves:1 min_retweets:1 lang:id since:2023-01-01 -filter:links"

# Scrape once and keep the result as the base frame for the later cells.
df_raw = get_data_twitter(query=query, limits=limits)

In [5]:
# Preview the first rows of the scraped tweets.
df_raw.head()

Unnamed: 0,date,content,username,tweet_url,reply_count,retweet_count,like_count,verified,followers
0,2023-04-08 15:01:24+00:00,Dr Piutang THR\nDr Piutang Gaji\n Cr Pen...,txtdrakuntansi,https://twitter.com/txtdrakuntansi/status/1644...,1,6,11,False,59697
1,2023-04-08 14:52:40+00:00,@islabellecoco @gojekindonesia Sedikit curhat ...,gummypark61,https://twitter.com/gummypark61/status/1644714...,1,1,1,False,14
2,2023-04-08 14:45:03+00:00,rep dibawah sini yg mau spay thr receh 1.000 u...,yufada_,https://twitter.com/yufada_/status/16447129669...,26,1,1,False,113
3,2023-04-08 13:39:44+00:00,Selamat kepada :\n@emirahay82\n@0M_YANT0\n@Mis...,mindaart,https://twitter.com/mindaart/status/1644696530...,12,1,5,False,1551
4,2023-04-08 12:51:00+00:00,ak supres pake link thr sama daget yh yg ke 4,haelovelychan,https://twitter.com/haelovelychan/status/16446...,18,2,35,False,57379


In [6]:
import re
import string

def clean_tweet(tweet):
    """
    Normalize the text of a tweet for downstream processing.

    The steps run in this order: drop mentions, hashtags and links; delete
    ASCII punctuation; lowercase; collapse whitespace; drop single-character
    tokens unless they are digits.

    Parameters:
    tweet (str): Raw tweet text.

    Returns:
    str: The cleaned text, with tokens separated by single spaces.
    """
    # Remove every run of non-whitespace that begins with "@", "#" or "http".
    without_entities = re.sub(r"(?:\@|\#|http)\S+", "", tweet)

    # Delete (not replace) each character listed in string.punctuation,
    # then fold the text to lowercase.
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = without_entities.translate(punctuation_table).lower()

    # Digits are deliberately kept; only whitespace is normalized here.
    normalized = re.sub(r"\s+", " ", lowered).strip()

    # Keep tokens longer than one character, plus single-character tokens
    # that are digits (e.g. "4").
    kept_tokens = []
    for token in normalized.split():
        if token.isdigit() or len(token) > 1:
            kept_tokens.append(token)

    return " ".join(kept_tokens)

In [7]:
# Add a cleaned copy of every tweet's text; the original `content` column is kept.
df_raw['content_clean'] = df_raw['content'].apply(clean_tweet)

In [8]:
# Show raw and cleaned text side by side to spot-check the cleaning.
df_raw[['content', 'content_clean']].head()

Unnamed: 0,content,content_clean
0,Dr Piutang THR\nDr Piutang Gaji\n Cr Pen...,dr piutang thr dr piutang gaji cr pendapatan j...
1,@islabellecoco @gojekindonesia Sedikit curhat ...,sedikit curhat bahkan pas korona aja pernah ma...
2,rep dibawah sini yg mau spay thr receh 1.000 u...,rep dibawah sini yg mau spay thr receh 1000 un...
3,Selamat kepada :\n@emirahay82\n@0M_YANT0\n@Mis...,selamat kepada pemenang ga thr masing2 100 spa...
4,ak supres pake link thr sama daget yh yg ke 4,ak supres pake link thr sama daget yh yg ke 4


In [27]:
from googletrans import Translator
from tqdm.auto import tqdm

# One translator client for the whole run, and a mapping from cleaned tweet
# text to its English translation.
translator = Translator()
translation = {}

for element in tqdm(df_raw.content_clean):
    # Identical cleaned texts need only one request. A text whose translation
    # failed is never stored, so it is attempted again if it reappears.
    if element in translation:
        continue
    try:
        # translate each element and store it in the dictionary translation
        translation[element] = translator.translate(element, dest='en').text
    except Exception as e:
        # Best effort: report the failure and continue with the next text.
        # The failed text stays out of `translation`.
        print(f"An error occurred while translating {element}: {e}")


100%|██████████| 200/200 [04:11<00:00,  1.26s/it]


In [31]:
# Look up each cleaned text in the `translation` dict; texts that have no
# entry there end up as NaN in the new column.
df_raw['content_translated'] = df_raw['content_clean'].map(translation)

In [33]:
# Preview the frame again, now including the cleaned and translated columns.
df_raw.head()

Unnamed: 0,date,content,username,tweet_url,reply_count,retweet_count,like_count,verified,followers,content_clean,content_translated
0,2023-04-08 15:01:24+00:00,Dr Piutang THR\nDr Piutang Gaji\n Cr Pen...,txtdrakuntansi,https://twitter.com/txtdrakuntansi/status/1644...,1,6,11,False,59697,dr piutang thr dr piutang gaji cr pendapatan j...,from the receivables of the THR receivables fr...
1,2023-04-08 14:52:40+00:00,@islabellecoco @gojekindonesia Sedikit curhat ...,gummypark61,https://twitter.com/gummypark61/status/1644714...,1,1,1,False,14,sedikit curhat bahkan pas korona aja pernah ma...,a little vent even when corona just entered fu...
2,2023-04-08 14:45:03+00:00,rep dibawah sini yg mau spay thr receh 1.000 u...,yufada_,https://twitter.com/yufada_/status/16447129669...,26,1,1,False,113,rep dibawah sini yg mau spay thr receh 1000 un...,Rep below are those who want to spay THR DREH ...
3,2023-04-08 13:39:44+00:00,Selamat kepada :\n@emirahay82\n@0M_YANT0\n@Mis...,mindaart,https://twitter.com/mindaart/status/1644696530...,12,1,5,False,1551,selamat kepada pemenang ga thr masing2 100 spa...,Congratulations to the winner of GA THR each 1...
4,2023-04-08 12:51:00+00:00,ak supres pake link thr sama daget yh yg ke 4,haelovelychan,https://twitter.com/haelovelychan/status/16446...,18,2,35,False,57379,ak supres pake link thr sama daget yh yg ke 4,A suppress packed link


In [34]:
# Write the whole frame to CSV without the index column. The path is relative
# to the current working directory.
# NOTE(review): to_csv is not expected to create ../data/raw if it is missing -- confirm the folder exists.
df_raw.to_csv("../data/raw/tweets.csv", index=False)