## Data Scraping and Preparation

### Scrape Twitter Data

In [1]:
# Optional one-time setup: uncomment the lines below to install the third-party
# packages (twint, sastrawi, nltk) quietly from inside the notebook.
# NOTE(review): presumably these are what the utils.* helper modules import - confirm.
# !pip install -qq twint
# !pip install -qq sastrawi
# !pip install -qq nltk

<ul>
<li>Function <em>scrapKeyword(keyword, since=None, save=True)</em></li>
<li>Function <em>scrapUser(username, keyword=None, since=None, save=True)</em></li>
</ul>

In [2]:
from utils.twint_scrapper import scrapKeyword, scrapUser

# Search term for the scrape; the same value is reused further down to name
# the cleaned output files.
keyword = 'minyak goreng'

# Two scraping modes are available. Keep exactly one of them active and leave
# the other commented out.

# Mode A - scrape by keyword only:
# df1 = scrapKeyword(keyword=keyword, save=True)

# Mode B - scrape a single account (username WITHOUT the leading "@"),
# optionally narrowed down by the keyword:
df2 = scrapUser(
    username='detikcom',
    keyword=keyword,
    save=True,
)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[+] Finished: Successfully collected 70 Tweets from @detikcom.


### Merge Data <em>(if needed)</em>

In [3]:
from utils.twint_scrapper import saveData, mergeData

# Accounts whose raw scrape files (one CSV per account) are combined.
accounts = [
    'detikcom',
    'kumparan',
    'cnnindonesia',
    'tribunnews',
    'liputan6dotcom',
    'kompascom',
    'merdekadotcom',
]

# Raw CSV paths to merge, one per account, in the order listed above.
data_to_merge = [
    f'data/raw/2022-04-12-minyak goreng-{account}.csv'
    for account in accounts
]

# Combine the per-account files into one frame, persist it, then display its
# (rows, columns) shape as the cell output.
df_merge = mergeData(data_to_merge)
saveData(df_merge, "merged_data_minyak_goreng")
df_merge.shape

(651, 38)

### Data Preprocessing
Full Data Preprocessing

In [4]:
import pandas as pd

# The merged frame is the working dataset from here on.
df = df_merge

# Report the row count, the number of repeated tweet ids, and the row count
# once those repeats are dropped (the first occurrence of each id is kept).
rows_before = len(df)
print('Total (before):', rows_before)

duplicate_count = df.duplicated(subset='id').sum()
print('Duplicate:', duplicate_count)

df = df.drop_duplicates(subset=['id'])
rows_after = len(df)
print('Total (after):', rows_after)

# Column / dtype / non-null overview of the de-duplicated frame.
df.info()

Total (before): 651
Duplicate: 0
Total (after): 651
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651 entries, 0 to 650
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               651 non-null    int64  
 1   conversation_id  651 non-null    int64  
 2   created_at       651 non-null    float64
 3   date             651 non-null    object 
 4   timezone         651 non-null    int64  
 5   place            0 non-null      float64
 6   tweet            651 non-null    object 
 7   language         651 non-null    object 
 8   hashtags         651 non-null    object 
 9   cashtags         651 non-null    object 
 10  user_id          651 non-null    int64  
 11  user_id_str      651 non-null    int64  
 12  username         651 non-null    object 
 13  name             651 non-null    object 
 14  day              651 non-null    int64  
 15  hour             651 non-null    int64  
 16  link      

In [5]:
# Remove unused columns.
# The selection is done FIRST and with .copy(): `df` comes from
# drop_duplicates(), and on pandas versions without copy-on-write that result
# is tracked as a copy of the merged frame whenever rows were actually removed.
# Writing a column into it directly would then emit SettingWithCopyWarning.
# Working on an explicit copy avoids that and yields the same final frame.
df = df[[
    'id', 'conversation_id', 'date', 'timezone', 'place', 'tweet',
    'language', 'hashtags', 'user_id', 'username', 'name', 'day', 'hour',
    'link', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'search',
    'user_rt_id', 'user_rt', 'retweet_id', 'retweet_date', 'reply_to',
]].copy()

# Reformat date: the 'date' column is loaded as object dtype; convert it to
# datetime64[ns].
df['date'] = df['date'].astype('datetime64[ns]')

Text/Tweet Data Preprocessing

In [6]:
from tqdm import tqdm
from utils.preprocess import getMentioned, cleanText, stemmingClear

# Enable Series.progress_apply so each pass below shows a progress bar.
tqdm.pandas()

# Derive three new columns from the raw tweet text.
# NOTE(review): judging by their names, the helpers extract mentions, clean the
# text and stem it - confirm against utils/preprocess.py.
raw_tweets = df['tweet']
df['mentioned'] = raw_tweets.progress_apply(getMentioned)
df['tweet_clean'] = raw_tweets.progress_apply(cleanText)
# Stemming runs on the cleaned text, not on the raw tweet.
df['tweet_stem'] = df['tweet_clean'].progress_apply(stemmingClear)

# Independent, text-only frame: identifiers plus raw / cleaned / stemmed tweet.
text_columns = ['id', 'date', 'username', 'tweet', 'tweet_clean', 'tweet_stem']
df_tweet = df[text_columns].copy()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\calvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 651/651 [00:00<00:00, 295411.87it/s]
100%|██████████| 651/651 [00:00<00:00, 3047.83it/s]
100%|██████████| 651/651 [01:29<00:00,  7.27it/s]


Save Dataset

In [7]:
import os

# Output folder for the cleaned datasets (relative to the working directory).
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept unchanged because cells outside this view may still read it.
dir   = r'data/clean/'

# Create the folder (and any missing parents). exist_ok=True makes this a
# no-op when the folder already exists, so no separate exists() check is needed
# and there is no gap between checking and creating.
os.makedirs(dir, exist_ok=True)

# File-name stem shared by both outputs: the keyword with spaces replaced by
# underscores.
file_stem = keyword.replace(" ", "_")

# Save Main Dataset (all retained columns plus the derived text columns).
df.to_csv(f'{dir}{file_stem}.csv', encoding='utf-8')

# Save Text Dataset (the text-only frame).
df_tweet.to_csv(f'{dir}tweet-{file_stem}.csv', encoding='utf-8')