In [1]:
import pandas as pd
import json
import nltk

## Load Data

In [2]:
# Reddit Data

# Read the annotation export. JSON text is UTF-8, so the encoding is pinned
# instead of depending on the platform default (which is not UTF-8 everywhere).
with open('data/project-4-at-2023-05-09-06-21-750d776a.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One dict per fully-labelled comment; the frame is built once at the end.
records = []

for row in data:
    # A task without any annotation cannot be labelled, so skip it instead of
    # failing on the [0] lookup below.
    if not row['annotations']:
        continue

    # Only the first annotation of each task is read. Each label of interest
    # takes the first entry of its 'choices' list.
    labels = {'sentiment': None, 'language': None, 'vaccine': None}
    for annotation in row['annotations'][0]['result']:
        if annotation['from_name'] in labels:
            labels[annotation['from_name']] = annotation['value']['choices'][0]

    # Keep the row only when all three labels are present.
    if any(value is None for value in labels.values()):
        continue

    records.append({
        'id': row['id'],
        'tweet': row['data']['body'],
        'sentiment': labels['sentiment'],
        'language': labels['language'],
        'vaccine': labels['vaccine'],
        'date': row['data']['created'],
    })

# Passing `columns` keeps the schema stable even when no row survived the filter.
df1 = pd.DataFrame(
    records,
    columns=['id', 'tweet', 'sentiment', 'language', 'vaccine', 'date'],
)

# 'created' is parsed as an epoch timestamp expressed in seconds.
df1['date'] = pd.to_datetime(df1['date'], unit='s')

In [3]:
# Twitter Data

# Read the annotation export. JSON text is UTF-8, so the encoding is pinned
# instead of depending on the platform default (which is not UTF-8 everywhere).
with open('data/project-3-at-2023-04-29-16-34-f2524c9c.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One dict per fully-labelled tweet; the frame is built once at the end.
records = []

for row in data:
    # A task without any annotation cannot be labelled, so skip it instead of
    # failing on the [0] lookup below.
    if not row['annotations']:
        continue

    # Only the first annotation of each task is read. Each label of interest
    # takes the first entry of its 'choices' list.
    labels = {'sentiment': None, 'language': None, 'vaccine': None}
    for annotation in row['annotations'][0]['result']:
        if annotation['from_name'] in labels:
            labels[annotation['from_name']] = annotation['value']['choices'][0]

    # Keep the row only when all three labels are present.
    if any(value is None for value in labels.values()):
        continue

    records.append({
        'id': row['data']['id'],
        'tweet': row['data']['Tweets'],
        'sentiment': labels['sentiment'],
        'language': labels['language'],
        'vaccine': labels['vaccine'],
        'date': row['data']['date'],
    })

# Passing `columns` keeps the schema stable even when no row survived the filter.
df2 = pd.DataFrame(
    records,
    columns=['id', 'tweet', 'sentiment', 'language', 'vaccine', 'date'],
)

# The '+00:00' suffix is matched literally, so the result is a naive datetime64
# column. It is deliberately NOT formatted back to strings: the Reddit frame
# stores real datetimes, and a string column here would leave the concatenated
# 'date' column with mixed Timestamp/str values.
df2['date'] = pd.to_datetime(df2['date'], format='%Y-%m-%d %H:%M:%S+00:00')

In [4]:
# Tag every row with the platform it was collected from
for frame, platform in ((df1, 'Reddit'), (df2, 'Twitter')):
    frame['source'] = platform

# Stack both sources on a fresh 0..n-1 index, then drop rows whose text
# repeats an earlier row
df = pd.concat([df1, df2], ignore_index=True).drop_duplicates(subset=['tweet'])
df.shape

(8451, 7)

## Data Cleaning

In [7]:
# Normalise the text in a single chain:
#   1. lowercase everything,
#   2. delete runs of digits,
#   3. delete every character that is neither a word character nor whitespace
#      (underscores survive because \w matches them).
df['tweet'] = (
    df['tweet']
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
)

### Tokenization

In [8]:
# NLTK tweet tokenizer, constructed with its default options
tokenizer = nltk.tokenize.TweetTokenizer()

# Turn each cleaned text into a list of tokens, one list per row
df['tokenized_tweet'] = df['tweet'].map(tokenizer.tokenize)
df['tokenized_tweet'].head()

0    [me, personally, ill, probably, be, taking, a,...
1    [hello, im, one, of, the, people, whos, very, ...
2    [hello, im, one, of, the, people, whos, very, ...
3    [thanks, for, sharing, will, share, my, experi...
4    [may, published, phase, kaso, the, thing, is, ...
Name: tokenized_tweet, dtype: object

### Stemming

In [8]:
from cebstemmer import stemmer
from simplemma import text_lemmatizer
from simplemma.langdetect import lang_detector

In [9]:
# Domain terms that are passed through unchanged instead of being stemmed
SKIP_WORDS = {
    "vaccine", "vaccines", "vaccination", "vaccinated",
    "covid", "covid19", "covid-19", "covidvaccine", "coronavirus",
    "moderna", "sinovac", "astrazeneca", "pfizer", "sputnik",
}

# Language label -> callable applied to a single token
STEMMERS = {
    'Cebuano': stemmer.stem_word,
    'Tagalog': lambda token: text_lemmatizer(token, lang='tl'),
    'English': lambda token: text_lemmatizer(token, lang='en'),
}

def stem_words_by_language(tok, lang, skip_words=SKIP_WORDS):
    """Stem or lemmatize each token according to the row's language label.

    Args:
        tok: iterable of tokens for one document.
        lang: language label; looked up in ``STEMMERS``.
        skip_words: tokens that are copied to the output unchanged.

    Returns:
        A new list with one entry per input token. Each entry is the token
        itself (skip words, or tokens detected as neither 'tl' nor 'en') or
        whatever the selected stemmer/lemmatizer returned for it.
        NOTE(review): ``fix_tokens`` in this notebook unwraps list-valued
        entries, which suggests the lemmatizer may return lists - confirm.
    """
    stemmer_func = STEMMERS.get(lang)

    newtok = []
    for token in tok:
        if token in skip_words:
            newtok.append(token)
        elif stemmer_func:
            newtok.append(stemmer_func(token))
        else:
            # No stemmer registered for this label: detect the language of the
            # individual token. Detection runs once per token and its result
            # picks the lemmatizer language.
            detected = lang_detector(token, lang=('tl', 'en'))[0][0]
            if detected in ('tl', 'en'):
                newtok.append(text_lemmatizer(token, lang=detected))
            else:
                newtok.append(token)

    return newtok

In [11]:
# stem the words
# df['stemmed_tweets'] =  df.apply(lambda x: stem_words_by_language(x['tokenized_tweet'], x['language']), axis=1)

def fix_tokens(row):
    """Flatten list-valued entries of a token row.

    A list entry is replaced by its first element, an empty list entry is
    dropped, and every non-list entry is kept as is.

    Args:
        row: iterable of tokens, some of which may be lists.

    Returns:
        A new flat list in the original order.
    """
    flattened = []
    for item in row:
        if not isinstance(item, list):
            flattened.append(item)
        elif item:
            # non-empty list: keep only its first element
            flattened.append(item[0])
    return flattened

# df['stemmed_tweets'] = df['stemmed_tweets'].apply(fix_tokens)

In [13]:
import string
import emoji

def remove_unwanted_tokens(row):
    """Drop None, punctuation-only, blank and single-character tokens.

    Tokens containing at least one emoji are always kept, whatever their
    length.

    Args:
        row: iterable of string tokens; None entries are allowed.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    new_row = []
    for token in row:
        if token is None:
            continue
        # The emoji check comes first so that single-character emoji are not
        # removed by the length filter below.
        if emoji.emoji_count(token) > 0:
            new_row.append(token)
            continue
        # Character-wise test: `token in string.punctuation` is a substring
        # check and would let runs such as "..." or "!!" through. all() over
        # an empty string is True, so blank tokens are dropped here as well.
        if all(char in string.punctuation for char in token):
            continue
        # if single character, skip
        if len(token) == 1:
            continue
        new_row.append(token)
    return new_row
            

# df['stemmed_tweets'] = df['stemmed_tweets'].apply(remove_unwanted_tokens)
# df

### Stopwords
 
Source of the stop-word lists: [eanunez/stop_words on GitHub](https://github.com/eanunez/stop_words)

In [9]:
# read cebuano stopwords
from stopwords import CEBUANO_STOP_WORDS, ENGLISH_STOP_WORDS, TAGALOG_STOP_WORDS

# Remove stopwords
def remove_stopwords(tokens, lang):
    """Filter stop words out of a token list based on the language label.

    English rows use the English list; Cebuano rows use Cebuano + English;
    Tagalog and Taglish rows use Tagalog + English. The extra words 'yung',
    'yun' and 'lang' are removed for every recognised label.

    Args:
        tokens: iterable of tokens.
        lang: language label of the row.

    Returns:
        A new list without stop words, or ``tokens`` itself (unchanged) when
        the label is not one of the four recognised values.
    """
    if lang == 'English':
        base = ENGLISH_STOP_WORDS
    elif lang in ('Tagalog', 'Taglish'):
        base = TAGALOG_STOP_WORDS | ENGLISH_STOP_WORDS
    elif lang == 'Cebuano':
        base = CEBUANO_STOP_WORDS | ENGLISH_STOP_WORDS
    else:
        # unrecognised label: nothing to filter against
        return tokens

    # extra filler words on top of the imported lists
    blocked = base | frozenset(['yung', 'yun', 'lang'])

    return [word for word in tokens if word not in blocked]

In [10]:
# df['no_stopwords_tweet'] = df.apply(lambda x: remove_stopwords(x['stemmed_tweets'], x['language']), axis=1)

# Stop-word removal runs on the raw tokens; each row's language label selects
# the stop-word set.
df['no_stopwords_tweet'] = df.apply(
    lambda record: remove_stopwords(record['tokenized_tweet'], record['language']),
    axis=1,
)
# Remove blank tokens
def remove_blank_tokens(tokens):
    """Return a new list holding every token that is not the empty string."""
    return [item for item in tokens if item != '']

# Strip empty-string tokens from every row's token list
df['no_stopwords_tweet'] = df['no_stopwords_tweet'].map(remove_blank_tokens)

In [11]:
# Keep only rows with at least two remaining tokens (rows with zero or one
# token are dropped). The explicit copy makes the filtered frame independent
# of the original, so adding a column below does not write to a slice and
# cannot raise SettingWithCopyWarning.
df = df[df['no_stopwords_tweet'].map(len) > 1].copy()

# Join each token list back into a single space-separated string
df['text'] = df['no_stopwords_tweet'].map(' '.join)

In [12]:
# Write the preprocessed frame to CSV, without the index column
df.to_csv(path_or_buf='data/preprocessed_data.csv', index=False)

# Write the same frame as a pickle
df.to_pickle(path='data/preprocessed_data.pkl')