# Count words in each period of time

The periods are: before the election date, on the election date, and after the election date.

In [None]:
from typing import List

import pickle
import datetime
import pandas as pd
import re
import string
from collections import Counter
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words, thai_stopwords
from pythainlp import thai_digits, thai_punctuations, thai_vowels, thai_tonemarks, thai_symbols, thai_signs

##############

# Excel workbook holding the messages, and the worksheet names inside it
msg_file_name = "edang-2019-01-21-2019-05-25.xlsx"
msg_sheet_name = "twitter"
msg_sheet_before, msg_sheet_during, msg_sheet_after = "before", "during", "after"

# Column headers: message text and posting timestamp
msg_column, msg_datetime_column = "Message", "Post time"

# Extra vocabulary added to the tokenizer's dictionary:
# a text file of keywords plus a few hashtags listed inline
keywords_file_name = "edang-keywords.txt"
additional_keywords = {"#พลังประชารัฐ", "#อนาคตใหม่"}

#############

# Timestamp layout used by the Zocial Eye Excel export,
# e.g. 2019-02-15 15:50:56
datetime_format = "%Y-%m-%d %H:%M:%S"

# Election day
election_date = datetime.datetime(year=2019, month=3, day=24)

# The study window spans 62 days on either side of election day.
# It opens on 2019-01-21, two days before the election day announcement
# (2019-01-23), and closes on 2019-05-25.
start_date = election_date - datetime.timedelta(days=62)
end_date = election_date + datetime.timedelta(days=62)

#############

# A token made up only of these characters is not counted as a word
NON_WORD_CHARS = "".join(
    (
        string.whitespace,
        string.punctuation,
        string.digits,
        thai_digits,
        thai_punctuations,
        thai_vowels,
        thai_tonemarks,
        thai_symbols,
        thai_signs,
    )
)

# Placeholder tokens substituted for links, e-mail addresses and laughter
# (normalization, in the hope of better classification).
# The surrounding spaces keep each placeholder apart from neighbouring text.
REPLACE_LINK = " NNLINK "
REPLACE_EMAIL = " NNEMAIL "
REPLACE_HAHA = " NNHAHA "


# <tag>, http://, www., .php, @mention, mail@address.com, hahaha, 555, 1234
# To be normalized
# URL-like text: "scheme:/..." with at least three characters before the
# colon, or anything that starts with "ww." / "www."
RE_HTTP_WWW = re.compile(r"(?:\b\S{3,}:\/{1,}\S*)|(?:[wW]{2,}\.\S+)")

# File names ending in one of the listed web/media extensions,
# optionally followed by a "?query" part
RE_EXT = re.compile(
    r"\w+\.(html|htm|shtm|shtml|cgi|php|php3|asp|aspx|cfm|cfml|jsp|png|gif|jpg|java|class|webp|mp3|mp4|mov|pl|do)(\?\S*)?\b",
    re.IGNORECASE,
)

# E-mail addresses such as mail@address.com
RE_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")

# Laughter: "ha" two or more times (whitespace allowed between copies),
# U+0E16 three or more times, or "5" three or more times.
# NOTE(review): the "." inside (?!.\d) is unescaped, so it matches any
# character -- confirm whether a literal "\." (decimal point) was intended.
RE_HAHA = re.compile(r"\b(?:ha\s*){2,}|\u0E16{3,}|5{3,}(?!.\d)\b", re.IGNORECASE)


# Patterns for collapsing duplicated characters/words: aaaa, ! ! ! !, ะะะะะ

# A short token (1-5 non-space characters) repeated with whitespace between
# the copies.  Group 1 is the token, so substituting r"\1" keeps one copy.
# The lookarounds demand whitespace or a string edge on both sides of the
# run without consuming it, so:
#   * the space in front of the run is kept ("good ha ha" -> "good ha",
#     not "goodha");
#   * a "copy" that is only the start of a longer word is left alone
#     ("no no nothing" -> "no nothing", not "nothing").
RE_DUP_C_C = re.compile(r"(?<!\S)(\S{1,5})(\s+\1)+(?!\S)")
# A sequence of two or more non-digits that occurs four or more times in a row
RE_DUP_CC = re.compile(r"(\D{2,})\1{3,}")
# A single non-digit character that occurs six or more times in a row
RE_DUP_C6 = re.compile(r"(\D)\1{5,}")
# A character from the listed Thai code-point ranges (vowels/symbols),
# or "?" / "!", that occurs two or more times in a row
RE_DUP_THAI = re.compile(
    r"([\u0E2F-\u0E3A\u0E3F\u0E40-\u0E4F\?\!])\1+"
)


def normalize_link(text: str, place_holder: str = REPLACE_LINK) -> str:
    """
    Replace links in ``text`` with ``place_holder``.

    Two passes are applied in order: URL-like text (RE_HTTP_WWW), then
    file names with a known web/media extension (RE_EXT).
    """
    without_urls = RE_HTTP_WWW.sub(place_holder, text)
    return RE_EXT.sub(place_holder, without_urls)


def normalize_email(text: str, place_holder: str = REPLACE_EMAIL) -> str:
    """Replace every e-mail address (RE_EMAIL match) in ``text`` with ``place_holder``."""
    return RE_EMAIL.sub(place_holder, text)


def normalize_haha(text: str, place_holder: str = REPLACE_HAHA) -> str:
    """Replace every laughter expression (RE_HAHA match) in ``text`` with ``place_holder``."""
    return RE_HAHA.sub(place_holder, text)


def remove_dup_chars(text: str) -> str:
    """
    Shorten runs of duplicated characters or short tokens.

    Four substitutions are applied one after another:

    1. RE_DUP_C_C  - a repeated, whitespace-separated short token keeps one copy
    2. RE_DUP_CC   - a multi-character sequence repeated 4+ times keeps two copies
    3. RE_DUP_C6   - a character repeated 6+ times keeps five copies
    4. RE_DUP_THAI - a repeated Thai vowel/symbol, "?" or "!" keeps one copy

    e.g.
    ! ! ! ! -> !
    hahahahaha -> haha
    soooooo -> sooooo
    มาาาก -> มาก
    """
    passes = (
        (RE_DUP_C_C, r"\1"),
        (RE_DUP_CC, r"\1\1"),
        (RE_DUP_C6, r"\1\1\1\1\1"),
        (RE_DUP_THAI, r"\1"),
    )
    for pattern, kept in passes:
        text = pattern.sub(kept, text)
    return text


def normalize_text(text: str) -> str:
    """
    Lower-case ``text`` and run the normalization steps over it.

    Order matters: links, then e-mail addresses, then duplicated
    characters, then laughter.  Lower-casing happens first, so the
    upper-case placeholder tokens inserted afterwards are left as they are.
    """
    steps = (normalize_link, normalize_email, remove_dup_chars, normalize_haha)
    cleaned = text.lower()
    for step in steps:
        cleaned = step(cleaned)
    return cleaned


def is_word(word: str) -> bool:
    """
    Tell whether ``word`` counts as a word.

    Returns True when at least one character of ``word`` is not in
    NON_WORD_CHARS; an empty (falsy) ``word`` gives False.
    """
    return bool(word) and any(ch not in NON_WORD_CHARS for ch in word)


def combine_hashtag(in_tokens: List[str]) -> List[str]:
    """
    Glue a "#" token and the tokens that follow it into one hashtag token.

    After a token that is "#" (leading whitespace ignored), the following
    tokens are absorbed until the list ends or a token is found that is
    blank, is itself "#", or starts with an ASCII punctuation character.
    The absorbed text is stripped and prefixed with "#".  All other tokens
    are passed through unchanged.
    """
    merged = []
    total = len(in_tokens)
    pos = 0
    while pos < total:
        current = in_tokens[pos]
        pos += 1
        if current.lstrip() != "#":
            merged.append(current)
            continue

        # Collect the pieces of the hashtag that follow the "#"
        pieces = []
        while pos < total:
            candidate = in_tokens[pos]
            head = candidate.lstrip()
            if not candidate.strip() or head == "#" or head[0] in string.punctuation:
                break
            pieces.append(candidate)
            pos += 1
        merged.append("#" + "".join(pieces).strip())

    return merged


# Read the extra keywords, one per line.
# The encoding is given explicitly so that reading Thai text does not depend
# on the platform's default encoding.
# NOTE(review): assumes the keyword file is saved as UTF-8 -- confirm.
keywords = set()
with open(keywords_file_name, "r", encoding="utf-8") as file:
    for line in file:
        keyword = line.strip()
        if keyword:  # a blank line would add "" to the dictionary
            keywords.add(keyword)
print(keywords)

# Tokenizer dictionary = keyword file + pythainlp's Thai word list
# + the hashtags listed in additional_keywords
wordlist = keywords.union(thai_words()).union(additional_keywords)

tokenizer = Tokenizer(custom_dict=wordlist)
del wordlist  # the combined set is not used again after this point

# Word-by-day table: one row per calendar day from start_date to end_date
# (inclusive); word columns are added later as words are encountered.
dti = pd.date_range(start=start_date, end=end_date, freq="D")
word_by_day = pd.DataFrame(data=0, index=dti, columns=[])
word_by_day.index.name, word_by_day.columns.name = "Date", "Word"
word_by_day.head()

election_date

In [None]:
# Load only the message text and timestamp columns of the chosen worksheet
msg_df = pd.read_excel(
    msg_file_name,
    sheet_name=msg_sheet_name,
    usecols=[msg_column, msg_datetime_column],
)

# Keep the date part only: every timestamp is reset to midnight
msg_df[msg_datetime_column] = (
    pd.to_datetime(msg_df[msg_datetime_column])
    .dt.normalize()
)
msg_df.head()

In [None]:
# Number of messages posted on election day.
# Uses msg_datetime_column rather than a hard-coded column name, matching
# the cell that loads msg_df.
len(msg_df[msg_df[msg_datetime_column] == election_date])

In [None]:
# Number of messages posted before election day.
# Uses msg_datetime_column rather than a hard-coded column name, matching
# the cell that loads msg_df.
len(msg_df[msg_df[msg_datetime_column] < election_date])

In [None]:
# Number of messages posted after election day.
# Uses msg_datetime_column rather than a hard-coded column name, matching
# the cell that loads msg_df.
len(msg_df[msg_df[msg_datetime_column] > election_date])

In [None]:
# Count words in messages posted on election day and add them to word_by_day.
election_date_df = msg_df[msg_df[msg_datetime_column] == election_date]

stopwords = thai_stopwords()

# Tally (date, word) pairs first, so the DataFrame is touched once per
# distinct pair instead of once per token.
pair_counts = Counter()
for _date, msg in zip(election_date_df[msg_datetime_column], election_date_df[msg_column]):
    if not isinstance(msg, str):
        continue  # e.g. a missing value; normalize_text() needs a string
    for token in combine_hashtag(tokenizer.word_tokenize(normalize_text(msg))):
        word = token.strip()
        if is_word(word) and word not in stopwords:
            pair_counts[(_date, word)] += 1

for (_date, word), count in pair_counts.items():
    if word not in word_by_day.columns:
        word_by_day[word] = 0  # first time this word is seen: new column
    if _date not in word_by_day.index:
        word_by_day.loc[_date] = 0  # date outside the prepared range: new row
    # One .loc[row, column] access; the chained form .loc[row][column] += 1
    # may modify a temporary copy and leave word_by_day unchanged.
    word_by_day.loc[_date, word] += count

word_by_day

In [None]:
# Save the current state of word_by_day.
# NOTE(review): word_by_day is not reset between cells, so its content
# depends on which counting cells have run before this one.
with open("word-by-day-on-election-date.pkl", mode="wb") as pkl_file:
    pickle.dump(word_by_day, pkl_file)

In [None]:
# Count words in messages posted before election day and add them to
# word_by_day.  `stopwords` must already be defined (it is created in the
# election-day cell).
before_election_date_df = msg_df[msg_df[msg_datetime_column] < election_date]

# Tally (date, word) pairs first, so the DataFrame is touched once per
# distinct pair instead of once per token.
pair_counts = Counter()
for _date, msg in zip(before_election_date_df[msg_datetime_column], before_election_date_df[msg_column]):
    if not isinstance(msg, str):
        continue  # e.g. a missing value; normalize_text() needs a string
    for token in combine_hashtag(tokenizer.word_tokenize(normalize_text(msg))):
        word = token.strip()
        if is_word(word) and word not in stopwords:
            pair_counts[(_date, word)] += 1

for (_date, word), count in pair_counts.items():
    if word not in word_by_day.columns:
        word_by_day[word] = 0  # first time this word is seen: new column
    if _date not in word_by_day.index:
        word_by_day.loc[_date] = 0  # date outside the prepared range: new row
    # One .loc[row, column] access; the chained form .loc[row][column] += 1
    # may modify a temporary copy and leave word_by_day unchanged.
    word_by_day.loc[_date, word] += count

word_by_day

In [None]:
# Save the current state of word_by_day.
# NOTE(review): word_by_day is not reset between cells, so its content
# depends on which counting cells have run before this one.
with open("word-by-day-before-election-date.pkl", mode="wb") as pkl_file:
    pickle.dump(word_by_day, pkl_file)

In [None]:
# Count words in messages posted after election day and add them to
# word_by_day.  `stopwords` must already be defined (it is created in the
# election-day cell).
after_election_date_df = msg_df[msg_df[msg_datetime_column] > election_date]

# Tally (date, word) pairs first, so the DataFrame is touched once per
# distinct pair instead of once per token.
pair_counts = Counter()
for _date, msg in zip(after_election_date_df[msg_datetime_column], after_election_date_df[msg_column]):
    if not isinstance(msg, str):
        continue  # e.g. a missing value; normalize_text() needs a string
    for token in combine_hashtag(tokenizer.word_tokenize(normalize_text(msg))):
        word = token.strip()
        if is_word(word) and word not in stopwords:
            pair_counts[(_date, word)] += 1

for (_date, word), count in pair_counts.items():
    if word not in word_by_day.columns:
        word_by_day[word] = 0  # first time this word is seen: new column
    if _date not in word_by_day.index:
        word_by_day.loc[_date] = 0  # date outside the prepared range: new row
    # One .loc[row, column] access; the chained form .loc[row][column] += 1
    # may modify a temporary copy and leave word_by_day unchanged.
    word_by_day.loc[_date, word] += count

word_by_day

In [None]:
# Save the current state of word_by_day.
# NOTE(review): word_by_day is not reset between cells, so its content
# depends on which counting cells have run before this one.
with open("word-by-day-after-election-date.pkl", mode="wb") as pkl_file:
    pickle.dump(word_by_day, pkl_file)

In [None]:
# Drop every word whose total count over all days is below 100.
# The totals are computed first and the columns dropped in one call:
# DataFrame.iteritems() no longer exists in pandas 2.x, and deleting
# columns while iterating over the same frame is fragile.
column_totals = word_by_day.sum()
rare_words = column_totals[column_totals < 100].index
word_by_day.drop(columns=rare_words, inplace=True)

word_by_day

In [None]:
# (word, total count over all days) for every column of word_by_day.
# DataFrame.items() is used because iteritems() was removed in pandas 2.x.
word_freqs = [(word, counts.sum()) for word, counts in word_by_day.items()]


def getKey(item):
    """Return the count part of a ``(word, count)`` pair, used as the sort key."""
    return item[1]


# Most frequent words first; ties keep their column order (sorted() is stable)
word_freqs = sorted(word_freqs, key=getKey, reverse=True)