In [1]:
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on.
nltk.download('punkt')

# English spaCy pipeline used by the *_spacy feature extractors below.
spc = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Read the Excel files holding the lists of Indonesian adverbs, adjectives, acronyms,
# interjections, and swear words. The path_of_* names are not defined in the visible
# cells, so they must be assigned before this cell is run.
adverb = pd.read_excel(path_of_adverb_list)
adjective = pd.read_excel(path_of_adjective_list)
acronym = pd.read_excel(path_of_acronym_list)
interjection = pd.read_excel(path_of_interjection_list)
# NOTE(review): the name `swear_word` is rebound further down by the function
# `swear_word(text)`. The cell that reads swear_word["swear word"] therefore has to run
# before that definition; re-running it afterwards fails because a function is not
# subscriptable.
swear_word = pd.read_excel(path_of_swear_word_list)

In [3]:
# Convert one column of each word-list DataFrame into a plain Python list.
# The column names must match the headers in the Excel files (note the capitalised
# 'Interjection' and the space in "swear word"); change them as needed.
list_adverb = adverb['adverb'].values.tolist()
list_adjective = adjective['adjective'].values.tolist()
list_acronym = acronym['acronym'].values.tolist()
list_interjection = interjection['Interjection'].values.tolist()
list_swear_word = swear_word["swear word"].values.tolist()

In [4]:
# All of these functions are used to extract the hyperbole and swear-word features, following the feature-extraction rules in the article.

def adverb_indonesia(text):
  """Flag whether the text contains at least one word from the Indonesian adverb list.

  Args:
    text: Text to inspect. It is coerced with ``str()`` before tokenizing, the same
      way ``punctuation_mark``, ``capital_letter`` and ``elongated_word`` do, so a
      non-string value (for example a NaN cell) does not crash the tokenizer.

  Returns:
    1 if any NLTK token of the text is found in ``list_adverb``, otherwise 0.
  """
  # Matching is exact and case-sensitive; lowercasing is left to the caller.
  tokens = nltk.word_tokenize(str(text))
  return int(any(token in list_adverb for token in tokens))

def adjective_indonesia(text):
  """Flag whether the text contains at least one word from the Indonesian adjective list.

  Args:
    text: Text to inspect. It is coerced with ``str()`` before tokenizing, the same
      way ``punctuation_mark``, ``capital_letter`` and ``elongated_word`` do, so a
      non-string value (for example a NaN cell) does not crash the tokenizer.

  Returns:
    1 if any NLTK token of the text is found in ``list_adjective``, otherwise 0.
  """
  # Matching is exact and case-sensitive; lowercasing is left to the caller.
  tokens = nltk.word_tokenize(str(text))
  return int(any(token in list_adjective for token in tokens))

def interjection_indonesia(text):
  """Flag whether the text contains at least one word from the Indonesian interjection list.

  Args:
    text: Text to inspect. It is coerced with ``str()`` before tokenizing, the same
      way ``punctuation_mark``, ``capital_letter`` and ``elongated_word`` do, so a
      non-string value (for example a NaN cell) does not crash the tokenizer.

  Returns:
    1 if any NLTK token of the text is found in ``list_interjection``, otherwise 0.
  """
  # Matching is exact and case-sensitive; lowercasing is left to the caller.
  tokens = nltk.word_tokenize(str(text))
  return int(any(token in list_interjection for token in tokens))

def adverb_spacy(text):
  """Flag whether spaCy tags at least one token of the text as an adverb.

  The input is converted with ``str()`` and parsed by the ``spc`` pipeline.

  Returns:
    1 if any token's ``pos_`` equals "ADV", otherwise 0.
  """
  parsed = spc(str(text))
  return 1 if any(word.pos_ == "ADV" for word in parsed) else 0

def adjective_spacy(text):
  """Flag whether spaCy tags at least one token of the text as an adjective.

  The input is converted with ``str()`` and parsed by the ``spc`` pipeline.

  Args:
    text: Text to inspect (any value; it is stringified first).

  Returns:
    1 if any token's ``pos_`` equals "ADJ", otherwise 0.
  """
  parsed = spc(str(text))
  return int(any(word.pos_ == "ADJ" for word in parsed))

def interjection_spacy(text):
  """Flag whether spaCy tags at least one token of the text as an interjection.

  The input is converted with ``str()`` and parsed by the ``spc`` pipeline.

  Args:
    text: Text to inspect (any value; it is stringified first).

  Returns:
    1 if any token's ``pos_`` equals "INTJ", otherwise 0.
  """
  parsed = spc(str(text))
  return int(any(word.pos_ == "INTJ" for word in parsed))

def punctuation_mark(text):
  """Flag whether the text contains a question mark or an exclamation mark.

  The input is stringified and split with NLTK's word tokenizer; only tokens that
  are exactly '?' or '!' count.

  Returns:
    1 if such a token exists, otherwise 0.
  """
  tokens = nltk.word_tokenize(str(text))
  return 1 if any(tok in ('?', '!') for tok in tokens) else 0

def capital_letter(text):
  """Flag whether the text contains an upper-case word that is not a known acronym.

  A token qualifies when ``str.isupper()`` is true for it, it is longer than one
  character, and it does not appear in ``list_acronym``.

  Returns:
    1 if at least one token qualifies, otherwise 0.
  """
  for word in nltk.word_tokenize(str(text)):
    is_upper_word = word.isupper() and len(word) > 1
    if is_upper_word and word not in list_acronym:
      return 1
  return 0

def elongated_word(text):
  """Flag whether any token of the text contains an elongated letter run.

  A token counts as elongated when the same ASCII letter occurs three or more
  times in a row (the comparison is case-sensitive).

  Returns:
    1 if at least one token is elongated, otherwise 0.
  """
  repeated_letter = re.compile("([a-zA-Z])\\1{2,}")
  tokens = nltk.word_tokenize(str(text))
  return 1 if any(repeated_letter.search(tok) for tok in tokens) else 0

def swear_word(text):
  """Flag whether the text contains at least one word from the swear-word list.

  Args:
    text: Text to inspect. It is coerced with ``str()`` before tokenizing, the same
      way ``punctuation_mark``, ``capital_letter`` and ``elongated_word`` do, so a
      non-string value (for example a NaN cell) does not crash the tokenizer.

  Returns:
    1 if any NLTK token of the text is found in ``list_swear_word``, otherwise 0.
  """
  # Matching is exact and case-sensitive; lowercasing is left to the caller.
  tokens = nltk.word_tokenize(str(text))
  return int(any(token in list_swear_word for token in tokens))

In [5]:
data = pd.read_excel(path_of_dataset)  # read dataset

# Extract the hyperbole and swear-word features. Note: every extracted value is binary (0/1).
# Adverbs, adjectives, and interjections are additionally extracted with spaCy from the
# translated tweets, because the spaCy pipeline loaded in this notebook is an English one.
# Each entry is (source column, new feature column, extractor); the extractor receives the
# lower-cased text. The order of the entries is the order in which the columns are added.
lowercased_extractors = [
  ("preprocessed", "adverb indonesia", adverb_indonesia),
  ("preprocessed", "adjective indonesia", adjective_indonesia),
  ("preprocessed", "interjection indonesia", interjection_indonesia),
  ("preprocessed", "swear word", swear_word),
  ("translate preprocessed", "adverb spacy", adverb_spacy),
  ("translate preprocessed", "adjective spacy", adjective_spacy),
  ("translate preprocessed", "interjection spacy", interjection_spacy),
  ("preprocessed", "punctuation", punctuation_mark),
]
for source_column, feature_column, extractor in lowercased_extractors:
  data[feature_column] = data[source_column].apply(lambda x: extractor(x.lower()))

# Capital-letter and elongated-word detection work on the original casing.
data["capital letter"] = data["preprocessed"].apply(capital_letter)
data["elongated word"] = data["preprocessed"].apply(elongated_word)
data

Unnamed: 0,preprocessed,Class,translate preprocessed,adverb indonesia,adjective indonesia,interjection indonesia,swear word,adverb spacy,adjective spacy,interjection spacy,punctuation,capital letter,elongated word
0,Prediksi BSU BLT BPJS Ketenagakerjaan Cair di ...,not sarcasm,predictions of bsu blt bpjs employment will be...,1,1,0,0,1,1,0,0,1,0
1,Gdp terbesar ? Oke oke tp kok masih banyak BLT...,not sarcasm,biggest gdp ? okay okay but how come there are...,1,1,0,0,1,1,1,1,1,0
2,Koe gelem mangan rak sur? mangan opo? BLT bant...,not sarcasm,do you want to eat rak sur? what do you eat? b...,1,1,0,0,0,1,0,1,0,0
3,Sejak kapan promo presiden pakai baliho dan yg...,sarcasm,Since when did the presidential promo use bill...,1,1,0,0,0,1,0,0,0,0
4,Pajak gak mau bayar giliran ada BLT maju duluan,not sarcasm,"Taxes don't want to pay, their turn has to go ...",1,1,0,0,1,0,0,0,0,0


In [7]:
# Extraction of the intensifier and the interjection + swear-word features.
# Each combined column is 1 when at least one of its two source columns is 1, else 0.
combined_features = {
  "intensifier indonesia": ("adverb indonesia", "adjective indonesia"),
  "intensifier spacy": ("adverb spacy", "adjective spacy"),
  "interjection indonesia + swear word": ("interjection indonesia", "swear word"),
  "interjection spacy + swear word": ("interjection spacy", "swear word"),
}
for combined_column, (first_column, second_column) in combined_features.items():
  either_present = data[first_column].eq(1) | data[second_column].eq(1)
  data[combined_column] = np.where(either_present, 1, 0)
data

Unnamed: 0,preprocessed,Class,translate preprocessed,adverb indonesia,adjective indonesia,interjection indonesia,swear word,adverb spacy,adjective spacy,interjection spacy,punctuation,capital letter,elongated word,intensifier indonesia,intensifier spacy,interjection indonesia + swear word,interjection spacy + swear word
0,Prediksi BSU BLT BPJS Ketenagakerjaan Cair di ...,not sarcasm,predictions of bsu blt bpjs employment will be...,1,1,0,0,1,1,0,0,1,0,1,1,0,0
1,Gdp terbesar ? Oke oke tp kok masih banyak BLT...,not sarcasm,biggest gdp ? okay okay but how come there are...,1,1,0,0,1,1,1,1,1,0,1,1,0,1
2,Koe gelem mangan rak sur? mangan opo? BLT bant...,not sarcasm,do you want to eat rak sur? what do you eat? b...,1,1,0,0,0,1,0,1,0,0,1,1,0,0
3,Sejak kapan promo presiden pakai baliho dan yg...,sarcasm,Since when did the presidential promo use bill...,1,1,0,0,0,1,0,0,0,0,1,1,0,0
4,Pajak gak mau bayar giliran ada BLT maju duluan,not sarcasm,"Taxes don't want to pay, their turn has to go ...",1,1,0,0,1,0,0,0,0,0,1,1,0,0


In [8]:
# Save the dataset together with every extracted feature column.
# index=False keeps the DataFrame's row index out of the written file.
data.to_excel(path_to_save_file, index=False)