In [None]:
!pip install ydata-profiling

In [None]:
!pip install indoNLP

In [None]:
!pip install unidecode

In [None]:
import numpy as np
import pandas as pd

import re
from ydata_profiling import ProfileReport

from unidecode import unidecode
import unicodedata

from indoNLP.preprocessing import replace_slang, replace_word_elongation, emoji_to_words

In [None]:
# Load the raw dataset (path is relative to the working directory) and show its first rows.
df = pd.read_csv('dataset.csv', encoding='utf-8')
df.head()

# Check NaN Rows

In [None]:
def is_nan(text):
  """Return whether `text` is a missing value according to `pd.isna`."""
  missing = pd.isna(text)
  return missing

In [None]:
# Copy of `df` with an extra boolean column marking rows whose 'content' is missing.
df_check_nan = df.assign(is_nan=df['content'].apply(is_nan))

# Display only the rows flagged as missing.
df_check_nan.loc[df_check_nan['is_nan'] == True]

# Check Non-ASCII Characters

In [None]:
# Compiled once when the cell runs instead of on every call.
_ASCII_ALNUM_RE = re.compile(r"[a-zA-Z0-9]")


def check_non_ascii(text):
  """Flag text that contains no ASCII letter or digit at all.

  Despite the name, this does not detect "any non-ASCII character": it returns
  True only when no character of `text` is in [a-zA-Z0-9]. Text made up purely
  of other characters (styled Unicode letters, emoji, punctuation, whitespace)
  and the empty string are therefore flagged.

  Args:
    text: Value to inspect. Non-string values (e.g. NaN for a missing review)
      are accepted and never flagged.

  Returns:
    bool: True if `text` is a string without any ASCII alphanumeric character,
    False otherwise.
  """
  # Missing values reach this function through Series.apply; running a regex
  # search on a float raises TypeError, so non-strings are simply not flagged.
  if not isinstance(text, str):
    return False

  return _ASCII_ALNUM_RE.search(text) is None

In [None]:
# Copy of `df` with an extra 'Non-ASCII' flag column computed by check_non_ascii.
df_non_ascii = df.assign(**{'Non-ASCII': df['content'].apply(check_non_ascii)})

# Display only the flagged rows.
df_non_ascii.loc[df_non_ascii['Non-ASCII'] == True]

In [None]:
# Display the 'content' value stored at row position 2329.
df['content'].iloc[2329]

In [None]:
def normalize_font(text):
  """Transliterate `text` with unidecode, then apply Unicode NFKC normalization.

  The input is coerced with `str()` first, so a non-string value is processed
  as its string form.

  NOTE(review): the NFKC pass runs on unidecode's output; if that output is
  already plain ASCII the second step has nothing left to change. Confirm
  this ordering is intended.
  """
  transliterated = unidecode(str(text))
  return unicodedata.normalize('NFKC', transliterated)

In [None]:
# Row labels and column that will be passed through normalize_font.
# NOTE(review): the labels are hard-coded; presumably they were read off the
# Non-ASCII check output. Confirm they still match if dataset.csv changes.
rows_to_normalize = [370, 2329, 5448, 7097, 12456]
column_to_normalize = 'content'

In [None]:
# Overwrite the selected cells of `df` in place with their normalize_font output.
df.loc[rows_to_normalize, column_to_normalize] = df.loc[rows_to_normalize, column_to_normalize].apply(normalize_font)

In [None]:
# Display row position 2329 again to compare with the value shown before normalization.
df['content'].iloc[2329]

# Full Text Cleaning

In [None]:
def emoji_alias(text):
  """Convert emoji to words and return the text as single-space-separated words.

  `emoji_to_words` is called with `delimiter=(" ", " ")`. Its result is split
  on whitespace, re-joined with single spaces, and every underscore is replaced
  by a space. Note that this affects all underscores in the text, not only
  those that come from emoji aliases.
  """
  with_aliases = emoji_to_words(text, delimiter = (" ", " "))
  single_spaced = " ".join(with_aliases.split())
  return single_spaced.replace("_", " ")

In [None]:
def remove_repetitive_symbols(text):
  """Collapse a run of one repeated symbol into a single occurrence.

  A symbol here is any character that is neither a regex word character nor
  whitespace, e.g. "!!!" becomes "!". Only runs of the same character are
  collapsed; alternating symbols such as "!?!?" are left untouched.
  """
  repeated_symbol = re.compile(r'([^\w\s])\1+')
  return repeated_symbol.sub(r'\1', text)

In [None]:
def cleaning(text):
  """Run the full text-cleaning pipeline on one review.

  Non-string input (e.g. NaN) is returned unchanged. A string is lower-cased,
  its whitespace runs are collapsed to a single space, and it is then passed,
  in order, through replace_slang, replace_word_elongation, emoji_alias and
  remove_repetitive_symbols. Leading and trailing whitespace is stripped last.
  """
  if not isinstance(text, str):
    return text

  text_clean = re.sub(r'\s+', ' ', text.lower())

  # Order matters: every step receives the output of the previous one.
  for step in (replace_slang, replace_word_elongation, emoji_alias, remove_repetitive_symbols):
    text_clean = step(text_clean)

  return text_clean.strip()

In [None]:
# Copy of `df` with the cleaned text stored next to the untouched 'content' column.
df_cleaned = df.assign(cleaned_content=df['content'].apply(cleaning))

In [None]:
# Compare the raw and the cleaned text at row position 1140.
print(df['content'].iloc[1140])
print(df_cleaned['cleaned_content'].iloc[1140])

In [None]:
# Compare the raw and the cleaned text at row position 10556.
print(df['content'].iloc[10556])
print(df_cleaned['cleaned_content'].iloc[10556])

In [None]:
# Compare the raw and the cleaned text at row position 12701.
print(df['content'].iloc[12701])
print(df_cleaned['cleaned_content'].iloc[12701])

# Check Word Occurrence + Missed Normalization

In [None]:
import pandas as pd
from collections import Counter
import re

In [None]:
def get_word_counts(df, column):
  """Count how often each word occurs in one column of `df`.

  Missing values are dropped and the remaining values are converted to
  strings, joined with single spaces and lower-cased. A word is a maximal run
  of regex word characters delimited by word boundaries.

  Returns:
    collections.Counter mapping each word to its number of occurrences.
  """
  texts = df[column].dropna().astype(str)
  corpus = " ".join(texts).lower()
  return Counter(re.findall(r'\b\w+\b', corpus))

In [None]:
# Word frequencies over the cleaned review text.
word_counts = get_word_counts(df_cleaned, 'cleaned_content')

In [None]:
# Tabulate the counts, most frequent first, and write them to word_counts.csv without the index.
df_word_counts = pd.DataFrame(word_counts.items(), columns=['word', 'count']).sort_values(by='count', ascending=False)
df_word_counts.to_csv('word_counts.csv', index=False)

In [None]:
# Extra word-level replacements (word to replace -> replacement) used by normalize_words.
# NOTE(review): presumably collected by reviewing word_counts.csv for words the
# cleaning step missed. Confirm before extending.
normalization_dict = {
  'enggak': 'tidak',
  'apk': 'aplikasi',
  'good': 'bagus',
  'eror': 'error',
  'kalo': 'kalau',
  'kagak': 'tidak',
  'uninstal': 'uninstall',
  'dl': 'dulu',
  'apps': 'aplikasi',
  'n': 'dan',
  'tije': 'transjakarta',
  'ticket': 'tiket',
  'pengin': 'ingin',
  'muter': 'putar',
  'apl': 'aplikasi',
  'plis': 'tolong',
  'ful': 'penuh'
}

In [None]:
def normalize_words(text, norm_dict):
  """Replace whole-word occurrences of `norm_dict` keys with their values.

  All keys are matched in a single pass, so text produced by one replacement
  is never rewritten by another entry. Values are inserted literally:
  backslashes in a value are not interpreted as regex replacement escapes.

  Args:
    text: Text to normalize. Non-string values (e.g. NaN passed through by
      `cleaning`) are returned unchanged instead of raising TypeError.
    norm_dict: Mapping of word to replace -> replacement. Matching is
      case-sensitive and anchored on regex word boundaries.

  Returns:
    The normalized string, or `text` itself when it is not a string or
    `norm_dict` is empty.
  """
  if not isinstance(text, str) or not norm_dict:
    return text

  # Longest keys first so that a key which is a prefix of another key can
  # never shadow the longer one inside the alternation.
  keys = sorted(norm_dict, key=len, reverse=True)
  pattern = re.compile(r'\b(?:' + '|'.join(re.escape(k) for k in keys) + r')\b')

  # A function replacement looks the matched key up directly and returns the
  # value verbatim.
  return pattern.sub(lambda match: norm_dict[match.group(0)], text)

In [None]:
# Run the dictionary-based normalization over every cleaned review, replacing the column.
df_cleaned['cleaned_content'] = df_cleaned['cleaned_content'].apply(
    normalize_words, norm_dict=normalization_dict
)

In [None]:
# Compare the raw and the normalized text of one row. Both lookups share
# row_pos so the two printed lines always refer to the same review.
row_pos = 1273
print(df['content'].iloc[row_pos])
print(df_cleaned['cleaned_content'].iloc[row_pos])

In [None]:
# Compare the raw and the normalized text at row position 12701.
print(df['content'].iloc[12701])
print(df_cleaned['cleaned_content'].iloc[12701])

# Check Emoji

In [None]:
# One character class that matches runs of characters from several emoji-related Unicode ranges.
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # misc symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U00002600-\U000026FF"  # misc symbols
    "]+"
)

# Rows of the raw frame whose 'content' matches the pattern; missing values count as no match.
df_emoji = df[df['content'].str.contains(emoji_pattern, na=False)]
df_emoji

# Labeled Sentiment

In [None]:
# Derive a sentiment label from the score: below 3 -> negative, exactly 3 -> neutral,
# anything else -> positive.
# NOTE(review): a missing score (NaN) fails both comparisons and is therefore
# labelled 'positive'. Confirm that is intended.
df_cleaned['sentiment'] = df['score'].apply(
    lambda x:
    'negative' if x < 3
    else 'neutral' if x == 3
    else 'positive'
)

# Save to .csv

In [None]:
# Write the cleaned and labelled frame to emoji_words.csv without the index column.
df_cleaned.to_csv('emoji_words.csv', index = False)