# Klasifikasi Data Emosi berdasarkan Komparasi Beberapa Leksikon

## Persiapan Data

**Install Packages**

In [None]:
!pip install deep-translator

In [None]:
!pip install swifter

In [None]:
!pip install afinn

**Import Libraries**

In [None]:
import csv
import swifter
import nltk
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from deep_translator import GoogleTranslator
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from scipy.sparse import hstack
from afinn import Afinn
from sklearn.metrics import *

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably needed by word_tokenize, which this notebook only
# references in commented-out code - confirm the download is still required.
nltk.download('punkt')

**Koneksi Data ke Google Drive**

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so that the
# /content/gdrive/MyDrive/... paths used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/gdrive')

**Import Data CSV**

In [None]:
#covid
#data_df_1 = pd.read_csv('/content/gdrive/MyDrive/Thesis/output/21122021_covid_ekstraksi_variabel_perilaku.csv')
#data_df_2 = pd.read_csv('/content/gdrive/MyDrive/Thesis/dataset/1-7333 after_covid_data.csv')
#data_df_2.loc[data_df_2.Sentiment == 'Positif', 'Sentiment'] = 'Positive'

#noncovid
# Non-COVID data set: the pre-processed tweets (data_df_1) and the manually
# labelled tweets (data_df_2).
data_df_1 = pd.read_csv('/content/gdrive/MyDrive/Thesis/output/21122021_noncovid_ekstraksi_variabel_perilaku.csv')
data_df_2 = pd.read_csv('/content/gdrive/MyDrive/Thesis/dataset/7334-30980 before_covid_data.csv')

# Harmonise the spelling variants of the sentiment labels in one pass:
# 'positif' / 'Positif' -> 'Positive', 'negative' -> 'Negative'.
data_df_2['Sentiment'] = data_df_2['Sentiment'].replace(
    {'positif': 'Positive', 'Positif': 'Positive', 'negative': 'Negative'}
)


def _parse_token_list(raw):
    """Turn the CSV text form of a token list, e.g. "['a', 'b']", into ['a', 'b'].

    Brackets, single quotes and spaces are removed before splitting on commas,
    and empty tokens are dropped. Non-string cells (for example NaN for a
    missing value) yield an empty list so that later per-token loops never
    receive a float.
    """
    if not isinstance(raw, str):
        return []
    stripped = raw.translate(str.maketrans('', '', "[]' "))
    return [token for token in stripped.split(',') if token != '']


# Combine the pre-processed text columns (data_df_1) with the manual labels
# (data_df_2); column assignment aligns the two frames on their index.
data_df = pd.DataFrame()
data_df['username'] = data_df_1['username'].copy()
data_df['tweet_char'] = data_df_1['tweet_char'].copy()
# The previous final step compared whole lists with '' and therefore filtered
# nothing; empty tokens are now removed inside each list.
data_df['tweet_tokens_final'] = data_df_1['tweet_tokens_final'].apply(_parse_token_list)
data_df['Emotion'] = data_df_2['Emotion'].copy()
data_df['Sentiment'] = data_df_2['Sentiment'].copy()

Menghitung jumlah data setiap sentimen (Positive, Neutral, dan Negative)

In [None]:
# Number of tweets per sentiment label.
data_df['Sentiment'].value_counts()

In [None]:
#from sklearn.model_selection import StratifiedShuffleSplit

#selected_data_df = data_df[['tweet', 'emotion']].copy()

#X_train, X_test= train_test_split(selected_data_df, test_size=50, random_state = 99,
#                                          stratify=selected_data_df['emotion'])

#X_test.to_csv('/content/gdrive/MyDrive/Thesis/covid_stratified.csv')

In [None]:
#X_test['emotion'].value_counts()

In [None]:
#data_df

## Pemrosesan Data

### Proses Ekstraksi Fitur Unigram dan Bigram dengan TF-IDF

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets. .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_char']
y = data_df['Sentiment']

# 80/20 split with a fixed seed so that Restart & Run All reproduces the same
# partition (the same seed is used by the lexicon sections below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Ekstraksi fitur Unigram

In [None]:
# Unigram TF-IDF features.
unigram_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=100000)
# Learn vocabulary and IDF weights from the training tweets only; fitting on
# the full corpus would leak information from the test split into the features.
train_unigram_features = unigram_vectorizer.fit_transform(X_train)
test_unigram_features = unigram_vectorizer.transform(X_test)

Ekstraksi fitur Bigram

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
bigram_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=100000)
# Learn vocabulary and IDF weights from the training tweets only; fitting on
# the full corpus would leak information from the test split into the features.
train_bigram_features = bigram_vectorizer.fit_transform(X_train)
test_bigram_features = bigram_vectorizer.transform(X_test)

Menggabungkan hasil ekstraksi fitur unigram dan bigram

In [None]:
# Place the unigram and bigram matrices side by side (column-wise) for each split.
train_features, test_features = (
    hstack(blocks)
    for blocks in (
        [train_unigram_features, train_bigram_features],
        [test_unigram_features, test_bigram_features],
    )
)

In [None]:
# Class distribution of the test split. A cell only renders its last
# expression, so labels and counts are shown together as one Series.
values, counts = np.unique(y_test, return_counts=True)
pd.Series(counts, index=values)

### Proses Ekstraksi Fitur Unigram dan Bigram dengan TF

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets. .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_char']
y = data_df['Sentiment']

# 80/20 split with a fixed seed so that Restart & Run All reproduces the same
# partition (the same seed is used by the lexicon sections below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Ekstraksi fitur Unigram

In [None]:
# Unigram term-frequency (raw count) features.
unigram_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=100000)
# Build the vocabulary from the training tweets only, so the feature space
# (and the max_features cut-off) is not influenced by the test split.
train_unigram_features = unigram_vectorizer.fit_transform(X_train)
test_unigram_features = unigram_vectorizer.transform(X_test)

Ekstraksi fitur Bigram

In [None]:
# Term-frequency (raw count) features over unigrams and bigrams (ngram_range=(1, 2)).
bigram_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=100000)
# Build the vocabulary from the training tweets only, so the feature space
# (and the max_features cut-off) is not influenced by the test split.
train_bigram_features = bigram_vectorizer.fit_transform(X_train)
test_bigram_features = bigram_vectorizer.transform(X_test)

Menggabungkan hasil ekstraksi fitur unigram dan bigram

In [None]:
# Column-wise concatenation of the two count matrices, once per split.
train_blocks = [train_unigram_features, train_bigram_features]
test_blocks = [test_unigram_features, test_bigram_features]
train_features = hstack(train_blocks)
test_features = hstack(test_blocks)

In [None]:
# Class distribution of the test split. A cell only renders its last
# expression, so labels and counts are shown together as one Series.
values, counts = np.unique(y_test, return_counts=True)
pd.Series(counts, index=values)

### Proses Ekstraksi Fitur dengan Inset Lexicon

Menentukan label tweet dengan InSet Lexicon

In [None]:
# InSet lexicon: two TSV files with a 'word' and a 'weight' column each.
# Every file becomes a {word: integer weight} dictionary; when a word occurs
# more than once the last row wins.
positive = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/inSet Lexicon/positive.tsv', sep = '\t')
lexicon_positive = {
    word: int(weight) for word, weight in zip(positive['word'], positive['weight'])
}

negative = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/inSet Lexicon/negative.tsv', sep = '\t')
lexicon_negative = {
    word: int(weight) for word, weight in zip(negative['word'], negative['weight'])
}
        
# Function to determine sentiment polarity of tweets
def sentiment_analysis_lexicon_indonesia(text, positive_lexicon=None, negative_lexicon=None):
    """Score a tokenised tweet with the InSet lexicon.

    Args:
        text: iterable of tokens. A non-iterable value (for example NaN for a
            missing tweet) is treated as a tweet without tokens.
        positive_lexicon: optional {word: weight} mapping; defaults to the
            module-level ``lexicon_positive``.
        negative_lexicon: optional {word: weight} mapping; defaults to the
            module-level ``lexicon_negative``.

    Returns:
        ``(score, polarity)`` where ``score`` is the sum of the weights of every
        token found in either lexicon and ``polarity`` is 'Positive' for a
        score above zero, 'Negative' below zero and 'Neutral' otherwise.
    """
    if positive_lexicon is None:
        positive_lexicon = lexicon_positive
    if negative_lexicon is None:
        negative_lexicon = lexicon_negative

    try:
        tokens = list(text)
    except TypeError:
        # Not iterable (e.g. a float NaN): nothing to score.
        tokens = []

    # A token may appear in both lexicons; both weights are added.
    score = 0
    for word in tokens:
        score = score + positive_lexicon.get(word, 0) + negative_lexicon.get(word, 0)

    if score > 0:
        polarity = 'Positive'
    elif score < 0:
        polarity = 'Negative'
    else:
        polarity = 'Neutral'
    return score, polarity

In [None]:
# Label every tweet with the InSet lexicon.
# apply() yields one (score, polarity) tuple per tweet; zip(*...) transposes
# them into a tuple of scores and a tuple of polarities.
scored_tweets = data_df['tweet_tokens_final'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*scored_tweets))
polarity_scores, polarity_labels = results[0], results[1]
data_df['Polarity_Score'] = polarity_scores
data_df['Polarity'] = polarity_labels
print(data_df['Polarity'].value_counts())

In [None]:
# Negative/positive counts and shares: manual labels (neg0/pos0) versus the
# lexicon-derived labels (neg1/pos1). Neutral rows are left out of the totals.
gold_labels, lexicon_labels = data_df['Sentiment'], data_df['Polarity']
neg0, pos0 = gold_labels.eq('Negative').sum(), gold_labels.eq('Positive').sum()
neg1, pos1 = lexicon_labels.eq('Negative').sum(), lexicon_labels.eq('Positive').sum()

for neg, pos, tag in ((neg0, pos0, ' | actual label'), (neg1, pos1, ' | inset')):
    total = neg + pos
    print('neg:', neg, '(', '{0:.2f}'.format(neg/total*100), '%)','\t', 'pos:', pos, '(', '{0:.2f}'.format(pos/total*100),'%)', tag)

Melakukan tokenisasi

In [None]:
#Step - a : Menghapus baris kosong, jika ada.
#data_df['tweet_char'].dropna(inplace=True)
# # Step - b : Mengganti semua teks ke karakter kecil karena 'oke' dan 'OKE' diinterpretasikan berbeda
# Corpus['text'] = [entry.lower() for entry in Corpus['text']] # we've done this in '[1] text cleaning.ipynb'
# Step - c : Tokenisasi : Setiap kalimat di dalam korpus akan dipecah menjadi daftar kata/string
#data_df['tweet_char']= [word_tokenize(entry) for entry in data_df['tweet_char']]


# Keep only the purely alphabetic tokens of every tweet and store the str()
# form of the resulting list in 'tweet_final'.
# The column is built positionally. The previous loop used enumerate()
# positions as .loc labels, but data_df has been filtered without resetting
# its index, so positions and labels differ: values landed on the wrong rows
# and .loc appended new rows for labels that did not exist.
# Entries that are not iterable (e.g. NaN) become an empty list.
data_df = data_df.assign(
    tweet_final=[
        str([word for word in entry if word.isalpha()]) if hasattr(entry, '__iter__') else str([])
        for entry in data_df['tweet_tokens_final']
    ]
)

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets; .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_final']
y = data_df['Sentiment']

# One call splits the text, the lexicon labels and the manual labels with the
# same shuffled indices, so the three stay aligned by construction instead of
# relying on two separate calls sharing a seed (80/20, seed 42 as before).
(X_train_inset, X_test_inset,
 y_train_inset, y_test_inset,
 y_train, y_test) = train_test_split(
    data_df['tweet_final'], data_df['Polarity'], data_df['Sentiment'],
    test_size=0.2, random_state=42)

In [None]:
# Size of each split and its share of the data. The share is multiplied by
# 100 so that the printed '%' is correct (previously the raw fraction, e.g.
# 0.8, was printed next to '%').
n_train, n_test = X_train_inset.size, X_test_inset.size
n_total = n_train + n_test
print(n_train, '{0:.2f}'.format(n_train / n_total * 100), '%', '\n',
      n_test, '{0:.2f}'.format(n_test / n_total * 100), '%')

In [None]:
# Encode the lexicon labels as integers between 0 and n_classes-1.
# The encoder is fitted once on the labels of both splits and then only
# transforms each split. Re-fitting on the test labels (as before) could map
# the same label to different integers whenever a class is absent from one split.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([y_train_inset, y_test_inset]))
y_train_inset = Encoder.transform(y_train_inset)
y_test_inset = Encoder.transform(y_test_inset)

**Ekstraksi Fitur Inset Lexicon dengan TF-IDF**

In [None]:
Tfidf_vect = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training tweets only (fitting on
# the whole corpus leaks the test split into the features), then transform
# both splits. This also replaces the earlier fit() + fit_transform() pair,
# which fitted the same data twice.
train_features = Tfidf_vect.fit_transform(X_train_inset)
test_features = Tfidf_vect.transform(X_test_inset)

# TF-IDF matrix of the whole corpus, kept under the name X as before.
X = Tfidf_vect.transform(data_df['tweet_final'])

### Proses Translasi Data Indonesia - Inggris

In [None]:
def token_translated(word):
    """Translate one Indonesian token to English and lower-case the result.

    Translation is best-effort: if the translator raises, or does not return
    a string, the original token is used instead.
    """
    try:
        word_translated = GoogleTranslator(source='id', target='en').translate(word)
    except Exception:
        # `except Exception` rather than a bare `except`, so that interrupting
        # a long translation run (KeyboardInterrupt) is not swallowed.
        word_translated = word

    if not isinstance(word_translated, str):
        # e.g. None from the translator: fall back to the input token.
        word_translated = str(word)

    return word_translated.lower()


def text_translated(text):
    """Translate every token of `text` and return the results as a list, in order."""
    return [token_translated(word) for word in text]

In [None]:
#data_df_translated = data_df['tweet_tokens_final'].swifter.apply(text_translated)
#data_translated = pd.DataFrame()
#data_translated['tweet_tokens_final'] = data_df['tweet_tokens_final'].copy()
#data_translated['tweet_tokens_translated'] = data_df_translated.copy()
#data_translated.to_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_tweet_translated.csv')

In [None]:
# Load the cached translation result; the commented-out cell above shows how
# this CSV was produced.
data_translated = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_tweet_translated.csv')

### Proses Ekstraksi Fitur dengan SentiWordNet Lexicon

Membuat function untuk analisis sentimen

In [None]:
# Location of the SentiWordNet lexicon file read by get_scores_sentiwordnet().
sentiwordnet_path = "/content/gdrive/MyDrive/Thesis/resources/SentiWordNet Lexicon/SentiWordNet_3.0.0.txt"

In [None]:
def split_line(line):
    """Split one tab-separated lexicon line into its columns."""
    cols = line.split("\t")
    return cols

def get_words(cols):
    """Return the terms in column 4 with their '#<sense>' suffix removed."""
    words_ids = cols[4].split(" ")
    words = [w.split("#")[0] for w in words_ids]
    return words

def get_positive(cols):
    """Return the positive score (column 2) as text."""
    return cols[2]

def get_negative(cols):
    """Return the negative score (column 3) as text."""
    return cols[3]

def get_objective(cols):
    """Return 1 minus the sum of the positive and negative scores."""
    return 1 - (float(cols[2]) + float(cols[3]))

def get_gloss(cols):
    """Return the gloss text (column 5)."""
    return cols[5]

# Parsed lexicon entries per file path, so the file is read once instead of
# once per scored tweet.
_SENTIWORDNET_CACHE = {}

def _load_sentiwordnet(path):
    """Parse the lexicon at `path` into (terms, positive, negative) tuples, cached per path."""
    if path not in _SENTIWORDNET_CACHE:
        entries = []
        with open(path) as lexicon_file:
            for line in lexicon_file:
                if line.startswith("#"):
                    continue
                cols = split_line(line)
                if len(cols) < 5:
                    # Blank or truncated line: no term column to read.
                    continue
                try:
                    entry = (get_words(cols), float(get_positive(cols)), float(get_negative(cols)))
                except ValueError:
                    # Scores that are not numbers: skip the malformed line.
                    continue
                entries.append(entry)
        _SENTIWORDNET_CACHE[path] = entries
    return _SENTIWORDNET_CACHE[path]

def get_scores_sentiwordnet(sentiword, lexicon_path=None):
    """Score a tokenised (English) tweet against the SentiWordNet lexicon.

    Args:
        sentiword: iterable of tokens; a non-iterable value (e.g. NaN) is
            treated as a tweet without tokens.
        lexicon_path: optional path of the lexicon file; defaults to the
            module-level ``sentiwordnet_path``.

    Returns:
        ``(score, polarity)``. For every lexicon entry that contains a token,
        the entry's positive and negative scores are added to two running
        totals. The larger total decides the polarity ('Positive' or
        'Negative', 'Neutral' on a tie) and is returned as the score. When no
        token matches any entry the result is ``(0, '')``.
    """
    if lexicon_path is None:
        lexicon_path = sentiwordnet_path

    try:
        tokens = list(sentiword)
    except TypeError:
        tokens = []

    count = 0
    totalpositive = 0.0
    totalnegative = 0.0
    for terms, positive, negative in _load_sentiwordnet(lexicon_path):
        for word in tokens:
            if word in terms:
                if word == "not":
                    # "not" ignores the entry's own scores and adds a fixed
                    # negative weight of 16 (value taken over unchanged).
                    totalnegative = totalnegative + 16
                else:
                    totalpositive = totalpositive + positive
                    totalnegative = totalnegative + negative
                count = count + 1

    polarity = ''
    score = 0
    if count > 0:
        if totalpositive > totalnegative:
            polarity = 'Positive'
            score = totalpositive
        elif totalpositive < totalnegative:
            polarity = 'Negative'
            score = totalnegative
        else:
            polarity = 'Neutral'
            score = totalpositive

    return score, polarity

Menentukan label tweet dengan SentiWordNet Lexicon

In [None]:
#results = data_translated['tweet_tokens_translated'].swifter.apply(get_scores_sentiwordnet)
#sentiwordnet_results = list(zip(*results))
#data_sentiwordnet = pd.DataFrame()
#data_sentiwordnet['tweet_tokens_final'] = data_df['tweet_tokens_final'].copy()
#data_sentiwordnet['tweet_tokens_translated'] = data_translated['tweet_tokens_translated'].copy()
#data_sentiwordnet['Polarity_Score'] = sentiwordnet_results[0]
#data_sentiwordnet['Polarity'] = sentiwordnet_results[1]
#data_sentiwordnet.to_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_sentiwordnet.csv')

In [None]:
# Load the cached SentiWordNet labelling and copy its score and polarity
# columns into data_df (the assignment aligns on the index labels).
data_sentiwordnet = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_sentiwordnet.csv')
for column in ('Polarity_Score', 'Polarity'):
    data_df[column] = data_sentiwordnet[column].copy()

In [None]:
# Negative/positive counts and shares: manual labels (neg0/pos0) versus the
# lexicon-derived labels (neg1/pos1). Neutral rows are left out of the totals.
gold_labels, lexicon_labels = data_df['Sentiment'], data_df['Polarity']
neg0, pos0 = gold_labels.eq('Negative').sum(), gold_labels.eq('Positive').sum()
neg1, pos1 = lexicon_labels.eq('Negative').sum(), lexicon_labels.eq('Positive').sum()

for neg, pos, tag in ((neg0, pos0, ' | actual label'), (neg1, pos1, ' | sentiwordnet')):
    total = neg + pos
    print('neg:', neg, '(', '{0:.2f}'.format(neg/total*100), '%)','\t', 'pos:', pos, '(', '{0:.2f}'.format(pos/total*100),'%)', tag)

Melakukan tokenisasi

In [None]:
#Step - a : Menghapus baris kosong, jika ada.
#data_df['tweet_char'].dropna(inplace=True)
# # Step - b : Mengganti semua teks ke karakter kecil karena 'oke' dan 'OKE' diinterpretasikan berbeda
# Corpus['text'] = [entry.lower() for entry in Corpus['text']] # we've done this in '[1] text cleaning.ipynb'
# Step - c : Tokenisasi : Setiap kalimat di dalam korpus akan dipecah menjadi daftar kata/string
#data_df['tweet_char']= [word_tokenize(entry) for entry in data_df['tweet_char']]


# Keep only the purely alphabetic tokens of every tweet and store the str()
# form of the resulting list in 'tweet_final'.
# The column is built positionally. The previous loop used enumerate()
# positions as .loc labels, but data_df has been filtered without resetting
# its index, so positions and labels differ: values landed on the wrong rows
# and .loc appended new rows for labels that did not exist.
# Entries that are not iterable (e.g. NaN) become an empty list.
data_df = data_df.assign(
    tweet_final=[
        str([word for word in entry if word.isalpha()]) if hasattr(entry, '__iter__') else str([])
        for entry in data_df['tweet_tokens_final']
    ]
)

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets; .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_final']
y = data_df['Sentiment']

# One call splits the text, the lexicon labels and the manual labels with the
# same shuffled indices, so the three stay aligned by construction instead of
# relying on two separate calls sharing a seed (80/20, seed 42 as before).
(X_train_inset, X_test_inset,
 y_train_inset, y_test_inset,
 y_train, y_test) = train_test_split(
    data_df['tweet_final'], data_df['Polarity'], data_df['Sentiment'],
    test_size=0.2, random_state=42)

In [None]:
# Size of each split and its share of the data. The share is multiplied by
# 100 so that the printed '%' is correct (previously the raw fraction, e.g.
# 0.8, was printed next to '%').
n_train, n_test = X_train_inset.size, X_test_inset.size
n_total = n_train + n_test
print(n_train, '{0:.2f}'.format(n_train / n_total * 100), '%', '\n',
      n_test, '{0:.2f}'.format(n_test / n_total * 100), '%')

In [None]:
# Encode the lexicon labels as integers between 0 and n_classes-1.
# The encoder is fitted once on the labels of both splits and then only
# transforms each split. Re-fitting on the test labels (as before) could map
# the same label to different integers whenever a class is absent from one split.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([y_train_inset, y_test_inset]))
y_train_inset = Encoder.transform(y_train_inset)
y_test_inset = Encoder.transform(y_test_inset)

**Ekstraksi Fitur SentiWordNet Lexicon dengan TF-IDF**

In [None]:
Tfidf_vect = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training tweets only (fitting on
# the whole corpus leaks the test split into the features), then transform
# both splits. This also replaces the earlier fit() + fit_transform() pair,
# which fitted the same data twice.
train_features = Tfidf_vect.fit_transform(X_train_inset)
test_features = Tfidf_vect.transform(X_test_inset)

# TF-IDF matrix of the whole corpus, kept under the name X as before.
X = Tfidf_vect.transform(data_df['tweet_final'])

### Proses Ekstraksi Fitur dengan AFINN Lexicon

Membuat function untuk analisis sentimen

In [None]:
def afinn_sentiment(text):
  """Score `text` with the module-level AFINN scorer `afn`.

  Returns (scores, polarity): polarity is 'Positive' for a score above zero,
  'Negative' for a score below zero and 'Neutral' otherwise.
  """
  scores = afn.score(text)

  if scores > 0:
      return scores, 'Positive'
  if scores < 0:
      return scores, 'Negative'
  return scores, 'Neutral'

In [None]:
# AFINN scorer instance used by afinn_sentiment() (constructed with the library defaults).
afn = Afinn()
#results = data_translated['tweet_tokens_translated'].swifter.apply(afinn_sentiment)
#afinn_results = list(zip(*results))
#data_afinn = pd.DataFrame()
#data_afinn['tweet_tokens_final'] = data_df['tweet_tokens_final'].copy()
#data_afinn['tweet_tokens_translated'] = data_translated['tweet_tokens_translated'].copy()
#data_afinn['Polarity_Score'] = afinn_results[0]
#data_afinn['Polarity'] = afinn_results[1]
#data_afinn.to_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_afinn.csv')

Menentukan label tweet dengan AFINN Lexicon

In [None]:
# Load the cached AFINN labelling and copy its score and polarity columns
# into data_df (the assignment aligns on the index labels).
data_afinn = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_afinn.csv')
for column in ('Polarity_Score', 'Polarity'):
    data_df[column] = data_afinn[column].copy()

In [None]:
# Negative/positive counts and shares: manual labels (neg0/pos0) versus the
# lexicon-derived labels (neg1/pos1). Neutral rows are left out of the totals.
gold_labels, lexicon_labels = data_df['Sentiment'], data_df['Polarity']
neg0, pos0 = gold_labels.eq('Negative').sum(), gold_labels.eq('Positive').sum()
neg1, pos1 = lexicon_labels.eq('Negative').sum(), lexicon_labels.eq('Positive').sum()

for neg, pos, tag in ((neg0, pos0, ' | actual label'), (neg1, pos1, ' | AFINN Lexicon')):
    total = neg + pos
    print('neg:', neg, '(', '{0:.2f}'.format(neg/total*100), '%)','\t', 'pos:', pos, '(', '{0:.2f}'.format(pos/total*100),'%)', tag)

Melakukan tokenisasi

In [None]:
#Step - a : Menghapus baris kosong, jika ada.
#data_df['tweet_char'].dropna(inplace=True)
# # Step - b : Mengganti semua teks ke karakter kecil karena 'oke' dan 'OKE' diinterpretasikan berbeda
# Corpus['text'] = [entry.lower() for entry in Corpus['text']] # we've done this in '[1] text cleaning.ipynb'
# Step - c : Tokenisasi : Setiap kalimat di dalam korpus akan dipecah menjadi daftar kata/string
#data_df['tweet_char']= [word_tokenize(entry) for entry in data_df['tweet_char']]


# Keep only the purely alphabetic tokens of every tweet and store the str()
# form of the resulting list in 'tweet_final'.
# The column is built positionally. The previous loop used enumerate()
# positions as .loc labels, but data_df has been filtered without resetting
# its index, so positions and labels differ: values landed on the wrong rows
# and .loc appended new rows for labels that did not exist.
# Entries that are not iterable (e.g. NaN) become an empty list.
data_df = data_df.assign(
    tweet_final=[
        str([word for word in entry if word.isalpha()]) if hasattr(entry, '__iter__') else str([])
        for entry in data_df['tweet_tokens_final']
    ]
)

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets; .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_final']
y = data_df['Sentiment']

# One call splits the text, the lexicon labels and the manual labels with the
# same shuffled indices, so the three stay aligned by construction instead of
# relying on two separate calls sharing a seed (80/20, seed 42 as before).
(X_train_afinn, X_test_afinn,
 y_train_afinn, y_test_afinn,
 y_train, y_test) = train_test_split(
    data_df['tweet_final'], data_df['Polarity'], data_df['Sentiment'],
    test_size=0.2, random_state=42)

In [None]:
# Size of each split and its share of the data. The share is multiplied by
# 100 so that the printed '%' is correct (previously the raw fraction, e.g.
# 0.8, was printed next to '%').
n_train, n_test = X_train_afinn.size, X_test_afinn.size
n_total = n_train + n_test
print(n_train, '{0:.2f}'.format(n_train / n_total * 100), '%', '\n',
      n_test, '{0:.2f}'.format(n_test / n_total * 100), '%')

In [None]:
# Encode the lexicon labels as integers between 0 and n_classes-1.
# The encoder is fitted once on the labels of both splits and then only
# transforms each split. Re-fitting on the test labels (as before) could map
# the same label to different integers whenever a class is absent from one split.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([y_train_afinn, y_test_afinn]))
y_train_afinn = Encoder.transform(y_train_afinn)
y_test_afinn = Encoder.transform(y_test_afinn)

**Ekstraksi Fitur AFINN Lexicon dengan TF-IDF**

In [None]:
Tfidf_vect = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training tweets only (fitting on
# the whole corpus leaks the test split into the features), then transform
# both splits. This also replaces the earlier fit() + fit_transform() pair,
# which fitted the same data twice.
train_features = Tfidf_vect.fit_transform(X_train_afinn)
test_features = Tfidf_vect.transform(X_test_afinn)

# TF-IDF matrix of the whole corpus, kept under the name X as before.
X = Tfidf_vect.transform(data_df['tweet_final'])

### Proses Ekstraksi Fitur dengan Liu Lexicon



Membaca file Liu Lexicon

In [None]:
def isNotNull(value):
    """Return True when `value` is neither None nor empty."""
    return value is not None and len(value) > 0


def _read_liu_words(path):
    """Read one Liu lexicon file into a list of lower-cased words.

    Blank lines are skipped. Lines starting with ';' are skipped as well.
    NOTE(review): the ';' filter assumes the file still carries its ';'
    comment header, as in the distributed Liu lexicon - confirm.
    """
    # `with` closes the file even if reading fails part-way.
    with open(path, 'r', encoding = "ISO-8859-1") as lexicon_file:
        cleaned = (line.strip().lower() for line in lexicon_file)
        return [word for word in cleaned if isNotNull(word) and not word.startswith(';')]


dict_pos = _read_liu_words('/content/gdrive/MyDrive/Thesis/resources/Liu Lexicon/positive-words.txt')
dict_neg = _read_liu_words('/content/gdrive/MyDrive/Thesis/resources/Liu Lexicon/negative-words.txt')

Membuat function untuk analisis sentimen

In [None]:
def liu_sentiment(text, positive_words=None, negative_words=None):
  """Score `text` by counting the Liu lexicon words it contains.

  Args:
      text: a list of tokens or a string. With a list, a lexicon word counts
          when it equals a token; with a string, `in` is a substring test, so
          a lexicon word also matches inside longer words. A value that does
          not support `in` (e.g. NaN) is treated as empty.
      positive_words: optional iterable of positive words; defaults to the
          module-level ``dict_pos``.
      negative_words: optional iterable of negative words; defaults to the
          module-level ``dict_neg``.

  Returns:
      ``(scores, polarity)`` where ``scores`` is the number of positive lexicon
      words found minus the number of negative ones (each lexicon entry counts
      at most once, however often it occurs in `text`). ``polarity`` is
      'Positive', 'Negative' or 'Neutral' for a score above, below or equal
      to zero.
  """
  if positive_words is None:
      positive_words = dict_pos
  if negative_words is None:
      negative_words = dict_neg

  if not hasattr(text, '__contains__'):
      text = []

  # For token lists, use a set so that each lexicon lookup is O(1) instead of
  # a scan over the tokens; fall back to the list if a token is unhashable.
  lookup = text
  if isinstance(text, (list, tuple)):
      try:
          lookup = frozenset(text)
      except TypeError:
          lookup = text

  neg_cnt = sum(1 for neg in negative_words if neg in lookup)
  pos_cnt = sum(1 for pos in positive_words if pos in lookup)

  scores = pos_cnt - neg_cnt

  if scores > 0:
      polarity = 'Positive'
  elif scores < 0:
      polarity = 'Negative'
  else:
      polarity = 'Neutral'

  return scores, polarity

In [None]:
#results = data_translated['tweet_tokens_translated'].swifter.apply(liu_sentiment)
#liu_results = list(zip(*results))
#data_liu = pd.DataFrame()
#data_liu['tweet_tokens_final'] = data_df['tweet_tokens_final'].copy()
#data_liu['tweet_tokens_translated'] = data_translated['tweet_tokens_translated'].copy()
#data_liu['Polarity_Score'] = liu_results[0]
#data_liu['Polarity'] = liu_results[1]
#data_liu.to_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_liu.csv')

Menentukan label tweet dengan Liu Lexicon

In [None]:
# Load the cached Liu-lexicon labelling and copy its score and polarity
# columns into data_df (the assignment aligns on the index labels).
data_liu = pd.read_csv('/content/gdrive/MyDrive/Thesis/resources/before_covid_liu.csv')
for column in ('Polarity_Score', 'Polarity'):
    data_df[column] = data_liu[column].copy()

In [None]:
# Negative/positive counts and shares: manual labels (neg0/pos0) versus the
# lexicon-derived labels (neg1/pos1). Neutral rows are left out of the totals.
gold_labels, lexicon_labels = data_df['Sentiment'], data_df['Polarity']
neg0, pos0 = gold_labels.eq('Negative').sum(), gold_labels.eq('Positive').sum()
neg1, pos1 = lexicon_labels.eq('Negative').sum(), lexicon_labels.eq('Positive').sum()

for neg, pos, tag in ((neg0, pos0, ' | actual label'), (neg1, pos1, ' | Liu Lexicon')):
    total = neg + pos
    print('neg:', neg, '(', '{0:.2f}'.format(neg/total*100), '%)','\t', 'pos:', pos, '(', '{0:.2f}'.format(pos/total*100),'%)', tag)

Melakukan tokenisasi

In [None]:
#Step - a : Menghapus baris kosong, jika ada.
#data_df['tweet_char'].dropna(inplace=True)
# # Step - b : Mengganti semua teks ke karakter kecil karena 'oke' dan 'OKE' diinterpretasikan berbeda
# Corpus['text'] = [entry.lower() for entry in Corpus['text']] # we've done this in '[1] text cleaning.ipynb'
# Step - c : Tokenisasi : Setiap kalimat di dalam korpus akan dipecah menjadi daftar kata/string
#data_df['tweet_char']= [word_tokenize(entry) for entry in data_df['tweet_char']]


# Keep only the purely alphabetic tokens of every tweet and store the str()
# form of the resulting list in 'tweet_final'.
# The column is built positionally. The previous loop used enumerate()
# positions as .loc labels, but data_df has been filtered without resetting
# its index, so positions and labels differ: values landed on the wrong rows
# and .loc appended new rows for labels that did not exist.
# Entries that are not iterable (e.g. NaN) become an empty list.
data_df = data_df.assign(
    tweet_final=[
        str([word for word in entry if word.isalpha()]) if hasattr(entry, '__iter__') else str([])
        for entry in data_df['tweet_tokens_final']
    ]
)

Melakukan split data training dan testing dengan perbandingan  80%:20%

In [None]:
# Keep only the non-Neutral tweets; .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
data_df = data_df[data_df['Sentiment'] != 'Neutral'].copy()
X = data_df['tweet_final']
y = data_df['Sentiment']

# One call splits the text, the lexicon labels and the manual labels with the
# same shuffled indices, so the three stay aligned by construction instead of
# relying on two separate calls sharing a seed (80/20, seed 42 as before).
(X_train_liu, X_test_liu,
 y_train_liu, y_test_liu,
 y_train, y_test) = train_test_split(
    data_df['tweet_final'], data_df['Polarity'], data_df['Sentiment'],
    test_size=0.2, random_state=42)

In [None]:
# Size of each split and its share of the data. The share is multiplied by
# 100 so that the printed '%' is correct (previously the raw fraction, e.g.
# 0.8, was printed next to '%').
n_train, n_test = X_train_liu.size, X_test_liu.size
n_total = n_train + n_test
print(n_train, '{0:.2f}'.format(n_train / n_total * 100), '%', '\n',
      n_test, '{0:.2f}'.format(n_test / n_total * 100), '%')

In [None]:
# Encode the lexicon labels as integers between 0 and n_classes-1.
# The encoder is fitted once on the labels of both splits and then only
# transforms each split. Re-fitting on the test labels (as before) could map
# the same label to different integers whenever a class is absent from one split.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([y_train_liu, y_test_liu]))
y_train_liu = Encoder.transform(y_train_liu)
y_test_liu = Encoder.transform(y_test_liu)

**Ekstraksi Fitur Liu Lexicon dengan TF-IDF**

In [None]:
Tfidf_vect = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training tweets only (fitting on
# the whole corpus leaks the test split into the features), then transform
# both splits. This also replaces the earlier fit() + fit_transform() pair,
# which fitted the same data twice.
train_features = Tfidf_vect.fit_transform(X_train_liu)
test_features = Tfidf_vect.transform(X_test_liu)

# TF-IDF matrix of the whole corpus, kept under the name X as before.
X = Tfidf_vect.transform(data_df['tweet_final'])


### Pemodelan dan Evaluasi

Naive Bayes

In [None]:
# Gaussian Naive Bayes. The sparse feature matrices are converted with
# .toarray() before fitting and predicting; labels are passed as plain lists.
cNB = GaussianNB()
train_labels = y_train.tolist()
modelNB = cNB.fit(train_features.toarray(), train_labels)

predictNB = modelNB.predict(test_features.toarray())
reportNB = classification_report(y_test.tolist(), predictNB, labels=['Positive', 'Negative'])
print(reportNB)

Logistic Regression

In [None]:
# Logistic regression trained directly on the sparse feature matrix
# (random_state=77).
cLR = LogisticRegression(random_state=77)
modelLR = cLR.fit(train_features, y_train)

predictLR = modelLR.predict(test_features)
reportLR = classification_report(y_test, predictLR, labels=['Positive', 'Negative'])
print(reportLR)

SVM (`svm.SVC` dengan kernel bawaan, yaitu RBF)

In [None]:
# Support vector classifier with the default svm.SVC() settings.
cSVM = svm.SVC()
modelSVM = cSVM.fit(train_features, y_train)

predictSVM = modelSVM.predict(test_features)
reportSVM = classification_report(y_test, predictSVM, labels=['Positive', 'Negative'])
print(reportSVM)

MLP Classifier

In [None]:
# Multi-layer perceptron: fixed seed (random_state=1), at most 500 training iterations.
cMLP = MLPClassifier(random_state=1, max_iter=500)
modelMLP = cMLP.fit(train_features, y_train)

predictMLP = modelMLP.predict(test_features)
reportMLP = classification_report(y_test, predictMLP, labels=['Positive', 'Negative'])
print(reportMLP)