In [29]:
# Mount Google Drive at /content/drive (Colab-only step); the dataset path
# used further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Importing library
import numpy as np
import pandas as pd

# BeautifulSoup = hapus tag html
from bs4 import BeautifulSoup 
import re # regular expressions (regex)

# natural language tool kits
from nltk.corpus import stopwords
import nltk

# word2vec library
from gensim.models import word2vec
import itertools

# Packages required for data preparation
from sklearn.model_selection import train_test_split

# library untuk Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [31]:
# Location of the tweets CSV on the mounted Google Drive.
data_path = r'/content/drive/MyDrive/Text Dataset/Tweets.csv'
dataset = pd.read_csv(data_path, header=0, sep=',')

# Shuffle the rows with a fixed seed. An unseeded shuffle gives a different
# row order on every run, so a later train_test_split(random_state=42) would
# still pick different tweets each time and the scores could not be reproduced.
SHUFFLE_SEED = 42
dataset = dataset.sample(frac=1, random_state=SHUFFLE_SEED)

In [33]:
# Keep only the two columns the notebook works with below: the tweet text
# and its sentiment label.
KEPT_COLUMNS = ['text', 'airline_sentiment']
dataset = dataset.loc[:, KEPT_COLUMNS]

# Preview the first ten rows.
dataset.head(10)

Unnamed: 0,text,airline_sentiment
14424,@AmericanAir redirect my flight without tellin...,negative
4108,@united Call customer service and of course th...,negative
11786,@USAirways Haha - that will indeed be a great ...,positive
3801,@united trade show! Come by both 130 for aweso...,neutral
10384,@USAirways Can't stress enough how awful the a...,negative
1412,@united bags left behind because plane overwe...,negative
5417,@SouthwestAir Suggestions: tell customers appr...,negative
986,@united really? Someone called in sick and th...,negative
2577,@united - you delayed our departure by 2 hrs t...,negative
9120,@USAirways well then that is a horrible flaw i...,negative


In [34]:
# Hold out 20% of the tweets for testing; random_state fixes which row
# positions end up in each part.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['airline_sentiment'],
    test_size=0.2,
    random_state=42,
)

In [35]:
# Text preprocessing for a single review / sentence.
def _english_stopwords():
    """Return the NLTK English stopword set, reading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def review_wordlist(review, remove_stopwords=False):
    """Convert raw text into a list of lowercase alphabetic tokens.

    Every character that is not an ASCII letter (digits, punctuation, '@',
    '#', '_', ...) is replaced by a space; the text is then lowercased and
    split on whitespace. Note that '@' is only removed as a symbol: the handle
    text itself (e.g. 'united') stays in the result.

    Args:
        review: Raw text to clean.
        remove_stopwords: When True, drop NLTK English stopwords.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # One raw-string pattern is enough: once everything except letters has
    # been turned into spaces there are no symbols or underscores left for a
    # second substitution to match.
    letters_only = re.sub(r"[^a-zA-Z]", " ", review)

    # Lowercase and split into words.
    words = letters_only.lower().split()

    # Optionally filter stopwords (the set is cached across calls).
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

In [36]:
# Download the NLTK resources used by this notebook: the 'punkt' sentence
# tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
# Word2Vec is trained on a list of sentences, each sentence being a list of words.
# The punkt tokenizer is used to split a paragraph into sentences first.

import nltk.data
# (disabled) alternative download of the whole 'popular' NLTK collection:
#nltk.download('popular')

# Load the English punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): the resource is a pickle file; only load it from a trusted
# NLTK data location.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [38]:
# Split a review into sentences, each one returned as a list of words.
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Tokenize a review into sentences and clean every sentence into words.

    Args:
        review: Raw review text; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` returns the raw sentences.
        remove_stopwords: Forwarded to ``review_wordlist`` for each sentence.

    Returns:
        A list of word lists, one per non-empty raw sentence, in order.
    """
    return [
        review_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0
    ]

In [39]:
# Build the Word2Vec training corpus: every sentence of every training
# review, each as a list of words (stopwords are kept at this stage).
print("Parsing sentences from training set")
sentences = [
    sentence
    for review in X_train
    for sentence in review_sentences(review, tokenizer)
]

Parsing sentences from training set


In [40]:
# Sanity check: the corpus must be a list whose items are lists of words.
outer_type = type(sentences)
inner_type = type(sentences[0])
print("List dari lists. Cek tipe data : ", outer_type, " of ", inner_type)

# One sample sentence and the total number of sentences.
print(sentences[105])
print(len(sentences))

List dari lists. Cek tipe data :  <class 'list'>  of  <class 'list'>
['united', 'darquenloveli', 'we', 'regret', 'to', 'hear', 'this']
23490


In [22]:
# Module-level accumulators: cust_param_search appends the accuracy and the
# parameter tuple of every evaluated combination to these lists. Re-run this
# cell before starting a new search, otherwise entries from earlier runs
# remain mixed in with the new ones.
best_score, best_parameter = [], []

In [44]:
# Parameters to optimise. Each key maps to the tuple of candidate values to
# try; only `size` has more than one candidate.
parm_dict = dict(
    workers=(4,),
    size=tuple(range(300, 601, 50)),
    min_count=(40,),
    window=(10,),
)

# Title template: doing X, with method Y, for Z:
# optimization of feature dimension on word embedding based on vector size using grid search for sentimen analysis

def _average_word_vector(words, model, num_features, vocab=None):
    """Average the Word2Vec vectors of the in-vocabulary words of one review.

    Args:
        words: Iterable of tokens.
        model: Trained Word2Vec model; vectors are read through ``model.wv``.
        num_features: Dimensionality of the word vectors.
        vocab: Optional pre-built set of vocabulary words; built from the
            model when omitted.

    Returns:
        A float32 vector of length ``num_features``. It is all zeros when no
        token is in the vocabulary (instead of the NaN vector a 0/0 division
        would give).
    """
    if vocab is None:
        vocab = set(model.wv.index2word)

    feature_vec = np.zeros(num_features, dtype="float32")
    nwords = 0
    for word in words:
        if word in vocab:
            nwords += 1
            feature_vec = np.add(feature_vec, model.wv[word])

    # Guard against reviews without any known word: dividing by zero would
    # fill the row with NaN.
    if nwords == 0:
        return feature_vec
    return np.divide(feature_vec, nwords)


def _average_feature_matrix(reviews, model, num_features):
    """Stack the averaged word vector of every review into a 2-D float32 array."""
    # Build the vocabulary set once per model rather than once per review.
    vocab = set(model.wv.index2word)
    matrix = np.zeros((len(reviews), num_features), dtype="float32")
    for counter, review in enumerate(reviews):
        # Status message every 1000th review.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))
        matrix[counter] = _average_word_vector(review, model, num_features, vocab)
    return matrix


def cust_param_search(parm_dict):
    """Grid-search Word2Vec parameters, scoring each by classifier accuracy.

    For every combination of the candidate values a Word2Vec model is trained
    on the module-level ``sentences``; the reviews in ``X_train`` / ``X_test``
    are turned into averaged word vectors and passed, together with
    ``y_train`` / ``y_test``, to ``classification_model``. Progress, the
    per-combination results and the best combination are printed.

    Side effects: the accuracy and parameter tuple of every combination are
    appended to the module-level lists ``best_score`` and ``best_parameter``
    (they are not cleared here).

    NOTE(review): ``size=`` and ``wv.index2word`` are the gensim 3.x names;
    gensim 4 renamed them to ``vector_size`` / ``index_to_key``.

    Args:
        parm_dict: Mapping with the keys 'workers', 'size', 'min_count' and
            'window', each holding an iterable of candidate values. If any of
            these keys is missing, the dict's values are taken in insertion
            order as (workers, size, min_count, window).

    Returns:
        None.
    """
    score_best, parm_best = 0, ()

    # Look the value tuples up by key so the result does not silently depend
    # on the order in which the dict was written; fall back to positional
    # order for dicts that use other key names.
    expected_keys = ('workers', 'size', 'min_count', 'window')
    if all(key in parm_dict for key in expected_keys):
        workers, size, min_count, window = [parm_dict[key] for key in expected_keys]
    else:
        workers, size, min_count, window = list(parm_dict.values())
    parm_combo = list(itertools.product(workers, size, min_count, window))  # all combinations

    print('\n==============================================================')
    print('PARAMETER')
    print('==============================================================')
    print(f'Total kombinasi parameter   : {len(parm_combo)}')
    print(f'Semua kombinasi parameter   : {parm_combo}')

    # Cleaning the reviews does not depend on the Word2Vec parameters, so it
    # is done once here instead of once per combination.
    clean_train_reviews = [review_wordlist(review, remove_stopwords=True) for review in X_train]
    clean_test_reviews = [review_wordlist(review, remove_stopwords=True) for review in X_test]

    # Grid search
    for i, parms in enumerate(parm_combo, start=1):

        print('\n==============================================================')
        print(f'{i}.\t Parameter : {parms}')
        print('==============================================================')

        w, s, m, wi = parms

        # ---- Word2Vec starts here ----
        model = word2vec.Word2Vec(workers=w,
                                  size=s,
                                  min_count=m,
                                  window=wi)
        model.build_vocab(sentences)

        # Train the Word2Vec model (CBOW, because 'sg' is left at its default).
        print('\n==============================================================')
        print("Training model Word2Vec ...")
        print('==============================================================')
        model.train(sentences=sentences, total_examples=len(sentences), epochs=5)

        # Averaged feature vectors for the training and the test set; these
        # are what the classifier consumes.
        trainDataVecs = _average_feature_matrix(clean_train_reviews, model, s)
        testDataVecs = _average_feature_matrix(clean_test_reviews, model, s)
        # ---- Word2Vec ends here ----

        print('\n==============================================================')
        print('Sentiment Analysis Process ...')
        print('==============================================================')
        # Call the classifier.
        score, report = classification_model(trainDataVecs, testDataVecs, y_train, y_test)

        # Per-combination result.
        print('\n==============================================================')
        print(f'Hasil Test Parameter ke - {i}')
        print(f'Parameter  => {parms}')
        print(f'Accuracy   => {score}')
        print(f'Classification Report => \n {report}')

        best_score.append(score)
        best_parameter.append(parms)

        if score > score_best:
            score_best = score
            parm_best = parms

    print('\n==============================================================')
    print(f'Best Parameter  => {parm_best}')
    print(f'Accuracy   => {score_best}')
    print('==============================================================')

In [42]:
def classification_model(train_feature_vec, test_feature_vec, train_sentiment, test_sentiment,
                         n_estimators=100, random_state=None):
    """Fit a random forest on the training vectors and score it on the test vectors.

    Any classifier could be used here; a random forest is the current choice.

    Args:
        train_feature_vec: Feature matrix used to fit the classifier.
        test_feature_vec: Feature matrix to predict on.
        train_sentiment: Labels matching ``train_feature_vec``.
        test_sentiment: True labels matching ``test_feature_vec``.
        n_estimators: Number of trees in the forest (default 100, the value
            that used to be hard-coded).
        random_state: Seed forwarded to the forest; the default ``None``
            keeps the previous unseeded behaviour, pass an int for
            repeatable results.

    Returns:
        tuple: ``(accuracy, report)`` where ``accuracy`` is the value of
        ``accuracy_score`` and ``report`` is the text produced by
        ``classification_report`` with 5 digits.
    """
    # Fit the random forest classifier on the training data.
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(train_feature_vec, train_sentiment)

    # Predict the sentiment of the test data.
    predicted = forest.predict(test_feature_vec)

    # Accuracy and per-class report.
    accuracy = accuracy_score(test_sentiment, predicted)
    report = classification_report(test_sentiment, predicted, digits=5)

    return accuracy, report

In [45]:
# Run the grid search over parm_dict (author's note: estimated running time
# 21.30, unit not stated).
cust_param_search(parm_dict)


PARAMETER
Total kombinasi parameter   : 7
Semua kombinasi parameter   : [(4, 300, 40, 10), (4, 350, 40, 10), (4, 400, 40, 10), (4, 450, 40, 10), (4, 500, 40, 10), (4, 550, 40, 10), (4, 600, 40, 10)]

1.	 Parameter : (4, 300, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712




Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 1
Parameter  => (4, 300, 40, 10)
Accuracy   => 0.719603825136612
Classification Report => 
               precision    recall  f1-score   support

    negative    0.75112   0.91608   0.82544      1835
     neutral    0.55714   0.37440   0.44785       625
    positive    0.71111   0.41026   0.52033       468

    accuracy                        0.71960      2928
   macro avg    0.67312   0.56691   0.59787      2928
weighted avg    0.70332   0.71960   0.69607      2928


2.	 Parameter : (4, 350, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712
Review 3000 of 11712




Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 2
Parameter  => (4, 350, 40, 10)
Accuracy   => 0.7172131147540983
Classification Report => 
               precision    recall  f1-score   support

    negative    0.75600   0.91008   0.82591      1835
     neutral    0.53756   0.36640   0.43578       625
    positive    0.68601   0.42949   0.52825       468

    accuracy                        0.71721      2928
   macro avg    0.65985   0.56866   0.59665      2928
weighted avg    0.69818   0.71721   0.69506      2928


3.	 Parameter : (4, 400, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712
Review 3000 of 11712




Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 3
Parameter  => (4, 400, 40, 10)
Accuracy   => 0.7161885245901639
Classification Report => 
               precision    recall  f1-score   support

    negative    0.75248   0.90954   0.82359      1835
     neutral    0.54779   0.37600   0.44592       625
    positive    0.68683   0.41239   0.51535       468

    accuracy                        0.71619      2928
   macro avg    0.66237   0.56598   0.59495      2928
weighted avg    0.69829   0.71619   0.69371      2928


4.	 Parameter : (4, 450, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712




Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 4
Parameter  => (4, 450, 40, 10)
Accuracy   => 0.7110655737704918
Classification Report => 
               precision    recall  f1-score   support

    negative    0.74498   0.90899   0.81885      1835
     neutral    0.53810   0.36160   0.43254       625
    positive    0.69888   0.40171   0.51018       468

    accuracy                        0.71107      2928
   macro avg    0.66065   0.55743   0.58719      2928
weighted avg    0.69345   0.71107   0.68705      2928


5.	 Parameter : (4, 500, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712




Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 5
Parameter  => (4, 500, 40, 10)
Accuracy   => 0.717896174863388
Classification Report => 
               precision    recall  f1-score   support

    negative    0.75157   0.91172   0.82393      1835
     neutral    0.55875   0.37280   0.44722       625
    positive    0.68772   0.41880   0.52058       468

    accuracy                        0.71790      2928
   macro avg    0.66601   0.56777   0.59725      2928
weighted avg    0.70021   0.71790   0.69504      2928


6.	 Parameter : (4, 550, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712
Review 3000 of 11712




Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 6
Parameter  => (4, 550, 40, 10)
Accuracy   => 0.7141393442622951
Classification Report => 
               precision    recall  f1-score   support

    negative    0.75056   0.91008   0.82266      1835
     neutral    0.53349   0.36960   0.43667       625
    positive    0.70370   0.40598   0.51491       468

    accuracy                        0.71414      2928
   macro avg    0.66258   0.56189   0.59141      2928
weighted avg    0.69674   0.71414   0.69108      2928


7.	 Parameter : (4, 600, 40, 10) 


Training model Word2Vec ...
Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712




Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712
Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928

Sentiment Analysis Process ...

Hasil Test Parameter ke - 7
Parameter  => (4, 600, 40, 10)
Accuracy   => 0.7155054644808743
Classification Report => 
               precision    recall  f1-score   support

    negative    0.74877   0.91117   0.82203      1835
     neutral    0.54916   0.36640   0.43954       625
    positive    0.69784   0.41453   0.52011       468

    accuracy                        0.71551      2928
   macro avg    0.66526   0.56403   0.59389      2928
weighted avg    0.69802   0.71551   0.69212      2928


Best Parameter  => (4, 300, 40, 10)
Accuracy   => 0.719603825136612


In [49]:
# Show every recorded accuracy, then the parameter tuples they belong to
# (one list per line).
print(best_score, best_parameter, sep="\n")

[0.7134562841530054, 0.7151639344262295, 0.7175546448087432, 0.717896174863388, 0.7182377049180327, 0.719603825136612, 0.7172131147540983, 0.7161885245901639, 0.7110655737704918, 0.717896174863388, 0.7141393442622951, 0.7155054644808743]
[(4, 300, 40, 10), (4, 350, 40, 10), (4, 400, 40, 10), (4, 450, 40, 10), (4, 500, 40, 10), (4, 300, 40, 10), (4, 350, 40, 10), (4, 400, 40, 10), (4, 450, 40, 10), (4, 500, 40, 10), (4, 550, 40, 10), (4, 600, 40, 10)]
