In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Importing library
import numpy as np
import pandas as pd

# BeautifulSoup = hapus tag html
from bs4 import BeautifulSoup 
import re # regular expressions (regex)

# natural language tool kits
from nltk.corpus import stopwords
import nltk

# word2vec library
from gensim.models import word2vec
import itertools

# Packages required for data preparation
from sklearn.model_selection import train_test_split

# library untuk Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# path dataset
data_path = r'/content/drive/MyDrive/Text Dataset/Tweets.csv'
dataset = pd.read_csv(data_path, header=0, sep = ',')

dataset = dataset.reindex(np.random.permutation(dataset.index))  

In [5]:
dataset = dataset[['text', 'airline_sentiment']]
# dataset.text = dataset.text.apply(remove_stopwords).apply(remove_mentions)

dataset.head(10)

Unnamed: 0,text,airline_sentiment
14471,"@AmericanAir @GolfWithWoody Don't buy it, Wood...",negative
6464,@SouthwestAir thanks! I expected a wait... Jus...,negative
9154,@USAirways the disappointment was not the bag ...,negative
7414,@JetBlue from San Diego with a 4 hour delay to...,negative
8466,@JetBlue Amazingly Awesome customer service fr...,positive
12311,@AmericanAir Connection is US2065,neutral
12716,@AmericanAir right so I missed my connection /...,negative
6304,@SouthwestAir she cut people off 35 minutes be...,negative
8901,@JetBlue @KyleJudah It doesn't matter who you ...,negative
7644,@JetBlue my flight 475 Cancelled Flighted due ...,negative


In [6]:
X_train, X_test, y_train, y_test = train_test_split(dataset.text, dataset.airline_sentiment, test_size = 0.2, random_state = 42)

In [7]:
# proprocessing data teks
def review_wordlist(review, remove_stopwords=False):
    
    # hapus simbol
    review_text = re.sub("[^a-zA-Z]"," ",review)
    
    # hapus @
    review_text = re.sub("([^\s\w]|_@?)+"," ", review_text)

    # konversi ke huruf kecil dan dipisah perkata
    words = review_text.lower().split()

    # menghapus stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [8]:
# download file punctuation and stpwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# word2vec expects a list of lists.
# Using punkt tokenizer for better splitting of a paragraph into sentences.

import nltk.data
#nltk.download('popular')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
# Memisah review per kalimat
def review_sentences(review, tokenizer, remove_stopwords=False):
    
    # melakukan tokenize dengan nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []

    # mengisi array sentences dengan masing - masing review
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence, remove_stopwords))

    # list dari list
    return sentences

In [11]:
sentences = []
print("Parsing sentences from training set")
for review in X_train:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set


In [12]:
print("List dari lists. Cek tipe data : ", type(sentences), " of ", type(sentences[0]))
# print(sentences)
print(sentences[105])
print(len(sentences))

List dari lists. Cek tipe data :  <class 'list'>  of  <class 'list'>
['united', 'can', 'you', 'explain', 'how', 'it', 'rewards', 'us', 'elite', 'premiere', 'k', 'members']
23447


In [13]:
# membuat model dan mengatur nilai parameter
num_features = 300  # Word vector dimensionality

In [15]:
# Inisialisasi model
from gensim.models import word2vec
# model = word2vec.Word2Vec(workers = num_workers, 
#                           size = num_features, 
#                           min_count = min_word_count, 
#                           window = context, 
#                           sg = 1, # sg = 1 (skipgram), default/0 adalah cbow
#                           sample = downsampling)
model = word2vec.Word2Vec(size = 300)
model.build_vocab(sentences)

In [16]:
print("Training model....")
# model.train(sentences = sentences, total_examples = len(sentences), epochs = model.iter)
model.train(sentences = sentences, total_examples = len(sentences), epochs = 2)

Training model....


(284590, 422470)

In [17]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [18]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [20]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in X_train:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 11712
Review 1000 of 11712


  del sys.path[0]


Review 2000 of 11712
Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712


In [21]:
# Calculating average feature vactors for test set     
clean_test_reviews = []
for review in X_test:
    clean_test_reviews.append(review_wordlist(review,remove_stopwords=True))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 2928
Review 1000 of 2928


  del sys.path[0]


Review 2000 of 2928


In [22]:
# Fitting a random forest classifier to the training data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, y_train)

# Prediksi nilai sentiment untuk test data 
predicted = forest.predict(testDataVecs)

# akurasi
accuracy = accuracy_score(y_test, predicted)
report = classification_report(y_test, predicted, digits = 5)

Fitting random forest to training data....


In [23]:
print(report)

              precision    recall  f1-score   support

    negative    0.71192   0.93733   0.80922      1835
     neutral    0.56842   0.25755   0.35449       629
    positive    0.61674   0.30172   0.40521       464

    accuracy                        0.69057      2928
   macro avg    0.63236   0.49887   0.52297      2928
weighted avg    0.66601   0.69057   0.64751      2928

