In [1]:
import pandas as pd
import numpy as np
import malaya
import pyLDAvis
from textblob import TextBlob

# import dataset
df = pd.read_csv('tweet_concat.csv')
#list(df)
df.shape

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


(713, 31)

In [2]:
# features extraction
features = ['created_at','id','user','full_text']
data = df[features]
data.full_text = data.full_text.astype('str')
text = data.full_text
text_list = text.values.tolist() # model requires list as input

In [3]:
# language detection
mn_lang = malaya.language_detection.multinomial()
lang = mn_lang.predict_batch(text_list)

# add lang column
data['lang'] = lang
data.lang.value_counts() # ISSUE: model interprets malay as other

ENGLISH      331
OTHER        216
MALAY        160
INDONESIA      6
Name: lang, dtype: int64

In [4]:
# separate into english and malay
english = data[data['lang'] == 'ENGLISH']
english_text = english[['full_text']]
english_text_list = english_text.full_text.values.tolist() # model requires list as input

# other and indonesia mostly consist of malay based on observation
malay = data[data['lang'] != 'ENGLISH']
malay_text = malay[['full_text']]
malay_text_list = malay_text.full_text.values.tolist() # model requires list as input

In [5]:
# sentiment analysis and subjectivity analysis for english
english_sentiment = []
english_subjectivity = []

for tweet in english_text_list:
    blob = TextBlob(tweet)
    analysis = blob.sentiment
    sentiment = analysis[0]
    subject = analysis[1]
    # scale of -1 to 1
    if sentiment>0:
        english_sentiment.append('positive')
    elif sentiment<0:
        english_sentiment.append('negative')
    else:
        english_sentiment.append('neutral')
    # scale of 0 to 1
    if subject>0.6:
        english_subjectivity.append('subjective')
    elif subject<0.4:
        english_subjectivity.append('objective')
    else:
        english_subjectivity.append('mixed')
    
# add sentiment and subjectivity column
english['sentiment'] = english_sentiment
english['subjectivity'] = english_subjectivity
english.sentiment.value_counts()

positive    135
neutral     123
negative     73
Name: sentiment, dtype: int64

In [6]:
english.subjectivity.value_counts()

objective     172
mixed          88
subjective     71
Name: subjectivity, dtype: int64

In [10]:
my_sentiment = []
my_subjective = []
# sentiment analysis for malay
malay_sentiment_xgb = malaya.sentiment.xgb()
malay_sentiment = malay_sentiment_xgb.predict_batch(malay_text_list, get_proba=True) # get_proba=True

# create list of sentiments(positive/negative/neutral)
for item in malay_sentiment:
    if item['negative'] > 0.45 and item['negative'] < 0.55:
        my_sentiment.append("neutral")
    elif item['negative'] > item['positive']:
        my_sentiment.append("negative")
    else:
        my_sentiment.append("positive")

# subjectivity analysis for malay
malay_subjective_xgb = malaya.subjective.xgb()
malay_subjective = malay_subjective_xgb.predict_batch(malay_text_list, get_proba=True)
# create list of subjectivity(positive/negative/neutral)
for item in malay_subjective:
    if item['negative'] > 0.45 and item['negative'] < 0.55:
        my_subjective.append("mixed")
    elif item['negative'] > item['positive']:
        my_subjective.append("objective")
    else:
        my_subjective.append("subjective")
        
# add sentiment & subjective column
malay['sentiment'] = my_sentiment
malay['subjectivity'] = my_subjective
malay.sentiment.value_counts()

positive    222
negative    115
neutral      45
Name: sentiment, dtype: int64

In [8]:
malay.subjectivity.value_counts()

Negative    280
Positive     74
Neutral      28
Name: subjectivity, dtype: int64

In [8]:
# topic modeling for english
stopwords_eng = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
english_lda = malaya.topic_model.lda(english_text_list,10,stemming=False,vectorizer='skip-gram',ngram=(1,4),skip=3,stop_words=stopwords_eng)

In [None]:
# return embeded visualization data
english_vis = english_lda.visualize_topics(notebook_mode = False)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [17/Apr/2019 23:43:56] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [17/Apr/2019 23:43:56] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [17/Apr/2019 23:43:56] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [17/Apr/2019 23:43:56] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


In [None]:
# top 10 english topics
english_topics = english_lda.top_topics(5, top_n=10, return_df=True)

In [None]:
# topic modeling for malay
malay_lda = malaya.topic_model.lda(malay_text_list,10,stemming=False,vectorizer='skip-gram',ngram=(1,4),skip=3)

In [None]:
malay_vis = malay_lda.visualize_topics(notebook_mode = False)

In [None]:
# top 10 malay topics
malay_topics = malay_lda.top_topics(5, top_n=10, return_df=True)

In [None]:
# ensemble
output = english.merge(malay, how='outer')
output

# save to html file for deployment
pyLDAvis.save_html(english_vis, 'english_vis.html')
pyLDAvis.save_html(malay_vis, 'malay_vis.html')
# export dataframe for mobile deployment
english_topics
malay_topics