In [1]:
import pandas as pd
import numpy as np
import malaya
import pyLDAvis
from textblob import TextBlob

# import dataset
# Load the tweets from a CSV file expected in the current working directory.
df = pd.read_csv('tweet_concat.csv')
# Show (rows, columns) as a quick sanity check that the file loaded.
df.shape

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


(713, 31)

In [2]:
# features extraction
# Keep only the columns used by the rest of the notebook.
features = ['created_at','id','user','full_text']
# .copy() makes `data` an independent frame: without it, writing a column on a
# selection of `df` is the pattern that raises pandas' SettingWithCopyWarning.
data = df[features].copy()
# Cast the tweet text to str so the text models receive strings only.
data['full_text'] = data['full_text'].astype('str')
text = data.full_text
text_list = text.values.tolist() # model requires list as input

In [3]:
# language detection
# Predict one language label per tweet with malaya's multinomial detector.
mn_lang = malaya.language_detection.multinomial()
lang = mn_lang.predict_batch(text_list)

# add lang column
# assign() returns a new frame with the extra column instead of writing into
# `data` in place, so this cell never triggers SettingWithCopyWarning and gives
# the same result when re-run.
data = data.assign(lang=lang)
data.lang.value_counts() # ISSUE: model interprets malay as other

ENGLISH      331
OTHER        216
MALAY        160
INDONESIA      6
Name: lang, dtype: int64

In [4]:
# separate into english and malay
# .copy() gives each subset its own data; later cells add a 'sentiment' column
# to these frames, which would otherwise be a write on a filtered selection of
# `data` (SettingWithCopyWarning).
english = data[data['lang'] == 'ENGLISH'].copy()
english_text = english[['full_text']]
english_text_list = english_text.full_text.values.tolist() # model requires list as input

# other and indonesia mostly consist of malay based on observation
# Everything not labelled ENGLISH is therefore treated as Malay.
malay = data[data['lang'] != 'ENGLISH'].copy()
malay_text = malay[['full_text']]
malay_text_list = malay_text.full_text.values.tolist() # model requires list as input

In [5]:
# sentiment analysis for english
# Label each tweet from TextBlob's polarity score: a polarity of zero or above
# is 'positive' (so neutral text counts as positive), anything else 'negative'.
# The two-way conditional always yields exactly one label per tweet, so the
# list length is guaranteed to match the number of rows in `english`.
english_sentiment = [
    'positive' if TextBlob(tweet).sentiment.polarity >= 0 else 'negative'
    for tweet in english_text_list
]

# add sentiment column
# assign() builds a new frame rather than writing into a filtered selection,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
english = english.assign(sentiment=english_sentiment)
english.sentiment.value_counts()

positive    258
negative     73
Name: sentiment, dtype: int64

In [6]:
# sentiment analysis for malay
# Classify every non-English tweet with malaya's XGBoost sentiment model.
malay_sentiment_xgb = malaya.sentiment.xgb()
malay_sentiment = malay_sentiment_xgb.predict_batch(malay_text_list) # get_proba=True

# add sentiment column
# assign() builds a new frame rather than writing into a filtered selection,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
malay = malay.assign(sentiment=malay_sentiment)
malay.sentiment.value_counts()

positive    248
negative    134
Name: sentiment, dtype: int64

In [7]:
# topic modeling for english
# Hand-maintained list of English stop words passed to the topic model below.
stopwords_eng = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# Fit an LDA topic model on the English tweets without stemming, using a
# skip-gram vectorizer and the stop-word list above.
# NOTE(review): the positional 10 is presumably the number of topics - confirm
# against the malaya.topic_model.lda signature.
lda = malaya.topic_model.lda(english_text_list,10,stemming=False,vectorizer='skip-gram',ngram=(1,4),skip=3,stop_words=stopwords_eng)

In [8]:
# Visualise the English topic model in the notebook and keep the returned
# object in `prepared_data` so it can be exported afterwards.
prepared_data = lda.visualize_topics(notebook_mode = True)

In [9]:
# Export the English topic visualisation to 'pylda.html' (a relative path, so
# the file is written to the current working directory).
pyLDAvis.save_html(prepared_data, 'pylda.html')

In [10]:
# topic modeling for malay
# Fit a topic model on the non-English tweets with the same settings as the
# English model, except that no stop_words argument is passed.
# NOTE(review): the variable is called `lda2vec` but it is built with
# malaya.topic_model.lda, so it is an LDA model - consider renaming.
# NOTE(review): confirm what stop words malaya applies when none are supplied.
lda2vec = malaya.topic_model.lda(malay_text_list,10,stemming=False,vectorizer='skip-gram',ngram=(1,4),skip=3)

In [11]:
# Visualise the Malay topic model in the notebook; the result is only
# displayed here, not stored or saved to disk.
lda2vec.visualize_topics(notebook_mode = True)

In [12]:
# ensemble
# Stack the English and Malay frames back into one table. The two frames are
# disjoint subsets with the same columns, so concatenation is the right
# operation: an outer merge with no `on=` joins on every shared column, which
# is slower and leaves row order at the mercy of merge's key sorting.
# ignore_index=True renumbers the rows 0..n-1, English rows first.
output = pd.concat([english, malay], ignore_index=True)
output

Unnamed: 0,created_at,id,user,full_text,lang,sentiment
0,Tue Apr 09 13:22:40 +0000 2019,1115605950998560768,"{'id': 136104635, 'id_str': '136104635', 'name...",Just wanna ask whether u prefer sleep on d pro...,ENGLISH,negative
1,Tue Apr 09 12:59:52 +0000 2019,1115600213484265477,"{'id': 1065994908148760578, 'id_str': '1065994...",hey unifi can you help me,ENGLISH,positive
2,Tue Apr 09 11:59:00 +0000 2019,1115584897567973382,"{'id': 175719741, 'id_str': '175719741', 'name...",Check out my Speedtest results 323 5 Mbps down...,ENGLISH,negative
3,Tue Apr 09 11:53:27 +0000 2019,1115583501628108800,"{'id': 94323843, 'id_str': '94323843', 'name':...",Join our postpaid plan OR refer 5 frens,ENGLISH,positive
4,Tue Apr 09 11:53:27 +0000 2019,1115583499300245504,"{'id': 94323843, 'id_str': '94323843', 'name':...",khabarbaik ED SHEERAN tickets for you YES you ...,ENGLISH,positive
5,Tue Apr 09 11:49:20 +0000 2019,1115582464322297857,"{'id': 94323843, 'id_str': '94323843', 'name':...",khabarbaik They are ready to win the RM10 000 ...,ENGLISH,positive
6,Tue Apr 09 09:34:03 +0000 2019,1115548417806102529,"{'id': 339368391, 'id_str': '339368391', 'name...",stupid unifi always down please don t be afrai...,ENGLISH,negative
7,Tue Apr 09 09:10:09 +0000 2019,1115542405065764864,"{'id': 44583755, 'id_str': '44583755', 'name':...",My wish is to earn just enough to pay for my c...,ENGLISH,negative
8,Tue Apr 09 06:16:28 +0000 2019,1115498696991334400,"{'id': 96706317, 'id_str': '96706317', 'name':...",The sendu laif of streamyx user Whenever it s ...,ENGLISH,positive
9,Tue Apr 09 05:35:51 +0000 2019,1115488475481018368,"{'id': 265909611, 'id_str': '265909611', 'name...",Malaysia Singapore agree to find amicable solu...,ENGLISH,positive
