In [1]:
import numpy as np
import pandas as pd
import os
import re
import nltk
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
from pprint import pprint

warnings.filterwarnings('ignore')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import spacy

In [2]:
SOURCE_FILE = 'data/posts_data_gab.csv'

print('[debug] labeled words')
rootdir = "data"
positive_words = []
negative_words = []
neutral_words = []

for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1] == '1' or os.path.split(directories)[1] == '2' or os.path.split(directories)[1] == '3' or os.path.split(directories)[1] == '4'):
        for filename in files:
            if (filename == 'positive.txt'):
                with open(os.path.join(directories, filename)) as f:
                    for line in f:
                        positive_words.append(line.strip())
            if (filename == 'negative.txt'):
                with open(os.path.join(directories, filename)) as f:
                    for line in f:
                        negative_words.append(line.strip())
            if (filename == 'neutral.txt'):
                with open(os.path.join(directories, filename), encoding='latin-1') as f:
                    for line in f:
                        neutral_words.append(line.strip())

[debug] labeled words


In [3]:
df = pd.read_csv('data/posts_data_gab.csv')
data = df['Body']
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [4]:
print('[debug] stopwords')
nltk.download('stopwords')
en_stop = nltk.corpus.stopwords.words('english')
en_stop.extend(['from', 'subject', 're', 'edu', 'use'])
fil_stop = []
with open('assets/stop_words_ph.txt') as f:
    for line in f:
        fil_stop.append(line.rstrip('\n')) 
nltk.download('wordnet')

[debug] stopwords
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [6]:
print('[debug] word2vec')
text_data = []
counter = 0
index = 0

for line in data:
    tokens = gensim.utils.simple_preprocess(line)
    tokens = [re.sub('\#ADMUFreedomWall\ \d+', '', sent) for sent in tokens]
    tokens = [re.sub('\#ADMUFreedomWall\d+', '', sent) for sent in tokens]
    tokens = [word for word in tokens if word not in fil_stop]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 2]
    tokens = [get_lemma(token) for token in tokens]
    
    text_data.append(tokens)
    counter += len(tokens)
    index+=1
    
print('Words in text data: ', counter)
numbers_set = set(i for j in text_data for i in j)
print('Unique words:', len(numbers_set))
# print()

[debug] word2vec
Words in text data:  24577
Unique words: 5630


In [7]:
model = gensim.models.Word2Vec(text_data, size=50, window=5, min_count=5, workers=4)
model.train(text_data, total_examples=len(text_data) , epochs=100)
# For words that dont occur as much
model_sparse = gensim.models.Word2Vec(text_data, size=50, window=5, min_count=1, workers=4)
model_sparse.train(text_data, total_examples=len(text_data) , epochs=100)

(2303600, 2457700)

In [8]:
print('[debug] tf-idf')
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data)
feature_names = vectorizer.get_feature_names()
tf_idf_list = []
for i in range(len(data)):
    sentence_dict = {}
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
        sentence_dict[w] = s
    tf_idf_list.append(sentence_dict)

[debug] tf-idf


In [88]:
def get_sentiment(sentence):
    tokens = sentence.split(' ')
    sum_score = 0
    relevant_words = 0
    
    for token in tokens:
        if token in positive_words:
            print("Word: " + token + ", score: 1")
            print()
            sum_score += 1
            relevant_words += 1
        elif token in negative_words:
            print("Word: " + token + ", score: -1")
            print()
            sum_score -= 1
            relevant_words += 1
        elif token in neutral_words:
            print("Word: " + token)
            try:
                neighbors = model.wv.most_similar(positive=token, topn=100)
            except:
                try:
                    neighbors = model_sparse.wv.most_similar(positive=token,topn=100)
                except:
                    pass
            for word in neighbors:
                if word[0] in positive_words:
                    print("Nearest labelled word: " + word[0] + ", score: " + str(word[1]))
                    print()
                    sum_score += word[1]
                    relevant_words += 1
                    break
                elif word[0] in negative_words:
                    print("Nearest labelled word: " + word[0] + ", score: -" + str(word[1]))
                    print()
                    sum_score -= word[1]
                    relevant_words += 1
                    break
        else:
            pass
#             print('"' + token + '" not in model')
    val = sum_score/relevant_words
    return val

In [10]:
# topic modeling part
print('[debug] start topic modeling part')
text_data = []
df = pd.read_csv(SOURCE_FILE)
data = df['Body']
data.dropna(inplace=True)

data = df['Body'].values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
data = [re.sub('\#ADMUFreedomWall\ \d+', '', sent) for sent in data]
data = [re.sub('\#ADMUFreedomWall\d+', '', sent) for sent in data]

[debug] start topic modeling part


In [11]:
print('[debug] tokens')
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))

[debug] tokens


In [12]:
print('[debug] bigrams and trigrams')print('[debug] bigrams and trigrams')
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

[debug] bigrams and trigrams


In [13]:
def remove_stopwords(texts):
    output = [[word for word in simple_preprocess(str(doc)) if (word not in en_stop and word not in fil_stop)] for doc in texts]
    return output
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
print('[debug] lemmatize')
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

[debug] lemmatize


In [15]:
print('[debug] creating corpus')
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

[debug] creating corpus


In [25]:
print('[debug] building LDA model')
num_topics = 20
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
doc_lda = lda_model[corpus]
mallet_path = 'C:\\Mallet/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('Coherence Score: ', coherence_ldamallet)

[debug] building LDA model
Coherence Score:  0.42061005578430144


In [26]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

optimal_model = ldamallet
model_topics = optimal_model.show_topics(formatted=False)

In [54]:
print('[debug] topic interpretation')
counter = 0
topic_df = pd.DataFrame()
curr_topics = []
while True:
    try:
        curr_topic = optimal_model.show_topic(counter)
        curr_topics.append([topic[0] for topic in curr_topic])
        counter += 1
    except:
        break
topic_df['Topic Keywords'] = curr_topics

[debug] topic interpretation


In [81]:
def get_topic(sentence):
    bow = id2word.doc2bow(simple_preprocess(sentence))
    topic_weights = optimal_model[bow]
    topic = [t[1] for t in topic_weights]
    topic_id = np.argmax(topic)
    return (topic_id)

def get_sentence(sentence):
    test_string = sentence
    try:
        sent = get_sentiment(test_string)
        topic = get_topic(test_string)
        print(f'Sentiment: {sent}')
        print(f'Topic: {topic}')
    except Exception as e:
        print('Error:', e)
    

In [82]:
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.075*"school" + 0.027*"lot" + 0.024*"university" + 0.019*"learn" + '
  '0.019*"turn" + 0.019*"care" + 0.017*"write" + 0.017*"matter" + '
  '0.016*"light" + 0.016*"comment"'),
 (1,
  '0.067*"work" + 0.058*"hard" + 0.031*"sem" + 0.027*"group" + 0.027*"pass" + '
  '0.027*"give" + 0.027*"cheat" + 0.026*"prof" + 0.024*"grade" + 0.024*"fail"'),
 (2,
  '0.046*"hate" + 0.027*"die" + 0.026*"realize" + 0.026*"family" + '
  '0.025*"joke" + 0.019*"kill" + 0.018*"notice" + 0.016*"depression" + '
  '0.015*"tweet" + 0.014*"freshie"'),
 (3,
  '0.078*"people" + 0.069*"thing" + 0.039*"wrong" + 0.028*"problem" + '
  '0.027*"understand" + 0.026*"country" + 0.021*"support" + 0.021*"agree" + '
  '0.020*"bad" + 0.018*"duterte"'),
 (4,
  '0.052*"final" + 0.038*"cute" + 0.032*"doo_doo" + 0.027*"wanna" + '
  '0.025*"great" + 0.024*"tag" + 0.022*"open" + 0.019*"meme" + 0.014*"lie" + '
  '0.014*"pic"'),
 (5,
  '0.048*"hope" + 0.035*"stay" + 0.028*"point" + 0.028*"lose" + 0.027*"show" + '
  '0.025*"give" 

In [92]:
get_sentence("Looking for any Psych major who can do my Psych 101 final paper for me. Willing to pay. Thanks")

Word: major
Nearest labelled word: desperate, score: -0.6196975111961365

Word: can
Nearest labelled word: desperate, score: -0.6196975111961365

Word: do
Nearest labelled word: boring, score: -0.49341583251953125

Word: final
Nearest labelled word: mayabang, score: -0.39030808210372925

Word: paper
Nearest labelled word: huhu, score: -0.34044116735458374

Sentiment: -0.49271202087402344
Topic: 1
