In [1]:
SOURCE_FILE = 'data/posts_data_gab.csv'

import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# NLTK Stop words
from nltk.corpus import stopwords

# English stop words from NLTK, plus a few extra tokens to filter out.
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

# Extra stop words loaded from the assets file, one entry per line
# (only trailing newline characters are stripped from each line).
fil_stop_words = []
with open('assets/stop_words_ph.txt') as f:
    fil_stop_words.extend(line.rstrip('\n') for line in f)

In [3]:
# Import Dataset
# `text_data` is initialised here but not referenced again in the visible cells.
text_data = []
# Load the scraped posts; SOURCE_FILE is set in the first cell.
df = pd.read_csv(SOURCE_FILE)
# Take the post text column and drop missing entries from it in place.
# NOTE(review): `data` is the Series object returned by df['Body'], and later
# cells look up df['Body'] again. Whether this in-place drop is visible through
# those later lookups depends on the pandas version -- confirm before relying on it.
data = df['Body']
data.dropna(inplace=True)

In [4]:
# Preview the first 15 rows of the loaded frame
df.head(15)

Unnamed: 0,Title,Body,Submitted
0,18294,"there’s this guy who’s been making too much malaswa, dagger stares at me, even if I never done anything wrong with him. The way he looked at me is like he’s wants to either beat me up physically or even sexually. he’s too old to act so irrational.\r\n\r\nAteneo pls help make the school a better place rather than bring it a “malaswa playground”","December 9, 2018 2:44:26 AM +08"
1,18293,You call yourself my best friend pero you cant even give 5 mins of your time i needed someone to talk tapos tangina you said stuff pa to make me feel bad about myself\r\n\r\nGet your head out of your fucking ass and stop thinking na everyone wants to make out with u,"December 9, 2018 2:44:05 AM +08"
2,18292,#Daddy Jett 🙌,"December 9, 2018 2:43:23 AM +08"
3,18291,"https://www.facebook.com/braveyawtz/videos/1744836578868329/\r\n\r\nIt's crazy to realize how much all the attention on the 'war on drugs' has died down. Seeing this video was reminiscent to how drug cartels would summarily execute people off the streets. If people are dying left and right, and the notion of labeling their deaths as 'police self defence,' or that the victims fought back, and leaving it at that, then no one is safe to this kind of cruelty.\r\n Tomorrow, the very people being left to die on the streets just might be me and you. Unless of course you're of a rich and/or influential family. Move along, I guess this shouldn't concern you.","December 9, 2018 2:16:15 AM +08"
4,18290,"I saw this girl last thursday. She was hanging out with a bunch of Icans and while I was going down the stairs, I tripped. All of them laughed except for one, she helped me and even asked if I was okay. I was so flustered and just stared at her face. She had the most amazing smile and I couldn't even utter a word. It was funny because she actually got mad at her friends for laughing at me. And as she was helping me pick up my things, she giggled, not bec\r\nause of me but because she also fell in those damn stairs in the library and she felt like shit when people just stared at her so she didn't want the same thing to happen to me. She had such a distinctive laugh, the best one I have ever heard so far. She had to leave before I could even thank her and ask for her name. All I know is that she was wearing the film org lanyard. She might be an Ican because I knew some of the people she was with were from that school. I really want to see this girl again. I have yet to thank her and I really am looking forward to personally meet someone with her kindness. If any of you know her, please tag her. Because I think I just met my soul mate.","December 9, 2018 2:15:33 AM +08"
5,18289,"ok so...\r\n\r\nsince we do well in UAAP Women's volleyball, can the team members post shirtless pics on their Instagram as a service to the THOUSANDS of fans like me who watched their games thoughout this season?????????? I probably spent more than 5k for tickets this season\r\n Hello to all the hot gals in the team. pls tag them here :(#ADMUFreedomWall18235","December 9, 2018 2:01:45 AM +08"
6,18288,Only fans will know this #teamdentistjet,"December 9, 2018 1:53:15 AM +08"
7,18287,mood,"December 9, 2018 1:49:07 AM +08"
8,18286,,
9,18285,u know there's something wrong w the prof when their students ask other classes for notes and resources because their own prof didn't fucking teach them anything,"December 9, 2018 1:09:35 AM +08"


In [5]:
# Convert to list, skipping posts without body text so that every entry is a
# string (re.sub raises TypeError on the float NaN pandas uses for missing
# values). If the missing rows were already removed this dropna() is a no-op.
data = df['Body'].dropna().values.tolist()

# The patterns below are raw strings so that regex escapes such as \S and \d
# are passed to `re` untouched instead of being parsed as (invalid) Python
# string escapes.

# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters (any whitespace run collapses to a single space)
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [sent.replace("'", "") for sent in data]

# Remove Other ADMU FW mentions, with and without a space before the number
data = [re.sub(r'\#ADMUFreedomWall\ \d+', '', sent) for sent in data]
data = [re.sub(r'\#ADMUFreedomWall\d+', '', sent) for sent in data]

pprint(data[:15])

['there’s this guy who’s been making too much malaswa, dagger stares at me, '
 'even if I never done anything wrong with him. The way he looked at me is '
 'like he’s wants to either beat me up physically or even sexually. he’s too '
 'old to act so irrational. Ateneo pls help make the school a better place '
 'rather than bring it a “malaswa playground”',
 'You call yourself my best friend pero you cant even give 5 mins of your time '
 'i needed someone to talk tapos tangina you said stuff pa to make me feel bad '
 'about myself Get your head out of your fucking ass and stop thinking na '
 'everyone wants to make out with u',
 '#Daddy Jett 🙌',
 'https://www.facebook.com/braveyawtz/videos/1744836578868329/ Its crazy to '
 'realize how much all the attention on the war on drugs has died down. Seeing '
 'this video was reminiscent to how drug cartels would summarily execute '
 'people off the streets. If people are dying left and right, and the notion '
 'of labeling their deaths as poli

In [6]:
# Tokenizing
def sent_to_words(sentences):
    """Lazily turn each sentence into a list of word tokens.

    Every item of ``sentences`` is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess``. ``deacc=True`` strips accent marks
    from the tokens; punctuation is discarded by the tokenizer itself.

    Parameters
    ----------
    sentences : iterable
        Raw documents; non-string items are stringified first.

    Yields
    ------
    list of str
        The tokens of one document, in input order.
    """
    for text in map(str, sentences):
        yield gensim.utils.simple_preprocess(text, deacc=True)

# Run the tokenizer over every cleaned post and keep the token lists in memory
# so later cells can iterate over them more than once.
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first tokenized post
print(data_words[:1])

[['there', 'this', 'guy', 'who', 'been', 'making', 'too', 'much', 'malaswa', 'dagger', 'stares', 'at', 'me', 'even', 'if', 'never', 'done', 'anything', 'wrong', 'with', 'him', 'the', 'way', 'he', 'looked', 'at', 'me', 'is', 'like', 'he', 'wants', 'to', 'either', 'beat', 'me', 'up', 'physically', 'or', 'even', 'sexually', 'he', 'too', 'old', 'to', 'act', 'so', 'irrational', 'ateneo', 'pls', 'help', 'make', 'the', 'school', 'better', 'place', 'rather', 'than', 'bring', 'it', 'malaswa', 'playground']]


In [7]:
# Build the bigram and trigram models
# (a higher threshold yields fewer detected phrases).
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# The trigram model is trained on the corpus after bigram merging.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers: a faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: apply the bigram model first, then the trigram model
print(trigram_mod[bigram_mod[data_words[0]]])



['there', 'this', 'guy', 'who', 'been', 'making', 'too', 'much', 'malaswa', 'dagger', 'stares', 'at', 'me', 'even', 'if', 'never', 'done', 'anything', 'wrong', 'with', 'him', 'the', 'way', 'he', 'looked', 'at', 'me', 'is', 'like', 'he', 'wants', 'to', 'either', 'beat', 'me', 'up', 'physically', 'or', 'even', 'sexually', 'he', 'too', 'old', 'to', 'act', 'so', 'irrational', 'ateneo', 'pls', 'help', 'make', 'the', 'school', 'better', 'place', 'rather_than', 'bring', 'it', 'malaswa', 'playground']


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is stringified and re-tokenized with ``simple_preprocess``
    before filtering. A token is removed when it appears in either of the
    notebook-level ``stop_words`` or ``fil_stop_words`` collections.

    Parameters
    ----------
    texts : iterable
        Documents to filter (for example lists of tokens).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # A single combined set gives constant-time membership checks instead of
    # scanning two lists for every token of every document.
    blocked = set(stop_words).union(fil_stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
    
def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each one is joined with spaces and run through
        the notebook-level spaCy pipeline ``nlp``.
    allowed_postags : list of str
        Values of ``token.pos_`` whose tokens are kept.

    Returns
    -------
    list of list of str
        The lemmas of the kept tokens, one list per input document.
    """
    def keep_lemmas(tokens):
        # Re-join the tokens so spaCy can tag them in context.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(sent) for sent in texts]

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English model with the parser and NER components
# disabled (for efficiency).
# Install with: python3 -m spacy download en_core_web_sm
# The full package name is used instead of the 'en' shortcut link because the
# shortcut is not available in spaCy 3+, while the package name loads in both.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  return f(*args, **kwds)
  return f(*args, **kwds)


In [10]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Peek at the first lemmatized post
print(data_lemmatized[:1])

[['guy', 'make', 'much', 'malaswa', 'dagger', 'stare', 'even', 'never', 'do', 'anything', 'wrong', 'way', 'look', 'want', 'beat', 'physically', 'even', 'sexually', 'old', 'act', 'irrational', 'ateneo', 'help', 'make', 'school', 'good', 'place', 'rather', 'bring', 'malaswa', 'playground']]


In [11]:
# Create Dictionary from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words list of (id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]]


In [12]:
# Build LDA model (random_state is fixed so repeated runs use the same seed)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=14,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [13]:
# Print the keywords (with their weights) of the topics
pprint(lda_model.print_topics())

# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.024*"game" + 0.021*"like" + 0.019*"big" + 0.018*"final" + 0.018*"watch" + '
  '0.017*"next" + 0.016*"ateneo" + 0.015*"basketball" + 0.013*"admin" + '
  '0.013*"reach"'),
 (1,
  '0.040*"yung" + 0.031*"see_translation" + 0.024*"university" + '
  '0.021*"jesuit" + 0.021*"instead" + 0.018*"paper" + 0.017*"email" + '
  '0.013*"putangina" + 0.013*"dlsu" + 0.012*"step"'),
 (2,
  '0.037*"shit" + 0.035*"grade" + 0.032*"miss" + 0.024*"wonder" + '
  '0.014*"notice" + 0.014*"tangina" + 0.013*"meeting" + 0.012*"fast" + '
  '0.012*"almost" + 0.011*"expect"'),
 (3,
  '0.039*"first" + 0.021*"block" + 0.020*"share" + 0.014*"join" + '
  '0.013*"pretty" + 0.013*"little" + 0.012*"especially" + 0.011*"kailangan" + '
  '0.010*"include" + 0.010*"sobrang"'),
 (4,
  '0.039*"life" + 0.030*"bad" + 0.029*"talk" + 0.024*"always" + 0.015*"time" + '
  '0.015*"wish" + 0.014*"guess" + 0.014*"call" + 0.014*"problem" + '
  '0.013*"never"'),
 (5,
  '0.033*"anyone" + 0.026*"kind" + 0.017*"damn" + 0.012*"finally"

In [14]:
# Compute Perplexity
# log_perplexity() returns gensim's per-word likelihood bound rather than the
# perplexity itself (perplexity = 2 ** -bound), so for the printed value a
# higher number (closer to zero) indicates a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (c_v) over the lemmatized texts
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.139770385795106

Coherence Score:  0.495706615090119


In [15]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
)
# Leave the prepared object as the last expression so the notebook displays it.
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Using Mallet

In [16]:
mallet_path = 'C:\\Mallet/bin/mallet' # update this path
# NOTE(review): no random seed is passed to the Mallet wrapper here, so topic
# numbering may differ between runs -- confirm before relying on topic ids.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=8,
    id2word=id2word,
)

In [17]:
# Show Topics as (topic id, [(word, weight), ...]) pairs
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score (c_v) for the Mallet model
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('feel', 0.05970772442588727),
   ('time', 0.05636743215031315),
   ('people', 0.037578288100208766),
   ('good', 0.03632567849686848),
   ('bad', 0.025052192066805846),
   ('thing', 0.021711899791231733),
   ('stop', 0.019206680584551147),
   ('friend', 0.018789144050104383),
   ('meet', 0.01837160751565762),
   ('leave', 0.017118997912317326)]),
 (1,
  [('ateneo', 0.05763567522086664),
   ('post', 0.04291123264619268),
   ('school', 0.02902818679007152),
   ('student', 0.026503996634413125),
   ('shit', 0.022297013041649136),
   ('atenean', 0.019772822885990745),
   ('hate', 0.01851072780816155),
   ('lose', 0.009676062263357174),
   ('prof', 0.009676062263357174),
   ('university', 0.009255363904080775)]),
 (2,
  [('people', 0.050084175084175085),
   ('talk', 0.03324915824915825),
   ('work', 0.029461279461279462),
   ('thing', 0.027777777777777776),
   ('fuck', 0.023989898989898988),
   ('hard', 0.023148148148148147),
   ('start', 0.022727272727272728),
   ('hope', 0.021885

In [18]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)``; the Mallet binary location is read from
    the notebook-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : Smallest number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in rather than a
        # notebook-level global, so the model and its coherence score are
        # always computed against the same vocabulary.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [19]:
# Can take a long time to run.
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [20]:
# Show graph
# limit=40; start=2; step=6;
# x = range(start, limit, step)
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values",), loc='best')
# plt.show()

In [21]:
# Print the coherence scores
# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [22]:
# Select the model and print the topics
# optimal_model = model_list[1]
optimal_model = ldamallet

# Topics as (topic id, [(word, weight), ...]) pairs
model_topics = optimal_model.show_topics(formatted=False)

# Human-readable summary with the top 10 words of each topic
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.060*"feel" + 0.056*"time" + 0.038*"people" + 0.036*"good" + 0.025*"bad" + '
  '0.022*"thing" + 0.019*"stop" + 0.019*"friend" + 0.018*"meet" + '
  '0.017*"leave"'),
 (1,
  '0.058*"ateneo" + 0.043*"post" + 0.029*"school" + 0.027*"student" + '
  '0.022*"shit" + 0.020*"atenean" + 0.019*"hate" + 0.010*"lose" + 0.010*"prof" '
  '+ 0.009*"university"'),
 (2,
  '0.050*"people" + 0.033*"talk" + 0.029*"work" + 0.028*"thing" + 0.024*"fuck" '
  '+ 0.023*"hard" + 0.023*"start" + 0.022*"hope" + 0.017*"make" + '
  '0.016*"understand"'),
 (3,
  '0.035*"class" + 0.025*"miss" + 0.018*"back" + 0.016*"sad" + 0.016*"year" + '
  '0.015*"day" + 0.015*"week" + 0.014*"sem" + 0.014*"end" + 0.013*"doo_doo"'),
 (4,
  '0.054*"make" + 0.038*"friend" + 0.033*"person" + 0.033*"people" + '
  '0.027*"love" + 0.027*"life" + 0.019*"group" + 0.015*"world" + 0.014*"part" '
  '+ 0.013*"stay"'),
 (5,
  '0.030*"game" + 0.027*"yung" + 0.023*"crush" + 0.023*"call" + '
  '0.022*"basketball" + 0.021*"watch" + 0.020*"fin

## Most dominant topic in a document

In [23]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Label every document with its dominant topic.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` and ``show_topic(topic_id)``.
    corpus : Gensim corpus
        Bag-of-words documents to score.
    texts : sequence
        Original texts, expected to be positionally aligned with ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` and ``Topic_Keywords``, followed by an unnamed
        column (label ``0``) holding ``texts``.
    """
    keywords_by_topic = {}  # topic id -> comma-separated keywords, computed once
    records = []

    # Get main topic in each document
    for doc_topics in ldamodel[corpus]:
        # A gensim LdaModel built with per_word_topics=True yields a tuple per
        # document whose first element is the (topic, probability) list.
        if getattr(ldamodel, 'per_word_topics', False):
            doc_topics = doc_topics[0]

        if not doc_topics:
            # Keep an empty placeholder row so later rows stay aligned with `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = highest probability; ties keep the first one listed.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, prop in ldamodel.show_topic(topic_num)
            )
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go instead of appending one row at a time.
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Label every post with its dominant topic under the selected model
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=optimal_model,
    corpus=corpus,
    texts=data,
)

# Format: expose the row index as a document number and rename all columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

In [24]:
# Show dominant topic in a document
# (one row per post: document number, dominant topic id, that topic's share,
# the topic keywords and the cleaned post text)
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.1566,"people, talk, work, thing, fuck, hard, start, hope, make, understand","there’s this guy who’s been making too much malaswa, dagger stares at me, even if I never done anything wrong with him. The way he looked at me is like he’s wants to either beat me up physically or even sexually. he’s too old to act so irrational. Ateneo pls help make the school a better place rather than bring it a “malaswa playground”"
1,1,0.0,0.1679,"feel, time, people, good, bad, thing, stop, friend, meet, leave",You call yourself my best friend pero you cant even give 5 mins of your time i needed someone to talk tapos tangina you said stuff pa to make me feel bad about myself Get your head out of your fucking ass and stop thinking na everyone wants to make out with u
2,2,5.0,0.1394,"game, yung, crush, call, basketball, watch, final, see_translation, wrong, uaap",#Daddy Jett 🙌
3,3,4.0,0.2154,"make, friend, person, people, love, life, group, world, part, stay","https://www.facebook.com/braveyawtz/videos/1744836578868329/ Its crazy to realize how much all the attention on the war on drugs has died down. Seeing this video was reminiscent to how drug cartels would summarily execute people off the streets. If people are dying left and right, and the notion of labeling their deaths as police self defence, or that the victims fought back, and leaving it at that, then no one is safe to this kind of cruelty. Tomorrow, the very people being left to die on the streets just might be me and you. Unless of course youre of a rich and/or influential family. Move along, I guess this shouldnt concern you."
4,4,0.0,0.2674,"feel, time, people, good, bad, thing, stop, friend, meet, leave","I saw this girl last thursday. She was hanging out with a bunch of Icans and while I was going down the stairs, I tripped. All of them laughed except for one, she helped me and even asked if I was okay. I was so flustered and just stared at her face. She had the most amazing smile and I couldnt even utter a word. It was funny because she actually got mad at her friends for laughing at me. And as she was helping me pick up my things, she giggled, not bec ause of me but because she also fell in those damn stairs in the library and she felt like shit when people just stared at her so she didnt want the same thing to happen to me. She had such a distinctive laugh, the best one I have ever heard so far. She had to leave before I could even thank her and ask for her name. All I know is that she was wearing the film org lanyard. She might be an Ican because I knew some of the people she was with were from that school. I really want to see this girl again. I have yet to thank her and I really am looking forward to personally meet someone with her kindness. If any of you know her, please tag her. Because I think I just met my soul mate."
5,5,5.0,0.2226,"game, yung, crush, call, basketball, watch, final, see_translation, wrong, uaap","ok so... since we do well in UAAP Womens volleyball, can the team members post shirtless pics on their Instagram as a service to the THOUSANDS of fans like me who watched their games thoughout this season?????????? I probably spent more than 5k for tickets this season Hello to all the hot gals in the team. pls tag them here :("
6,6,5.0,0.1394,"game, yung, crush, call, basketball, watch, final, see_translation, wrong, uaap",Only fans will know this #teamdentistjet
7,7,1.0,0.1400,"ateneo, post, school, student, shit, atenean, hate, lose, prof, university",mood
8,8,1.0,0.1940,"ateneo, post, school, student, shit, atenean, hate, lose, prof, university",u know theres something wrong w the prof when their students ask other classes for notes and resources because their own prof didnt fucking teach them anything
9,9,4.0,0.1654,"make, friend, person, people, love, life, group, world, part, stay","Im not enough for anyone. Best friend/ almost/ safe choice. That is all I have ever been and will probably ever be. What is it thats wrong with me? Why cant anyone ever take me seriously? Why is it that whenever somebody likes me, they either end up falling out of love all too fast or dont feel satisfied enough with me to take things to the next level? Hanggang dito nalang ba talaga ako? Is it that Im not an attractive enough person to take seriously?"


## Topic Interpretation

In [39]:
# Collect the top keywords of every topic in the selected model.
# The topic count is read from the model instead of probing show_topic()
# until it raises, which would need a catch-all `except` that also hides
# genuine errors.
num_topics = optimal_model.num_topics
curr_topics = [
    [entry[0] for entry in optimal_model.show_topic(topic_id)]
    for topic_id in range(num_topics)
]
topic_df = pd.DataFrame()
topic_df['Topic Keywords'] = curr_topics

# Hand-assigned labels, keyed by topic id.
# NOTE(review): these labels describe the topics of one particular training
# run -- re-check them against the keywords whenever the model is retrained.
topic_labels = {
    0: "Social Relations",
    1: "University",
    2: "?",
    3: "Academics",
    4: "Social Relations",
    5: "Sports",
    6: "Love and Relationships",
    7: "?",
}
# Topics without a label fall back to "?" so the column length always
# matches the number of topics in the model.
interpretations = [topic_labels.get(topic_id, "?") for topic_id in range(num_topics)]

topic_df['Interpretation'] = interpretations
topic_df

Unnamed: 0,Topic Keywords,Interpretation
0,"[feel, time, people, good, bad, thing, stop, friend, meet, leave]",Social Relations
1,"[ateneo, post, school, student, shit, atenean, hate, lose, prof, university]",University
2,"[people, talk, work, thing, fuck, hard, start, hope, make, understand]",?
3,"[class, miss, back, sad, year, day, week, sem, end, doo_doo]",Academics
4,"[make, friend, person, people, love, life, group, world, part, stay]",Social Relations
5,"[game, yung, crush, call, basketball, watch, final, see_translation, wrong, uaap]",Sports
6,"[guy, girl, love, admu, play, study, cute, wanna, hear, country]",Love and Relationships
7,"[good, give, car, big, change, high, date, improve, find, end]",?


## Most dominant document in a topic

In [26]:
# Keep the single most representative document (highest Perc_Contribution)
# for each dominant topic
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the top row of every group first and concatenate once, instead of
# growing a DataFrame inside the loop.
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
)

# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.5491,"feel, time, people, good, bad, thing, stop, friend, meet, leave","You know Im honestly so frustrated about Grab Coco. I miss Regis Coco before it was just overflowing with Grab Drivers. Of course nothing is going to change because people can have Coco wherever they are, and Coco is earning so much more because the number of customers has skyrocketed. But Ive been going to Regis Coco because I love the place and I love hanging out with friends over there. It was a nice place to relax and have a good time. The ambiance was lively but not exhausting. It was homely but not boring. I have had wonderful memories in every seat, and have had laughs with some of the employees even. But the Grab drivers just ruin it. The line is so long, I could stand in line for 30 minutes. Even when I do get to order, I could wait an hour in my seat before I get my drink. Why? Because there are 10 Grab drivers, each ordering multiple drinks. Coco wasnt always like this. Of course there was usually a line, and of course there was always a wait time. But in the past few months, the lines have gotten unbearably long, and the waiting time has become ridiculous. Customers who still go there in-person suffer. There are chairs and table there because its also a sit-down place; part of Coco is the store, not just the drink. Its not just a hole in the wall that pumps out drinks to people. Im letting out my frustrations, but I know nothings gonna change. In fact, it could get even worse. Cocos earning, Grab drivers will have a consistent source of income, and people are comfortable having the Coco come to them. But for someone who has loved Coco for years, with part of that experience coming from sitting in the store and enjoying a few hours IN the physical store of Coco, I miss those days. I wish those days would come back, but I should know better. The old Coco is dead."
1,1.0,0.4751,"ateneo, post, school, student, shit, atenean, hate, lose, prof, university","Note: 99% of Ateneans dont agree with the ""many UP students being in Regis are annoying"" post. But Im not here for that, this is a university rank discussion post. The tweet that became popular said that Ateneo was just a 2nd rate University compared to U.P. https://mobile.twitter.com/piodu…/status/1044147879479111680 I know its a joke tweet but it reminded me of that age old discussion of University rankings. In the end, the general rank of your University is a far less important than the rank of your specific course. A minority of U.P students like to brag towards Ateneans, likewise a minority of Ateneans brag against DLSU and so on...but the university rankings are meaningless when you check the board exam passing rate of your specific course. The top 3 medical schools in the country arent even from the big 4. So course rank is a lot more important than school rank. Ofcourse, dont forget about mental health though."
2,2.0,0.6466,"people, talk, work, thing, fuck, hard, start, hope, make, understand","Dont just go to guidance, sanggu or ADSA right away. Talk to a lawyer first. This is unsolicited advice to seek legal advice to rape victims like in The guidance, sanggu or ADSA may or may not represent your interests, but your lawyer would. The guidance, sanggu or ADSA might (we wont really know for sure until AFTER THE FACT) be more interested in bringing the rapist to justice than giving you justice. You might think these two concepts ""bringing the perpetrator to justice"" and ""giving the victim justice"" are identical. If they are, then you can ignore this post. See 13 Reasons Why or any legal drama where rape victims are cross-examined furiously by defense attorneys. (For non-fictional examples, I wouldnt know of any that are relevant or relatable to non-lawyers. Lets hope 13 Reasons Why or those legal dramas are as realistic as they need to be here for this purpose) Prosecuting rapists is hard because our society values of privacy over security and allows defense attorneys to embarrass rape victims in court. Is Kevin Spacey or Charlie Rose in jail? Oh, did you forget that you might have to testify especially that you might not have a rape kit? This is not meant to show you there is no hope. This is meant to show you there might be some dangers without first seeking legal advice. Let me emphasize that this is not legal advice. This might not even be (good) advice on seeking legal advice. This is advice that you should seek legal advice!"
3,3.0,0.5383,"class, miss, back, sad, year, day, week, sem, end, doo_doo","Johnny shark~ doo doo doo doo doo doo Johnny shark~ doo doo doo doo doo doo Johnny shark~doo doo doo doo doo doo Yes, Papa. Telling lies~ doo doo doo doo doo doo Telling lies~ doo doo doo doo doo doo Telling lies~ doo doo doo doo doo doo No, Papa. Open mouth~ doo doo doo doo doo doo Open mouth~ doo doo doo doo doo doo Open mouth~ doo doo doo doo doo doo Hahaha."
4,4.0,0.5108,"make, friend, person, people, love, life, group, world, part, stay","This is the only place I can post this kasi Im a bit of a closet kpop fan. Anyways, I really like Twices song Yes or Yes. PERO!!! Naiinis ako sa chord progression ng verse. This is the original (taken from ultimate-guitar) E Emaj7 Naega ireokedo igijeogieotdeonga E7 E6 A Mwonga ireoke gatgo sipdeon jeok isseonna, isseonna Dsus2 E Da nolla, da nolla, nae ppeonppeonhame E C D (Come on and tell me yes) Now in general ok naman siya. I really love the step-down chord progression na medyo nagiging uso narin now (which iirc is actually a pretty old chord progression, like 70s or 80s nandun na siya, medyo 2000s lang sumikat ung I-IV-V chord progression). What really annoys me, is the Dsus2 part. Like, youre already going down the note na. From E-> Emaj7 -> E7 -> E6 -> A. Tapos babalik ka agad pataas with a Dsus2 e hindi ka pa nga nakakarating sa pupuntahan mo??? >_> Leave it at the end going back to E para mas fulfilling and sulit ung trip down. Guys try niyo. Instead of Dsus2, try a simple Am chord after A. It sounds at least 67.3% better than using Dsus2, without necessarily making the song a bit darker. If any Twice fans here can play the guitar or piano, let me know what you think hehe E Emaj7 Naega ireokedo igijeogieotdeonga E7 E6 A Mwonga ireoke gatgo sipdeon jeok isseonna, isseonna Am E Da nolla, da nolla, nae ppeonppeonhame E C D (Come on and tell me yes) P.S. I also love the E-> C-> D"
5,5.0,0.6454,"game, yung, crush, call, basketball, watch, final, see_translation, wrong, uaap","gusto ko lang sana sagutin yung taong to kung sino ka man 1. Para sa akin kasi requirement pa rin dapat ang Filipino. Ito ay dahil kahit na paulit-ulit na nating pinag-aaralan yan, kailangan pa rin nating isanay ang ating mga sarili sa paggamit nito nang tama na tipong magsisilbi siyang paraan para mas maintindihan ka ng mga ibang tao lalo na sa mga taong wala namang background sa English. 2. Para sa akin kasi kailangan mo ipractice nang ipractice ang See Translation Filipino kasi kung matagal mo na siyang hindi nagagamit sa tingin mo ba yung galing mo sa pagsalita ng Filipino noon parehas pa rin ba siya pagkatapos ng matagal na panahon ng hindi mo paggamit nito? Parang math lang yan, hindi porket napag-aralan mo na siya noon, maalala mo na agad kung paano mo isolve ang isang problem after ng ilang yrs na hindi mo paggamit nito. [Meron din akong kakilala na ilang years nasa ibang bansa pero hindi rin siya buong buhay niya doon, dahil nasa ibang bansa siya mas kinailangan niyang gamitin ang English kaysa sa Filipino. Siya na rin mismo nagsabi na nakalimutan na rin niya paano yung mga balarila sa Filipino. (May ganun talagang case believe it or not meng)] 3. Para kanino ba tayo? (Medyo pareho lang punto ko dito at sa 1) Hindi ba ito nakakalungkot? Para sa akin, kaya natin ito ginagawa kasi para makapagsilbi tayo sa mga taong wala namang oportunidad. Hindi ba kaya may wika kasi para mas magkaroon ng maayos na komunikasyon sa mga tao? Kung hindi naman mapapalalim ang kaalaman natin tungkol sa wika natin, paano tayo magiging matagumpay sa pagtulong ng mga taong nasa mga laylayan? Alam mo ba na may mga batas nga na para sa mga kasambahay, magsasaka, atbp. pero halos lahat naman ng mga taong kabilang sa dito, wala naman gaanong alam tungkol dito kasi naka-English yung batas natin. Hindi pa nila maintindihan. 
Yung layunin na makapagpaalam sa mga taong makikinabang sana sa mga ito, wala na agad dahil doon. So ayun, hindi ko alam kung napakita ko ba yung kahalagahan ng pag-aral nito kahit sa kolehiyo haha sana makuha kahit konti lang hindi ako magaling makipagdebate tungkol sa isyu na ito nakakatrigger ka lang (sinubukan ko rin maging purong Filipino yung sasabihin ko dito, pero kita mo naman di ba haha sana marealize mo yun hehe peace!)"
6,6.0,0.478,"guy, girl, love, admu, play, study, cute, wanna, hear, country","Why do people get mad at the fact that basketball is the most popular sport in the country? They mention how we should just focus on sports like billiards and football or some other height appropriate game. They say that basketball programs should be defunded and that we should literally just play for the medals. Heres the reality. You arent going to convince any young basketball player to play football by defunding basketball programs. These people pl ay basketball because its an entertaining and intense sport that they enjoy playing, of course they want to win but do you really think that forcing young players to play football instead is going to produce the next Messi? No. People play basketball because its what they want, its what entertains them. Watching LeBron drop 51, or Shaq destroying the rim after dunking hard, do you really think you can convince a young kid mesmerized by these videos to stop playing basketball and play tennis, a sport he does not even love? These kids arent playing for the gold medals, they play because they want it. This country wont produce the next Jordan any time soon, or the next Wade, hell it wouldnt even produce just an Isaiah Thomas tier player, but Id rather be in country that encourages people to enjoy what they want and play what they want rather than forcing kids to play a sport they dont give a shit about, just for some medals that other Filipinos brag about. Promote football more, promote tennis more, promote all these lesser popular sports in our country, but remember that bringing down basketball wont turn us into a FIFA competitor to Brazil, you cant convince a person who loves Kyrie Irvings insane ball handling skills to love football just because his countrymen want him to earn a gold medal. A gold medal which he doesnt give a shit about, a medal only there so that Filipinos can brag about being Filipinos."
7,7.0,0.8217,"good, give, car, big, change, high, date, improve, find, end","The Nissan Skyline GT-R R34 Z-tune is regarded as one of NISMOs finest creations to date! As a tribute to the rich history of the Skyline GT-R (which ended in 2002), NISMO purchased 18 used R34 GT-R V-specs, all of which were to be reworked by the company into one of its most impressive creations to date. NISMO extensively rebuilt the cars stock RB26 engine, by replacing virtually all of the engines original components with variants used by NISMO for their race cars. Special race-car engine treatment like balancing and boring and stroking were done to the RB, increasing its displacement to 2.8L. Even the turbos werent spared, as the ones used by the normal V-spec were replaced with IHI turbos that were set to 1.5 boost. With the engine being complimented by the replaced intercooler and exhaust system, the Z-tune is capable of producing 510hp! The rest of the car was also given a meticulous makeover by NISMO. The cars ability to handle the horses without slip was improved by the utilization of both a twin-plate clutch and an LSD. Parts that were located towards the front of the car such as the front bumper, strut bar, and hood were replaced with carbon fiber variants so as to reduce the weight of the front half of the car. A lot of these CF parts were even spot welded by NISMO to improve the rigidity of the chassis! To lessen body-roll and improve handling, NISMO also replaced the stock V-spec springs with stiffer adjustable ones and adjusted them, giving the car a stiffer suspension, not too good for the streets, but perfect for the track. Special Brembo brakes ensured that the Z-tune had better braking performance, while downforce was improved by the addition of a front splitter and a change of rear spoiler, increasing both front and rear downforce respectively. 
All of these improvements are not all for show though: The Z-tune was able to set an insane Tsukuba lap time of 1:01, making it the 4th fastest road-legal production car on the track! Nissan originally planned to create 19 examples of this beast, but they ended up producing only 18 Ztunes. This low number of production,combined with this car being a good showcase of NISMOs tuning capabilities, drives the prices of the R34 Z-tune extremely high. From its original selling price of only 17 million yen when new (around $100k), the current selling price of these cars are now in excess of $500k (this particular example selling for $510k!) Sadly, this very high value makes it a collectors item today, which is why youre more likely to find the car now in some rich guys climate-controlled garage instead of the tracks where this beast truly belongs."
