# Importing parsed Instagram Post Data for analysis

In [1]:
import pandas as pd
import sys
sys.path.append('../')
from app.src.data_cleaning.emoji_cleaner import remove_emojis
from app.src.data_cleaning.custom_text_preprocessor import preprocess

# Load the processed Instagram post export and pull out the comment-text column.
comment_text_column = 'edge_media_to_parent_comment__edges__node__text'
data = pd.read_csv('../data/processed/insta_post_1.csv')

# Single-column DataFrame first, then the underlying Series of comment strings.
comments_data = data.loc[:, [comment_text_column]]
comments_document = comments_data[comment_text_column]

# One document per comment. An earlier version concatenated every comment
# into a single large text blob instead.
total_docs = comments_document

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/robertanizoba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Input Data Cleaning

In [2]:
# Input cleaning pipeline:
#   1. drop missing comments,
#   2. run each remaining comment through remove_emojis,
#   3. discard comments that come back empty/falsy,
#   4. run preprocess on what is left (presumably yielding a token list per
#      comment, since the result is later fed to corpora.Dictionary - confirm).
nan_string_value = 'nan'

# pd.notna rejects None / pd.NA as well as float NaN, which the string
# comparison alone would let through to remove_emojis. The string comparison
# is kept so anything whose text form is 'nan' is still excluded as before.
non_null_comment_docs = [
    doc for doc in total_docs
    if pd.notna(doc) and str(doc) != nan_string_value
]

# Materialised as a list rather than a lazy filter object: a filter iterator
# is exhausted after one pass, so inspecting or re-using it in a later cell
# would silently yield nothing.
non_empty_docs = [
    cleaned for cleaned in map(remove_emojis, non_null_comment_docs) if cleaned
]

processed_docs = [preprocess(doc) for doc in non_empty_docs]

# Data Modelling

In [4]:
from gensim import corpora, models

# Seed for the LDA model so that topic assignments can be reproduced on a
# fresh Restart & Run All instead of changing on every execution.
# NOTE(review): with workers > 1 gensim may still show small run-to-run
# differences; set workers=1 if exact reproducibility is required - confirm.
LDA_RANDOM_SEED = 42

# Build the vocabulary from the cleaned comments.
dictionary = corpora.Dictionary(processed_docs)

# Prune the vocabulary:
#   no_below=1  -> no minimum document-frequency cutoff,
#   no_above=0.5 -> drop tokens that occur in more than half of the comments,
#   keep_n=30   -> keep only the 30 most frequent remaining tokens.
dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=30)

# Bag-of-words representation of every comment.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

#LDA - Bag-Of-Words
lda_model = models.LdaMulticore(
    bow_corpus,
    num_topics=2,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=LDA_RANDOM_SEED,
)

# Score a document the model has not seen, using the same preprocessing and
# vocabulary as the training comments.
unseen_document = '2pac is in the house, all hands on deck'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Print topics from most to least probable, with the top 3 words of each.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 3)))


Score: 0.6605111360549927	 
Topic: 0.077*"2pac" + 0.057*"tupac" + 0.056*"shit"

Score: 0.3394888937473297	 
Topic: 0.052*"2pac" + 0.044*"miss" + 0.044*"brotha"
