In [7]:
from nltk.tokenize import word_tokenize
import json
import pprint
import nltk

import re
from nltk.corpus import stopwords
import string
from collections import Counter

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim


def remove_non_utf8(text):
    """Strip line breaks and blank out every non-ASCII character.

    Newline and carriage-return characters are deleted outright (they are
    NOT replaced by a space, so words on adjacent lines get glued together).
    Every remaining character whose code point is 128 or higher is replaced
    by a single space. Despite the function name, the test is ASCII-based.
    """
    single_line = text.replace('\n', '').replace('\r', '')
    cleaned_chars = []
    for ch in single_line:
        cleaned_chars.append(ch if ord(ch) < 128 else ' ')
    return ''.join(cleaned_chars)


# Every ASCII punctuation character, one list entry per character.
punctuation = list(string.punctuation)

# Hand-picked words treated as noise on top of the NLTK English stop words.
other_list = [
    'update', 'create', 'rule', 'hey', 'hello', 'build', 'everything',
    'change', 'system', 'idea', 'product', 'feel', 'grow', 'forget',
    'follow', 'follower', 'twitter', 'number', 'know', 'people', 'try',
    'get', 'kill', 'listen', 'way', 'need', 'think', 'thank', 'thanks',
    'look', 'everyone', 'address', 'place', 'man', 'boy', 'good', 'great',
    'new', 'catch', 'account', 'die', 'help', 'thing', 'general', 'work',
    'job', 'shake', 'faster', 'quicker', 'see', 'watch',
]

# Spelled-out numbers one through ten.
number_list = [
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
]

# Master stop list, built piece by piece in the same order as before.
# It stays a plain list because later code concatenates more lists onto it.
stop = list(other_list)
stop += stopwords.words('english')
stop += punctuation
stop += ['rt', 'via']
stop += ['west', 'east', 'south', 'north']
stop += number_list


# print(punctuation)
# print(stop)

def extract_entity_names(t):
    """Collect the text of every 'NE' subtree found under ``t``.

    ``t`` is expected to be a tree-like node: something exposing a callable
    ``label`` and iterable over its children. A node labelled ``'NE'`` yields
    one string made of the first element of each child joined by spaces, and
    is not descended into any further. Any other labelled node is searched
    recursively. Objects without a ``label`` attribute (e.g. plain
    ``(word, tag)`` leaves) contribute nothing.

    Returns a list of entity strings in traversal order.
    """
    # Leaves (and anything else that is not a labelled node) carry no names.
    if not hasattr(t, 'label') or not t.label:
        return []

    if t.label() == 'NE':
        return [' '.join(leaf[0] for leaf in t)]

    collected = []
    for subtree in t:
        collected += extract_entity_names(subtree)
    return collected


def getNameEntity(sample):
    """Return the set of named-entity strings NLTK finds in ``sample``.

    The text is split into sentences, each sentence is word-tokenised and
    POS-tagged, and the tagged sentences are chunked with NLTK's binary
    named-entity chunker (entities are labelled just 'NE'). The entity
    strings of every sentence are gathered with ``extract_entity_names``
    and de-duplicated.
    """
    tagged_sentences = [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(sample)
    ]

    unique_names = set()
    for sentence_tree in nltk.ne_chunk_sents(tagged_sentences, binary=True):
        unique_names.update(extract_entity_names(sentence_tree))
    return unique_names


def process_text(sample):
    """Strip Twitter noise from one tweet and return the cleaned string.

    Removed, in a single regex pass:
      * @-mentions (letters, digits and underscores after the ``@``),
      * every character that is not an ASCII letter, digit, space or tab
        (punctuation, emoji, ...),
      * ``scheme://...`` URLs up to the next whitespace,
      * a leading ``RT`` marker,
      * a bare ``http`` followed by one character.

    Args:
        sample: The raw tweet text.

    Returns:
        The cleaned text with leading/trailing whitespace stripped. Inner
        whitespace is left as-is, so removed tokens can leave double spaces.
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\w, \/, \S), which newer Python versions warn about.
    # The mention class includes "_" so handles such as @foo_bar are removed
    # whole instead of leaving "bar" behind.
    regex_remove = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^RT|http.+?"
    return re.sub(regex_remove, '', sample).strip()


# Verbose-mode regex fragment for a western-style emoticon: eyes, an optional
# nose and a mouth. The inline "#" notes are regex comments and only work when
# the pattern is compiled with re.VERBOSE.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Token alternatives, tried in this order when joined with "|": the first
# alternative that matches at a position wins, so specific patterns
# (emoticons, mentions, hash-tags, URLs) come before the generic word and
# single-character fallbacks.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    # The character class used to contain "&amp;" (an HTML-escaping artefact
    # of the intended "&"); it is restored here. The matched language is the
    # same because a, m, p and ";" were already covered by other parts.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

# Literal emoticon strings that get appended to the stop list.
# NOTE(review): many entries (e.g. the ":‑D" family) appear to use a non-ASCII
# hyphen rather than "-". Text passed through remove_non_utf8 has non-ASCII
# characters replaced by spaces, so those entries presumably never match
# tokens on that path - confirm whether ASCII "-" variants were intended.
emoticon_list = [':-)', ':)', ':-]', ':]', ':-3', ':3', ':->',
                 ':>', '8-)', '8)', ':-}', ':}', ':o)', ':c)', ':^)', '=]', '=)',
                 ':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D',
                 ':-))', ':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[', ':[', ':-||',
                 '>:[', ':{', ':@', '>:(', ':\'‑(', ':\'(', ':\'‑)', ':\')', 'D‑\':',
                 'D:<', 'D:', 'D8', 'D;', 'D=', 'DX', ':‑O', ':O', ':‑o', ':o', ':-0',
                 '8‑0', '>:O', ':-*', ':*', ':×', ';‑)', ';)', '*-)', '*)', ';‑]', ';]',
                 ';^)', ':‑,', ';D', ':‑/', ':/', ':‑.', '>:\\', '>:/', ':\\',
                 '=/', '=\\', ':L', '=L', ':S', ':‑P', ':P', 'X‑P', 'XP', 'x‑p',
                 'xp', ':‑p', ':p', ':‑Þ', ':Þ', ':‑þ', ':þ', ':‑b', ':b', 'd:', '=p',
                 '>:P', ':‑|', ':|', ':$', ':‑X', ':X', ':‑#', ':#', ':‑&', ':&']
# English negative contractions ("n't" forms), all lowercase; also appended to
# the stop list.
negative_contraction_list = ['aren\'t', 'can\'t', 'couldn\'t', 'daren\'t', 'didn\'t', 'doesn\'t', 'don\'t', 'hasn\'t',
                             'haven\'t', 'hadn\'t', 'isn\'t', 'mayn\'t', 'mightn\'t', 'mustn\'t', 'needn\'t',
                             'oughtn\'t', 'shan\'t', 'shouldn\'t', 'wasn\'t', 'weren\'t', 'won\'t', 'wouldn\'t']


def stemming(s):
    """Return the WordNet lemma of ``s`` after lowercasing it.

    Note that, despite the name, this lemmatises (WordNetLemmatizer with its
    default part of speech) rather than applying a stemmer.
    """
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    lowered = s.lower()
    return lemmatizer.lemmatize(lowered)


# Month names (plus the stray filler word 'lot', kept as in the original data).
month_list = [
    'lot',
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Time-related vocabulary: months, frequency adverbs, weekdays, then general
# time words - concatenated in that order.
time_list = (
    month_list
    + ['always', 'usually', 'often']
    + ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    + [
        'before', 'after', 'during', 'morning', 'afternoon', 'midday',
        'noon', 'night', 'today', 'evening', 'yesterday', 'tomorrow', 'day',
        'sunset', 'sunrise', 'future', 'past', 'present', 'then', 'now',
        'when', 'early', 'late', 'soon', 'days', 'weeks', 'months', 'years',
        'midnight', 'time', 'clock', 'hours', 'minutes', 'seconds',
        'o\'clock', 'a.m.', 'p.m.', 'centuries', 'decades', 'seasons',
    ]
)

# Normalise every time word (lowercase + lemmatise), updating the list in place.
time_list[:] = [stemming(word) for word in time_list]

# Extend the master stop list with the time words, emoticons and contractions.
stop = stop + (time_list + emoticon_list + negative_contraction_list)
# print("time_list:",time_list)

# Tokenizer: all alternatives from regex_str OR-ed together inside a single
# capturing group, so findall() returns plain strings.
tokens_re = re.compile('(%s)' % '|'.join(regex_str), re.IGNORECASE | re.VERBOSE)
# Anchored emoticon pattern, used to test whether a whole token is an emoticon.
emoticon_re = re.compile('^%s$' % emoticons_str, re.IGNORECASE | re.VERBOSE)


def tokenize(s):
    """Return every token of ``s`` matched by the module-level ``tokens_re``."""
    matched_tokens = tokens_re.findall(s)
    return matched_tokens


def preprocess(s, lowercase=False):
    """Tokenise ``s`` and optionally lowercase the tokens.

    When ``lowercase`` is true, every token is lowercased except those that
    match ``emoticon_re`` (so emoticons such as ``:D`` keep their case).
    Returns the list of tokens.
    """
    raw_tokens = tokenize(s)
    if not lowercase:
        return raw_tokens
    return [tok if emoticon_re.search(tok) else tok.lower() for tok in raw_tokens]


def is_noun(pos):
    """Return True when the POS tag ``pos`` starts with 'NN'.

    Tags such as 'NN', 'NNS', 'NNP' and 'NNPS' therefore all count as nouns.
    """
    return pos.startswith('NN')


def printAll(data, user_ids=None, num_topics=3, num_words=3, passes=20):
    """Fit one LDA topic model per user, print its topics and return the pieces.

    For every requested user, each of the user's texts is cleaned with
    ``remove_non_utf8``, tokenised with ``preprocess`` and reduced to
    lemmatised nouns that are in the NLTK English word list and not in the
    module-level ``stop`` list. Each text that still has at least one such
    noun becomes one document. Users with more than one document get a
    gensim LDA model; the others only produce a "Too few" message.

    Args:
        data: Mapping of user id (as a string) to a mapping whose values are
            that user's raw texts (presumably tweet id -> tweet text; verify
            against the code that writes the JSON file).
        user_ids: Iterable of user ids to process; each is converted with
            ``str()`` before the lookup. Defaults to the ten ids that were
            previously hard-coded.
        num_topics: Number of LDA topics per user (default 3).
        num_words: Number of top words printed per topic (default 3).
        passes: Number of LDA training passes (default 20).

    Returns:
        dict mapping user id string to a dict with the keys ``'ldamodel'``,
        ``'dictionary'`` and ``'corpus'`` - the three objects pyLDAvis needs.
        Users with too few documents are absent from the result.

    Raises:
        KeyError: If a requested user id is not present in ``data``.
    """
    if user_ids is None:
        user_ids = [100057597, 100595096, 1012917350, 1014120596, 1014916676,
                    1017078158, 19499747, 328607423, 34465303, 358464905]

    # Built once per call: set lookups replace repeated scans of the (long)
    # stop list, and a single lemmatizer is shared by all texts.
    english = set(w.lower() for w in nltk.corpus.words.words())
    stop_words = set(stop)
    lemma = nltk.wordnet.WordNetLemmatizer()

    graph_data_dict = {}
    for user_id in user_ids:
        user_id = str(user_id)
        documents = []
        for text in data[user_id].values():
            # Drop stop words, URLs, mentions and hash-tags before tagging.
            terms = [term for term in preprocess(remove_non_utf8(text))
                     if term.lower() not in stop_words
                     and not term.startswith(('http', '@', '#'))]
            # Keep nouns only: at least 3 characters (or exactly 'uk') and
            # no apostrophe; then lowercase and lemmatise.
            nouns = [lemma.lemmatize(word.lower())
                     for word, pos in nltk.pos_tag(terms)
                     if is_noun(pos)
                     and (len(word) >= 3 or word.lower() == 'uk')
                     and '\'' not in word]
            # Lemmatising can produce a stop word or a non-dictionary form,
            # so the lemmas are filtered once more.
            known_nouns = [noun for noun in nouns
                           if noun in english and noun not in stop_words]
            if known_nouns:
                documents.append(known_nouns)

        if len(documents) <= 1:
            print("Too few")
            continue

        dictionary = corpora.Dictionary(documents)
        corpus = [dictionary.doc2bow(document) for document in documents]
        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
        print(ldamodel.print_topics(num_topics=num_topics, num_words=num_words))
        for topic_id in range(num_topics):
            print(ldamodel.show_topic(topic_id, num_words))

        graph_data_dict[user_id] = {
            'ldamodel': ldamodel,
            'dictionary': dictionary,
            'corpus': corpus,
        }

    return graph_data_dict



# with open('twitter_text.json') as data_file:
#     data = json.load(data_file)

# Read the per-user texts. The encoding is given explicitly because open()
# otherwise uses the platform default, which can mis-decode non-ASCII text
# in a UTF-8 JSON file.
with open('ofilename.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

# with open('topicLDA.json') as data_file:
#     LDAdata = json.load(data_file)

# findMissing(LDAdata)
# Fit the per-user LDA models; the result feeds the pyLDAvis cells.
graph_data_dict = printAll(data)
# printItem(str(100057597),data)


[(0, '0.022*"barcelona" + 0.020*"con" + 0.014*"sin"'), (1, '0.149*"photo" + 0.025*"station" + 0.021*"para"'), (2, '0.030*"con" + 0.029*"del" + 0.022*"la"')]
[('barcelona', 0.021641709680671174), ('con', 0.019732547671360161), ('sin', 0.013804652009778871)]
[('photo', 0.14887558810166077), ('station', 0.02486530982409588), ('para', 0.021373728209845705)]
[('con', 0.02967604353218881), ('del', 0.028957525514197967), ('la', 0.021610301687787957)]
[(0, '0.017*"life" + 0.013*"girl" + 0.011*"love"'), (1, '0.012*"heart" + 0.010*"cat" + 0.010*"tube"'), (2, '0.028*"love" + 0.012*"song" + 0.010*"life"')]
[('life', 0.016525466959952394), ('girl', 0.013269450913841834), ('love', 0.010717583808285221)]
[('heart', 0.012050047966419184), ('cat', 0.010076315664616791), ('tube', 0.0099901531174252274)]
[('love', 0.027800200073418904), ('song', 0.012397160420291152), ('life', 0.010171050741428407)]
[(0, '0.040*"enjoy" + 0.024*"ana" + 0.017*"wallah"'), (1, '0.261*"birthday" + 0.099*"love" + 0.032*"beauti

In [11]:
# Prepare the pyLDAvis visualisation data for every user that has a model.
user_graph_data_dict = {}
for userid, fitted in graph_data_dict.items():
    print(userid)
    user_graph_data_dict[userid] = pyLDAvis.gensim.prepare(
        fitted['ldamodel'],
        fitted['corpus'],
        fitted['dictionary'],
    )

328607423
1012917350
19499747
1014120596
34465303
100057597
100595096
1017078158
1014916676


In [13]:
# Show the prepared pyLDAvis data for one user. The user id is hard-coded and
# must be a key of user_graph_data_dict. The call is left as a bare final
# expression so that a notebook renders its return value.
pyLDAvis.display(user_graph_data_dict['1014916676'])