In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
import json


Slow version of gensim.models.doc2vec is being used


In [2]:
data = pd.read_csv("stumbleupon.tsv", sep='\t')
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data['recipe'] = data['title'].str.contains('recipe')

cv = CountVectorizer(binary=False, #calc frequency rather than binary 
                     stop_words='english', #ignore english words like or, if etc
                     min_df=3) #only include terms which appear 3 or more times in the document.

docs = cv.fit_transform(data.body.dropna())
# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names())) # mapping an ID to each term

In [9]:
data.head(1)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body,recipe
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,False


In [3]:
# First we convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)

# Then we fit an LDA model - this is the algorithm which calculates the probability of a term belonging to a topic

lda_model = LdaModel(corpus=corpus, id2word=id2word, 
                     num_topics=50) # stop at 50 topics.


In [12]:
num_topics = 25
num_words_per_topic = 5
for ti, topic in enumerate(lda_model.show_topics(num_topics = 10, num_words = 5)):
    print("Topic: %d" % (ti))
    print(topic)
    print()


Topic: 0
(24, '0.009*"chocolate" + 0.008*"truffle" + 0.007*"truffles" + 0.007*"news" + 0.007*"gifts"')

Topic: 1
(26, '0.034*"flashvars" + 0.015*"dough" + 0.009*"minutes" + 0.007*"cup" + 0.007*"roll"')

Topic: 2
(35, '0.005*"best" + 0.004*"just" + 0.004*"time" + 0.004*"like" + 0.004*"muscle"')

Topic: 3
(32, '0.011*"cancer" + 0.009*"health" + 0.007*"news" + 0.005*"people" + 0.005*"body"')

Topic: 4
(47, '0.022*"la" + 0.022*"el" + 0.018*"en" + 0.014*"que" + 0.010*"pretzel"')

Topic: 5
(45, '0.020*"hosting" + 0.016*"moore" + 0.010*"hacks" + 0.009*"fuck" + 0.008*"make"')

Topic: 6
(42, '0.024*"com" + 0.023*"div" + 0.018*"http" + 0.015*"online" + 0.014*"news"')

Topic: 7
(38, '0.020*"skin" + 0.015*"scarf" + 0.007*"make" + 0.006*"tie" + 0.006*"neck"')

Topic: 8
(37, '0.016*"fillet" + 0.008*"pom" + 0.008*"scallops" + 0.006*"tested" + 0.006*"worm"')

Topic: 9
(21, '0.017*"recipe" + 0.015*"lemon" + 0.012*"lime" + 0.011*"biscuits" + 0.011*"lobster"')



Observation:  
25 topics returned, each contains 5 words and the probability that the word belongs to the topic, if the document contains all words then it'll return the maximum probability.

In [5]:
l = lda_model.get_document_topics(corpus)

In [6]:
i = 0
for doc in l:
    i += 1
    print(doc)
    if i > 10:
        break

[(3, 0.01320877271084806), (4, 0.011733247253052024), (12, 0.053785225931479434), (19, 0.048785143579318083), (23, 0.49947552670229611), (28, 0.010046653216152541), (32, 0.032617863119320321), (39, 0.21184244281229925), (40, 0.087458833867873759), (44, 0.024867294634880865)]
[(23, 0.14167976619333555), (40, 0.85352604488550854)]
[(11, 0.014383135928050831), (23, 0.056518327709290092), (32, 0.92181171465723133)]
[(4, 0.16504586198648769), (8, 0.063125473366467927), (12, 0.53377635626238851), (16, 0.012443193335865717), (19, 0.12875649611365714), (32, 0.081927711396042713), (40, 0.011457165603605124)]
[(3, 0.037318444488585664), (12, 0.64417406994725002), (18, 0.017937343768031834), (25, 0.092874381653808014), (29, 0.05595883553816209), (40, 0.12951790052519141)]
[(4, 0.18472032165310304), (9, 0.16469202817766285), (23, 0.11009861261283749), (25, 0.031588662597556952), (32, 0.31566749284178713), (40, 0.18850169932135302)]
[(0, 0.027777156045999344), (3, 0.22226450713645929), (9, 0.012103

This tells us that 3 (which relates to a topic) 

In [13]:
from gensim.models.word2vec import Word2Vec

# Setup the body text
text = data.body.dropna().map(lambda x: x.split())
#from gensim.models import Word2Vec
model = Word2Vec(text, size=100, #number of dimensions/ words
                 window=5, # how many words either side to gather the context 
                 min_count=5, 
                 workers=4)
model.most_similar(positive=['cookie', 'brownie']) ## this will return the similarity between the document and these two words



KeyboardInterrupt: 

In [None]:
model.most_similar(# most similar returns default top 10 
    positive=['man']) # positively correlated 

In [None]:
model.most_similar_cosmul(positive=['man'])

In [None]:
model.most_similar(negative=['man']) # return negatively correlated words.

In [None]:
model.wv['man']# returns the vector nb not easily interpreted.

In [None]:
model.wv.similarity('woman', 'man') # find the similarity between two words, this returns the angle between two words.

## solo practice using twitter

In [19]:
tweets = [tweet for tweet in open ('captured-tweets.txt','r')]

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1311: character maps to <undefined>

In [18]:
## I couldn't get this to work as I haven't managed to import C++ which is a prerequisite for this.
import spacy
from spacy.en import English

ImportError: DLL load failed: The specified procedure could not be found.

In [None]:
# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English

# Gensim is used for LDA and word2vec
from gensim.models.word2vec import Word2Vec

# Write a function that can take a take a sentence parsed by `spacy` and 
# identify if it mentions a company named 'Google'. 
# Remember, `spacy` can find entities and codes them as `ORG` if they are a company.


# Write a function that can take a sentence parsed by `spacy` 
# and return the verbs of the sentence (preferably lemmatized)

# Write a function that identifies countries - HINT: the entity label for 
# countries is GPE (or GeoPolitical Entity)

if __name__ == '__main__':
    # Loading the tweet data
    tweets = [tweet for tweet in open('../../assets/dataset/captured-tweets.txt', 'r')]

    # Setting up spacy
    nlp_toolkit = English()
