In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('hony_data.csv')

In [3]:
del df['image_id']

In [4]:
df = df.drop_duplicates('image_url')

In [5]:
df.head()

Unnamed: 0,image_url,post,no_likes,no_comments,image_labels,image_text,web_entities
0,https://instagram.fftw1-1.fna.fbcdn.net/vp/e49...,“I didn’t get accepted into any of the univers...,493.6k,3806,footwear fashion accessory jeans shoulder shoe...,,Casa Adela Brandon Stanton New York City Bingh...
1,https://instagram.fftw1-1.fna.fbcdn.net/vp/2ea...,"""I'm trying to live my life without conflict s...",268k,2714,sitting vehicle temple headgear street health ...,,Brandon Stanton New York City India Black Pant...
2,https://instagram.fftw1-1.fna.fbcdn.net/vp/158...,“I was a full time housewife. I kept mostly to...,411.7k,4103,woman facial expression lady smile snapshot gi...,,New York City Felines of New York: A Glimpse I...
3,https://instagram.fftw1-1.fna.fbcdn.net/vp/053...,"""I don't know how old I am."" (Mumbai, India)",451.4k,5207,face facial expression yellow head smile eye t...,,India Humans of New York Humans of New York: S...
4,https://instagram.fftw1-1.fna.fbcdn.net/vp/808...,“I resented my mother for the longest time. Sh...,237.2k,1201,hand finger nail arm jewellery ring,,New York City Sadhana H. Varshney Mumbai Human...


In [6]:
def get_location(x):
    pattern = "\(\D*\)" # non-digits within parentheses (excludes instances with multiple posts, e.g. "Caption...(2/3)")
    match = re.findall(pattern, x)
    if len(match) == 0:
        return ("NYC, USA")
    else:
        return match[len(match)-1].lstrip("(").rstrip(")")

In [7]:
df['location'] = df['post'].map(get_location)
df['location'].value_counts()

NYC, USA                                                              326
Mumbai, India                                                          29
Jaipur, India                                                          27
São Paulo, Brazil                                                      25
Rio de Janeiro, Brazil                                                 20
St. Petersburg, Russia                                                 19
Bogotá, Colombia                                                       18
Udaipur, India                                                         18
Moscow, Russia                                                         15
Dhaka, Bangladesh                                                      15
Santiago, Chile                                                        13
Jakarta, Indonesia                                                     11
Montevideo, Uruguay                                                     9
Cordoba, Argentina                    

#### Get all likes in thousands

In [8]:
def likes_num(x):
    match = re.findall('k',x)
    if len(match) == 0:
        return int(float(x.rstrip('m'))*1000)
    else: 
        return int(float(x.rstrip('k')))

In [9]:
df['likes'] = df['no_likes'].map(lambda x: likes_num(x))
df['comments'] = df['no_comments'].map(lambda x: x.replace(",", ""))
df.head()

Unnamed: 0,image_url,post,no_likes,no_comments,image_labels,image_text,web_entities,location,likes,comments
0,https://instagram.fftw1-1.fna.fbcdn.net/vp/e49...,“I didn’t get accepted into any of the univers...,493.6k,3806,footwear fashion accessory jeans shoulder shoe...,,Casa Adela Brandon Stanton New York City Bingh...,"Mumbai, India",493,3806
1,https://instagram.fftw1-1.fna.fbcdn.net/vp/2ea...,"""I'm trying to live my life without conflict s...",268k,2714,sitting vehicle temple headgear street health ...,,Brandon Stanton New York City India Black Pant...,"Mumbai, India",268,2714
2,https://instagram.fftw1-1.fna.fbcdn.net/vp/158...,“I was a full time housewife. I kept mostly to...,411.7k,4103,woman facial expression lady smile snapshot gi...,,New York City Felines of New York: A Glimpse I...,"Mumbai, India",411,4103
3,https://instagram.fftw1-1.fna.fbcdn.net/vp/053...,"""I don't know how old I am."" (Mumbai, India)",451.4k,5207,face facial expression yellow head smile eye t...,,India Humans of New York Humans of New York: S...,"Mumbai, India",451,5207
4,https://instagram.fftw1-1.fna.fbcdn.net/vp/808...,“I resented my mother for the longest time. Sh...,237.2k,1201,hand finger nail arm jewellery ring,,New York City Sadhana H. Varshney Mumbai Human...,"Mumbai, India",237,1201


In [10]:
df['likes_decile'] = pd.cut(df['likes'],10, labels = False) # even spacing as opposed to qcut 
# 1 represents fewest likes
df['likes_decile'].value_counts()

1    267
0    205
2     85
3     28
4      8
6      3
9      1
Name: likes_decile, dtype: int64

In [11]:
# bin values
pd.cut(df['likes'],10).value_counts()

(146.0, 252.0]     267
(38.94, 146.0]     205
(252.0, 358.0]      85
(358.0, 464.0]      28
(464.0, 570.0]       8
(676.0, 782.0]       3
(994.0, 1100.0]      1
(888.0, 994.0]       0
(782.0, 888.0]       0
(570.0, 676.0]       0
Name: likes, dtype: int64

### Topic Modeling

In [12]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
%matplotlib inline
import math as m
from sklearn.cluster import KMeans
import numpy as np
from sklearn import manifold

In [13]:
def clean(doc):
    doc = re.sub(r'[^\w\s]*', '', doc) 
    doc = re.sub(r'[\s]+', ' ', doc)
    doc = doc.lower().strip()
    return doc

df['words'] = df['post'].map(lambda x: clean(x).split())
clean_docs = df['words'].tolist()
# clean_docs

#### Remove stopwords - probably want to add locations to the list because those still show up in words 

In [27]:
stops = list(stopwords.words('english'))
addtl_sw = ['one','got',"im","ive","id",'even','like','going'] 

def remove_stopwords(doc):
    sw_doc = list()
    for token in doc:
        if not token in stops and not token in addtl_sw:
            sw_doc.append(token)
    return sw_doc
            
df['tokens'] = df['words'].map(lambda x: remove_stopwords(x))
sw_token_docs = df['tokens'].tolist()


In [15]:
del df['no_likes'], df['no_comments']
df.head()

Unnamed: 0,image_url,post,image_labels,image_text,web_entities,location,likes,comments,likes_decile,words,tokens
0,https://instagram.fftw1-1.fna.fbcdn.net/vp/e49...,“I didn’t get accepted into any of the univers...,footwear fashion accessory jeans shoulder shoe...,,Casa Adela Brandon Stanton New York City Bingh...,"Mumbai, India",493,3806,4,"[i, didnt, get, accepted, into, any, of, the, ...","[didnt, get, accepted, universities, wanted, e..."
1,https://instagram.fftw1-1.fna.fbcdn.net/vp/2ea...,"""I'm trying to live my life without conflict s...",sitting vehicle temple headgear street health ...,,Brandon Stanton New York City India Black Pant...,"Mumbai, India",268,2714,2,"[im, trying, to, live, my, life, without, conf...","[im, trying, live, life, without, conflict, do..."
2,https://instagram.fftw1-1.fna.fbcdn.net/vp/158...,“I was a full time housewife. I kept mostly to...,woman facial expression lady smile snapshot gi...,,New York City Felines of New York: A Glimpse I...,"Mumbai, India",411,4103,3,"[i, was, a, full, time, housewife, i, kept, mo...","[full, time, housewife, kept, mostly, shy, per..."
3,https://instagram.fftw1-1.fna.fbcdn.net/vp/053...,"""I don't know how old I am."" (Mumbai, India)",face facial expression yellow head smile eye t...,,India Humans of New York Humans of New York: S...,"Mumbai, India",451,5207,3,"[i, dont, know, how, old, i, am, mumbai, india]","[dont, know, old, mumbai, india]"
4,https://instagram.fftw1-1.fna.fbcdn.net/vp/808...,“I resented my mother for the longest time. Sh...,hand finger nail arm jewellery ring,,New York City Sadhana H. Varshney Mumbai Human...,"Mumbai, India",237,1201,1,"[i, resented, my, mother, for, the, longest, t...","[resented, mother, longest, time, always, affe..."


In [20]:
import logging, gensim
from gensim import corpora, models, similarities
from gensim.corpora.dictionary import Dictionary

def get_corpus(tokens):
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc) for doc in tokens]
    return corpus

In [25]:
dictionary = corpora.Dictionary(sw_token_docs)
corpus = [dictionary.doc2bow(doc) for doc in sw_token_docs]

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
lda = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=10, update_every=1, passes=20)

2018-03-01 10:35:37,241 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-01 10:35:37,345 : INFO : built Dictionary(5366 unique tokens: [u'inning', u'wetlands', u'exams', u'todays', u'chile']...) from 597 documents (total 38136 corpus positions)
2018-03-01 10:35:37,442 : INFO : using symmetric alpha at 0.1
2018-03-01 10:35:37,444 : INFO : using symmetric eta at 0.000186358553858
2018-03-01 10:35:37,447 : INFO : using serial LDA version on this node
2018-03-01 10:35:37,700 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 597 documents, updating model once every 597 documents, evaluating perplexity every 597 documents, iterating 50x with a convergence threshold of 0.001000
2018-03-01 10:35:40,527 : INFO : -10.831 per-word bound, 1821.2 perplexity estimate based on a held-out corpus of 597 documents with 38136 words
2018-03-01 10:35:40,528 : INFO : PROGRESS: pass 0, at document #597/597
2018-03-01 10:35:41,429 : INFO 

2018-03-01 10:35:53,579 : INFO : topic #9 (0.100): 0.011*"time" + 0.007*"today" + 0.006*"want" + 0.006*"new" + 0.006*"first" + 0.006*"lost" + 0.005*"go" + 0.005*"make" + 0.005*"ill" + 0.005*"microfashion"
2018-03-01 10:35:53,580 : INFO : topic diff=0.203769, rho=0.377964
2018-03-01 10:35:55,408 : INFO : -7.774 per-word bound, 218.9 perplexity estimate based on a held-out corpus of 597 documents with 38136 words
2018-03-01 10:35:55,409 : INFO : PROGRESS: pass 6, at document #597/597
2018-03-01 10:35:56,024 : INFO : topic #7 (0.100): 0.008*"time" + 0.008*"didnt" + 0.008*"never" + 0.008*"years" + 0.007*"life" + 0.007*"get" + 0.007*"told" + 0.006*"us" + 0.006*"people" + 0.006*"could"
2018-03-01 10:35:56,026 : INFO : topic #5 (0.100): 0.011*"always" + 0.009*"school" + 0.007*"every" + 0.007*"life" + 0.006*"brazil" + 0.006*"dont" + 0.006*"get" + 0.006*"people" + 0.006*"work" + 0.005*"time"
2018-03-01 10:35:56,028 : INFO : topic #0 (0.100): 0.010*"dont" + 0.008*"know" + 0.007*"didnt" + 0.007*"

2018-03-01 10:36:08,889 : INFO : PROGRESS: pass 12, at document #597/597
2018-03-01 10:36:09,370 : INFO : topic #8 (0.100): 0.012*"always" + 0.008*"years" + 0.007*"told" + 0.007*"still" + 0.006*"didnt" + 0.006*"make" + 0.006*"dont" + 0.006*"time" + 0.006*"mom" + 0.006*"could"
2018-03-01 10:36:09,371 : INFO : topic #6 (0.100): 0.010*"always" + 0.009*"never" + 0.008*"time" + 0.008*"people" + 0.007*"dont" + 0.007*"love" + 0.007*"get" + 0.006*"feel" + 0.005*"lot" + 0.005*"theyre"
2018-03-01 10:36:09,372 : INFO : topic #2 (0.100): 0.012*"get" + 0.011*"dont" + 0.007*"time" + 0.006*"people" + 0.006*"lot" + 0.006*"think" + 0.005*"started" + 0.005*"would" + 0.005*"shes" + 0.005*"money"
2018-03-01 10:36:09,374 : INFO : topic #1 (0.100): 0.012*"want" + 0.010*"make" + 0.010*"ill" + 0.010*"told" + 0.009*"years" + 0.008*"time" + 0.008*"money" + 0.008*"school" + 0.007*"feel" + 0.007*"made"
2018-03-01 10:36:09,375 : INFO : topic #7 (0.100): 0.009*"time" + 0.008*"didnt" + 0.008*"years" + 0.008*"never" 

2018-03-01 10:36:24,142 : INFO : topic #4 (0.100): 0.008*"stories" + 0.007*"thought" + 0.006*"everything" + 0.006*"theyre" + 0.005*"fight" + 0.005*"never" + 0.005*"two" + 0.005*"war" + 0.005*"cant" + 0.005*"time"
2018-03-01 10:36:24,143 : INFO : topic #5 (0.100): 0.011*"always" + 0.009*"school" + 0.007*"brazil" + 0.007*"every" + 0.007*"life" + 0.006*"people" + 0.006*"get" + 0.006*"time" + 0.005*"work" + 0.005*"day"
2018-03-01 10:36:24,144 : INFO : topic #8 (0.100): 0.012*"always" + 0.008*"years" + 0.007*"told" + 0.007*"still" + 0.006*"didnt" + 0.006*"make" + 0.006*"dont" + 0.006*"time" + 0.006*"could" + 0.006*"mom"
2018-03-01 10:36:24,145 : INFO : topic diff=0.010722, rho=0.223607
2018-03-01 10:36:26,063 : INFO : -7.733 per-word bound, 212.8 perplexity estimate based on a held-out corpus of 597 documents with 38136 words
2018-03-01 10:36:26,064 : INFO : PROGRESS: pass 19, at document #597/597
2018-03-01 10:36:26,509 : INFO : topic #6 (0.100): 0.010*"always" + 0.008*"never" + 0.008*"peo

In [26]:
t=0
for i in lda.show_topics(num_topics=5, num_words=15, log=False, formatted=True):
    print "Topic # ", t , i
    t = t + 1

Topic #  0 (5, u'0.011*"always" + 0.009*"school" + 0.007*"brazil" + 0.007*"every" + 0.007*"life" + 0.006*"people" + 0.006*"get" + 0.006*"time" + 0.005*"work" + 0.005*"day" + 0.005*"de" + 0.005*"dont" + 0.005*"first" + 0.005*"still" + 0.005*"janeiro"')
Topic #  1 (6, u'0.010*"always" + 0.008*"never" + 0.008*"people" + 0.008*"time" + 0.007*"dont" + 0.007*"love" + 0.007*"get" + 0.006*"feel" + 0.005*"lot" + 0.005*"theyre" + 0.005*"would" + 0.005*"things" + 0.004*"want" + 0.004*"parents" + 0.004*"know"')
Topic #  2 (8, u'0.012*"always" + 0.008*"years" + 0.007*"told" + 0.007*"still" + 0.006*"didnt" + 0.006*"make" + 0.006*"dont" + 0.006*"time" + 0.006*"could" + 0.006*"mom" + 0.005*"school" + 0.005*"people" + 0.005*"us" + 0.005*"know" + 0.005*"never"')
Topic #  3 (4, u'0.008*"stories" + 0.007*"thought" + 0.006*"everything" + 0.006*"theyre" + 0.005*"fight" + 0.005*"never" + 0.005*"two" + 0.005*"war" + 0.005*"cant" + 0.005*"read" + 0.005*"time" + 0.005*"get" + 0.005*"told" + 0.005*"cancer" + 0.0