In [1]:
import statsmodels.api as sm
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


## Setting up data

In [2]:
df = pd.read_csv('insta_caption.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Engagement_score,photo,hashtags,happypills,pills,pharma,bigpharma,1,2,...,tree,turnthetide,amish,farm,ngari,zanda,idg2018,dayofthegirl,unicef,educategirls
0,0,-0.63576,1,"['happypills', 'pills', 'pharma', 'bigpharma']",1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-0.075393,1,"['1', '2']",0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,-0.381748,1,"['FollowMe', 'Madagascar', 'Enoughness', 'natu...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1.064558,1,"['snowstorm', 'penguin', 'antarctica']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0.036183,1,"['whales', 'humpbackwhales', 'parenting', 'pla...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.shape

(997, 1239)

In [5]:
insta_captions = pd.DataFrame(df[['hashtags','Engagement_score']], index= df.index)

In [6]:
insta_captions.dropna(axis=0, inplace=True)

In [7]:
#pros.dropna(axis=0, inplace=True)
#cons.dropna(axis=0, inplace=True)

stop = set(stopwords.words('english'))
punc = string.punctuation

def clean_tokenize(s):
        s = re.sub(r'[^\w\s]', '',s.lower())
        return([word for word in word_tokenize(s) if word not in stop if word not in punc])
def get_lemma(word):
        lemma = wordnet.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma
        def get_lemma2(word):
            return WordNetLemmatizer().lemmatize(word)
def lemmatize(my_tokens):
    """Function to enable data preparation for topic modeling using LDA, 
    includes removing stop words, lemmatization, tokenization"""
    my_tokens = [get_lemma(token) for token in my_tokens]
    return my_tokens

# def dataprep(text_data_df):
#     pros = pd.DataFrame(text_data_df['Pros'], index= text_data_df.index)
#     cons = pd.DataFrame(text_data_df['Cons'],  index= text_data_df.index)
#     pros.dropna(axis=0, inplace=True)
#     cons.dropna(axis=0, inplace=True)
#     stop = set(stopwords.words('english'))
#     punc = string.punctuation
    
#     pros['pros_tokens'] =pros.Pros.map(clean_tokenize)
#     cons['cons_tokens'] =cons.Cons.map(clean_tokenize)
#     #my_tokens = [get_lemma(token) for token in my_tokens]
#     pros['pros_lemma_tokens'] = pros['pros_tokens'].apply(lemmatize)
#     #pros['pros_lemma_tokens'] = [get_lemma(token) for token in pros['pros_tokens']]
#     cons['cons_lemma_tokens'] = cons['cons_tokens'].apply(lemmatize)
#     return pros, cons
    
    


    
    

In [8]:
insta_captions['clean_tokens'] =insta_captions.hashtags.map(clean_tokenize)

In [9]:
insta_captions.head()

Unnamed: 0,hashtags,Engagement_score,clean_tokens
0,"['happypills', 'pills', 'pharma', 'bigpharma']",-0.63576,"[happypills, pills, pharma, bigpharma]"
1,"['1', '2']",-0.075393,"[1, 2]"
2,"['FollowMe', 'Madagascar', 'Enoughness', 'natu...",-0.381748,"[followme, madagascar, enoughness, nature]"
3,"['snowstorm', 'penguin', 'antarctica']",1.064558,"[snowstorm, penguin, antarctica]"
4,"['whales', 'humpbackwhales', 'parenting', 'pla...",0.036183,"[whales, humpbackwhales, parenting, planetofth..."


# Topic Modeling

In [10]:
from gensim import corpora
dictionary = corpora.Dictionary(insta_captions['clean_tokens'])
corpus = [dictionary.doc2bow(text) for text in insta_captions['clean_tokens']]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')



In [36]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.064*"followme" + 0.028*"nature" + 0.019*"antarctica" + 0.017*"ocean" + 0.016*"wildlife"')
(1, '0.025*"simonnorfolk" + 0.023*"documentaryphotography" + 0.014*"photojournalism" + 0.014*"everydayrefugees" + 0.013*"archaeology"')
(2, '0.031*"okavangolions" + 0.020*"thisismytrophy" + 0.019*"nature" + 0.019*"wildlife" + 0.017*"tsaropride"')
(3, '0.028*"onassignment" + 0.021*"muhammedmuheisen" + 0.020*"stephenwilkes" + 0.018*"daytonight" + 0.015*"photojournalism"')
(4, '0.033*"kenya" + 0.026*"africa" + 0.018*"photojournalism" + 0.018*"nature" + 0.015*"northernkenya"')


# Top Topics

* Landscapes
* Refugees
* Large Wildcats
* Top NatGeo Photographers
* Africa


### Ordering by Engagement Score

In [12]:
# sorting the df
insta_captions.sort_values(by='Engagement_score', ascending=False)

Unnamed: 0,hashtags,Engagement_score,clean_tokens
776,"['blackrhino', 'poachers', 'Ivory']",14.102355,"[blackrhino, poachers, ivory]"
836,[],12.622301,[]
330,[],4.637615,[]
542,[],4.221715,[]
161,[],3.354633,[]
338,[],3.090159,[]
170,[],3.059542,[]
46,"['penguin', 'antarctica']",2.865012,"[penguin, antarctica]"
850,[],2.552083,[]
603,"['whales', 'beluga']",2.275912,"[whales, beluga]"


## Finding Quantiles

In [13]:
lower_quantile, upper_quantile = insta_captions.Engagement_score.quantile([.25, .75])

In [15]:
higher = insta_captions[insta_captions['Engagement_score'] > upper_quantile]
lower = insta_captions[insta_captions['Engagement_score'] > lower_quantile]

In [16]:
def topics_reveal(high_df, low_df, num_topics = 5, num_words = 4):
    dictionary = corpora.Dictionary(high_df['clean_tokens'])
    corpus = [dictionary.doc2bow(text) for text in high_df['clean_tokens']]
    pickle.dump(corpus, open('corpus_high.pkl', 'wb'))
    NUM_TOPICS = num_topics
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    topics_high = ldamodel.print_topics(num_words=4)
    print(" For Upper Quartile, 5 topics are defined as follows: ")
    for topic in topics_high:
        print(topic)
        
    dictionary = corpora.Dictionary(low_df['clean_tokens'])
    corpus = [dictionary.doc2bow(text) for text in low_df['clean_tokens']]
    pickle.dump(corpus, open('corpus_low.pkl', 'wb'))
    NUM_TOPICS = num_topics
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    topics_low = ldamodel.print_topics(num_words=num_words)
    print(" For Lower Quartile, 5 topics are defined as follows: ")
    for topic in topics_low:
        print(topic)
    return topics_high, topics_low
    print(type(topics_high))

#     top_words_per_topic = []
#     for t in range(lda_model.num_topics):
#         top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 5)])

#         pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv")

In [17]:
t = topics_reveal(higher, lower)

 For Upper Quartile, 5 topics are defined as follows: 
(0, '0.037*"maasaimara" + 0.025*"daytonight" + 0.025*"stephenwilkes" + 0.021*"kenya"')
(1, '0.025*"onassignment" + 0.013*"nature" + 0.013*"africa" + 0.013*"bigcatweekend"')
(2, '0.035*"okavangolions" + 0.023*"kenya" + 0.022*"followme" + 0.020*"stoppoaching"')
(3, '0.099*"followme" + 0.057*"antarctica" + 0.035*"penguin" + 0.029*"nature"')
(4, '0.032*"wildlife" + 0.032*"commonground" + 0.022*"publiclands" + 0.016*"marinewildlife"')
 For Lower Quartile, 5 topics are defined as follows: 
(0, '0.026*"kenya" + 0.026*"stephenwilkes" + 0.024*"africa" + 0.021*"nature"')
(1, '0.043*"followme" + 0.027*"nature" + 0.026*"wildlife" + 0.017*"muhammedmuheisen"')
(2, '0.029*"followme" + 0.018*"everydayrefugees" + 0.010*"wildlifephotography" + 0.007*"onassignment"')
(3, '0.029*"antarctica" + 0.022*"followme" + 0.022*"commonground" + 0.013*"publiclands"')
(4, '0.033*"floridawild" + 0.020*"pathofthepanther" + 0.016*"japan" + 0.016*"keepflwild"')


## Transferring topics into dataframe

In [18]:
num_topics = 5
num_words = 4


dictionary = corpora.Dictionary(higher['clean_tokens'])
corpus = [dictionary.doc2bow(text) for text in higher['clean_tokens']]
pickle.dump(corpus, open('corpus_high.pkl', 'wb'))
UM_TOPICS = num_topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics_high = ldamodel.print_topics(num_words=4)    

top_words_per_topic = []
for t in range(ldamodel.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in ldamodel.show_topic(t, topn = 5)])

    high_q = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])    

        
dictionary = corpora.Dictionary(lower['clean_tokens'])
corpus = [dictionary.doc2bow(text) for text in lower['clean_tokens']]
pickle.dump(corpus, open('corpus_low.pkl', 'wb'))
NUM_TOPICS = num_topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics_low = ldamodel.print_topics(num_words=num_words) 

top_words_per_topic = []
for t in range(ldamodel.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in ldamodel.show_topic(t, topn = 5)])

    low_q = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']) 
    




In [19]:
#Finding the overall mean for a topic

low_group = low_q.groupby(['Word']).mean()
high_group = high_q.groupby(['Word']).mean()
low_group.drop('Topic', axis=1, inplace=True)
high_group.drop('Topic', axis=1, inplace=True)

In [21]:
#calculating the difference between the upper and lower quartile

table = high_group.merge(low_group, on='Word', suffixes=('_high', '_low'))
table['difference'] = (table['P_high']-table['P_low'])
table['abs_dif'] = abs(table['difference'])

In [22]:
table.sort_values('abs_dif', ascending=False)

Unnamed: 0_level_0,P_high,P_low,difference,abs_dif
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
antarctica,0.058818,0.019529,0.039289,0.039289
followme,0.047489,0.025868,0.02162,0.02162
penguin,0.040248,0.019692,0.020556,0.020556
nature,0.026782,0.016026,0.010756,0.010756
okavangolions,0.032989,0.022262,0.010727,0.010727
stephenwilkes,0.016113,0.025773,-0.00966,0.00966
commonground,0.024149,0.015511,0.008638,0.008638
daytonight,0.016104,0.024373,-0.008269,0.008269
conservation,0.021594,0.013861,0.007734,0.007734
wildlifephotography,0.018438,0.011465,0.006973,0.006973


### Recommendation: To increase engagement, NatGeo should make posts with its top photographers showing natural landscapes, specifically the Artic and African landscapes with its wildlife.