# <center> Assignment 4: Is a Picture Worth a Thousand Words? </center>
## <center> Matt Viteri, Abhilash Gupta, Stephen Darasimi Oluwaniyi, Colin Chu </center>

In [7]:
import ast
import json
import os
import warnings

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from google.cloud import vision

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

# Point the Google client library at the service-account key file, then build
# the Vision client and one reusable request image object.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="APAD-DARASIMI-8c1a66d445ac.json")
client = vision.ImageAnnotatorClient()
image = vision.types.Image()


ModuleNotFoundError: No module named 'google.cloud'

## Task 0. 

On Instagram, choose the National Geographic (natgeo) page (do not use hashtags). Write a
scraper or use the Web Scraper to extract 
- Image URLs (do not extract video URLs, it may end up costing you a lot of money to run analytics on video)
- Post caption (the text description of a post)
- Number of likes
- Number of comments (You don’t need actual comments for this assignment)
- Scrape around 400-500 image posts. 

Using the image URLs, obtain image labels from Google Vision cloud (you will have to create an
account with Google to get your credentials as a json file, though the first $300 are free, which
should be more than plenty for this assignment)
- You will need to write a script to access the Google Vision API.

In [None]:
## See scraper in attached file
# Load the scraper's CSV export into a DataFrame and preview its first rows.
data = pd.read_csv("output.csv")
data.head()

## Task A. 

Create a metric for engagement by using a weighted sum of # likes and # comments. 
- First normalize # likes and # comments such that they both have values between 0 and 1. You can scale the # likes by dividing by the maximum # likes (for a post) in your data and do the same for # comments, so that # likes and comments will be in the range [0,1]
- Now create an <b> engagement score = .4*# likes (normalized) + .6*# comments (normalized) </b>
- Define High (1) and Low (0) engagement based on whether the engagement score is above or below the median value

### Normalizing Likes & Comments

In [None]:
def _min_max_normalize(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1."""
    lowest, highest = values.min(), values.max()
    return (values - lowest) / (highest - lowest)


# Normalize both engagement signals with the same helper so they share a scale.
Likes = data['likes']
Norm_Likes = _min_max_normalize(Likes)
print(Norm_Likes.head())
# Bounds are reported in (min, max) order so the message reads "between 0 and 1".
print("The range of Norm_Likes is between %i and %i"%(Norm_Likes.min(),Norm_Likes.max()))
print()
Comments = data['comments']
Norm_Comments = _min_max_normalize(Comments)
print(Norm_Comments.head())
print("\n The range of Norm_Comments is between %i and %i"%(Norm_Comments.min(),Norm_Comments.max()))

### Engagement Score

In [None]:
# Weighted sum of the two normalized signals: 0.4 for likes, 0.6 for comments.
Engagement_Score = Norm_Likes.mul(0.4).add(Norm_Comments.mul(0.6))
print(Engagement_Score.head())
# Median of the score across all posts.
Engagement_Median = Engagement_Score.median()
print("The Engagement_Median is %f"% Engagement_Median)

### Determine Engagement [ 1 or 0 ] based on Engagement Score

In [None]:
# Flag a post as 1 when its score is at or above the median, otherwise 0.
Engagement = (Engagement_Score >= Engagement_Median).astype('int64')
print(Engagement.head())

In [None]:
# Attach each derived Series to the posts frame under its column name.
for column_name, column_values in (
    ('Norm_Likes', Norm_Likes),
    ('Norm_Comments', Norm_Comments),
    ('Engagement_Score', Engagement_Score),
    ('Engagement', Engagement),
):
    data[column_name] = column_values

In [None]:
# Write the posts frame, including the derived engagement columns, to TaskA.csv without the row index.
data.to_csv('TaskA.csv', index=False)

In [None]:
## Getting the image labels

def detect_labels_uri(uri):
    """Return the label descriptions the Vision client reports for an image URI.

    The module-level ``image`` object is pointed at ``uri`` and sent to the
    module-level ``client`` for label detection; the ``description`` of every
    returned annotation is collected, in response order, into a list.
    """
    image.source.image_uri = uri
    annotations = client.label_detection(image=image).label_annotations
    return [annotation.description for annotation in annotations]

In [None]:
# Read the Task A export back in as the working frame for image labelling.
df = pd.read_csv('TaskA.csv')

In [None]:
# Look up labels for each media URL (one label-detection call per row) and store the resulting lists.
df["image_labels"] = df["media_url"].apply(detect_labels_uri)

In [None]:
# Restrict the frame to the columns below, in this order.
export_columns = [
    'type',
    'media_url',
    'image_labels',
    'caption',
    'likes',
    'comments',
    'Norm_Likes',
    'Norm_Comments',
    'Engagement_Score',
    'Engagement',
]
df = df.loc[:, export_columns]

In [None]:
# Write the labelled posts to image_labels.csv without the row index.
df.to_csv('image_labels.csv', index=False)

## Task B

Run a logistic regression with Engagement (binary) as the dependent variable, and the
image labels as independent variables. 
- What is the accuracy (show the confusion matrix)?
- What accuracy do you get by using the post caption words as the independent variables instead of image labels?
- Finally, what accuracy do you get by combining the image labels and post captions and using them as independent variables? What can you conclude from your analysis?
- Note: Doing a word frequency analysis and word replacement on the image labels as well as captions will increase the accuracy of prediction. Needless to say, TF-IDF scores should be used. 

## Task C
Perform topic modeling (LDA) on the image labels. Choose an appropriate number of topics. 
- You may want to start with 5, but adjust the number up or down depending on the word distributions you get
- LDA should produce two outputs: 
    - A file showing which words load on which topics
    - A file showing topic weights for each image. 

In [None]:
# Number of topics the LDA model is fitted with; later cells also use it to size the per-topic tables.
NUMBER_OF_TOPICS = 5 

In [None]:
# Reload the labelled posts. Each label list was written to the CSV as the text of a
# Python list literal, so it is parsed with ast.literal_eval. Swapping quote characters
# and parsing as JSON fails on any label containing an apostrophe or a double quote.
df = pd.read_csv('image_labels.csv')
df['image_labels'] = df['image_labels'].apply(ast.literal_eval)

# One document per image: its list of labels, turned into bag-of-words vectors.
texts = df['image_labels'].tolist()
id2word = gensim.corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# Fit LDA with a fixed random_state so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=NUMBER_OF_TOPICS,
                                           random_state=9,
                                           update_every=1,
                                           chunksize=25,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=True)

lda_model.print_topics()

In [None]:
# Switch pyLDAvis to notebook display, then prepare the visualisation from the fitted model, corpus and dictionary.
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

In [None]:
# For every topic, collect the words behind the term ids that get_topic_terms returns.
topic_word_rows = []
for topic in range(NUMBER_OF_TOPICS):
    top_terms = lda_model.get_topic_terms(topic)
    topic_word_rows.append((topic, [id2word[term[0]] for term in top_terms]))

# Column 0 holds the topic id, column 1 the list of words; written without header or index.
c1 = pd.DataFrame(topic_word_rows)
c1.to_csv('TaskCi.csv', header=False, index=False)

In [None]:
def get_topic_weights(corpus):
    """Build a documents-by-topics table of LDA topic weights.

    Parameters
    ----------
    corpus : sequence of bag-of-words documents
        Each element is passed to ``lda_model.get_document_topics``.

    Returns
    -------
    pandas.DataFrame
        One row per document (positional index) and one column per topic id in
        ``range(NUMBER_OF_TOPICS)``. Topics that ``get_document_topics`` does
        not report for a document keep the initial weight of 0.0.
    """
    # Start from float zeros: the weights written below are floats, so an integer
    # frame would depend on pandas silently changing the column dtype on assignment.
    result = pd.DataFrame(0.0, columns=range(NUMBER_OF_TOPICS), index=range(len(corpus)))
    for row, document in enumerate(corpus):
        for topic_id, weight in lda_model.get_document_topics(document):
            result.iloc[row, topic_id] = weight
    return result

# Topic weights for every image in the full corpus, exported for Task C.
c2 = get_topic_weights(corpus)

c2.to_csv('TaskCii.csv')

In [None]:
def get_avg_weights(df):
    """Average the topic weights over the posts in ``df``.

    The ``image_labels`` lists are converted to bag-of-words vectors with the
    shared ``id2word`` dictionary and scored by ``get_topic_weights``; each
    topic column is then summed and divided by the number of rows.
    """
    label_lists = df['image_labels'].tolist()
    bags_of_words = [id2word.doc2bow(labels) for labels in label_lists]
    topic_weights = get_topic_weights(bags_of_words)
    document_count = topic_weights.shape[0]
    return topic_weights.sum() / document_count

# Rank posts from highest to lowest engagement score and cut the ranking into four
# consecutive groups. Row positions are split instead of the frame itself, so the
# DataFrame is never handed to NumPy's array-splitting routine.
ranked_posts = df.sort_values(by='Engagement_Score', ascending=False)
df1, df2, df3, df4 = (
    ranked_posts.iloc[positions]
    for positions in np.array_split(np.arange(len(ranked_posts)), 4)
)

# Average topic weights in the top and bottom groups.
scores1 = get_avg_weights(df1)
scores4 = get_avg_weights(df4)
# One row per topic, materialised as a list so it can be re-read. A distinct name
# keeps the posts frame bound to `data` intact.
topic_summary = list(zip(range(NUMBER_OF_TOPICS), c1[1], scores1.tolist(), scores4.tolist(), (scores1-scores4).tolist()))

# None means "do not truncate column text"; the older -1 sentinel is not accepted by current pandas.
pd.set_option('display.max_colwidth', None)
end_result = pd.DataFrame(topic_summary, columns=['Topics', 'Words', 'Avg. top quartile weight', 'Avg. bottom quartile weight', 'Difference'])
end_result.set_index('Topics').sort_values(by='Difference', ascending=False)

## Task D 

What advice would you give National Geographic if it wants to increase engagement on its Instagram page based on your findings in Tasks B and C? 