In [1]:
###### IMPORTANT NOTEBOOK FOR REPO X2

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [4]:
# Import needed packages
import csv
import search
import simulate_keywords
from google_client import GoogleClient

In [23]:
 # !!! Never commit a real key: blank this value and clear the cell output before uploading
%env TRENDS_DEVELOPER_KEY=""

env: TRENDS_DEVELOPER_KEY=""


In [6]:
# Set the initial search terms you want to run the simulation for here
initial_search_terms = ['home abortion']

In [7]:
def load_geolocations():
    """Read the simulation's locations CSV into a list of location dicts.

    The file path comes from ``simulate_keywords.Simulation.LOCATIONS_FILE``.

    Returns:
        list[dict]: one ``{"code": ..., "description": ...}`` entry per CSV
        row, built from the ``geo_code`` and ``description`` columns, in
        file order.
    """
    locations_path = simulate_keywords.Simulation.LOCATIONS_FILE
    with open(locations_path, "r") as handle:
        return [
            {"code": record["geo_code"], "description": record["description"]}
            for record in csv.DictReader(handle)
        ]

In [8]:
def run_simulation(initial_search_term, startDateTrends='2019-11', endDateTrends='2020-11', startDateTimelines='2019-11-01', endDateTimelines='2020-11-01'):
    """Run the keyword simulation for one search term in every geolocation.

    Generates the master list of top queries across all geolocations for the
    given time period, and collects the relative search volume of the top
    queries for the initial search term.

    Args:
        initial_search_term: term the simulation is seeded with.
        startDateTrends, endDateTrends: date bounds forwarded unchanged to
            ``simulate_keywords.Simulation``.
        startDateTimelines, endDateTimelines: date bounds forwarded unchanged
            to ``simulate_keywords.Simulation``.

    Returns:
        tuple: ``(master_list, relative_search_volumes,
        top_queries_relative_search_index, initial_queries)`` where

        * ``master_list`` is the set of every query string seen in any location,
        * ``relative_search_volumes`` maps location code to that simulation's
          ``relative_search_volumes``,
        * ``top_queries_relative_search_index`` maps location code to a list of
          ``{'query': ..., 'value': ...}`` dicts,
        * ``initial_queries`` is ``simulation.initial_queries`` of the LAST
          location processed, or ``[]`` when there are no locations.
    """
    master_list = set()
    relative_search_volumes = dict()
    # Mapping of location code -> list of top queries and their relative search index
    top_queries_relative_search_index = dict()
    # Stays empty when the locations file has no rows; previously the return
    # statement read the loop variable and raised UnboundLocalError in that case.
    last_initial_queries = []
    for loc in load_geolocations():
        simulation = simulate_keywords.Simulation(initial_search_term, loc, startDateTrends, endDateTrends, startDateTimelines, endDateTimelines)
        simulation.generate_keywords()
        simulation.get_relative_search_volumes()
        simulation.generate_simulation_csvs()
        print(simulation.initial_queries)

        # Each q holds the properties of one relevant search query (query, value, level, ...)
        location_queries = simulation.initial_queries
        master_list.update(q['query'] for q in location_queries)
        relative_search_volumes[loc['code']] = simulation.relative_search_volumes
        top_queries_relative_search_index[loc['code']] = [
            {'query': q['query'], 'value': q['value']} for q in location_queries
        ]
        last_initial_queries = location_queries
    return master_list, relative_search_volumes, top_queries_relative_search_index, last_initial_queries

## STEP 1  

In [9]:
# Ashley Kong October 18, 2020
# remove duplicate relevant search terms 
# and return list of dictionaries of 
# unique queries and their respective relevant search indices
def remove_duplicates(running_log_of_relevant_terms_list, list_of_dictionaries):
    """Keep only queries not yet in the running log, then re-standardize them.

    Args:
        running_log_of_relevant_terms_list: list of query strings already
            seen, e.g. ['home abortion', ...]. Mutated in place: every newly
            accepted query is appended.
        list_of_dictionaries: list of dicts; the first key of each dict is
            read as the query and its value as that query's index.

    Returns:
        dict: the result of ``restandardize_rsi`` applied to the
        ``{query: value}`` mapping of newly seen queries.
    """
    fresh_queries = {}
    for entry in list_of_dictionaries:
        term = list(entry)[0]  # ex. 'abortion pill'
        if term in running_log_of_relevant_terms_list:
            continue
        running_log_of_relevant_terms_list.append(term)
        fresh_queries[term] = entry[term]
    return restandardize_rsi(fresh_queries)

# Ashley Kong November 17, 2020
# Restandardize the relative search index
# after obtaining unique list of relevant search terms
# for a given initial search term(can be a term from a specific level)

def restandardize_rsi(unique_queries_dict):
    """Rescale the values of a ``{query: index}`` dict so they sum to 1.

    The dict is updated in place and the same object is returned.

    Args:
        unique_queries_dict: mapping of query -> numeric index,
            e.g. {'abortion pill': 27, ...}.

    Returns:
        dict: ``unique_queries_dict`` itself, each value divided by the sum of
        all values. If the sum is 0 (empty dict, or every index is 0) the dict
        is returned unchanged, because there is nothing to rescale and dividing
        would raise ZeroDivisionError.
    """
    total = sum(unique_queries_dict.values())
    if total == 0:
        return unique_queries_dict
    # Only values are reassigned, so iterating over the dict while updating is safe.
    for query in unique_queries_dict:
        unique_queries_dict[query] = unique_queries_dict[query] / total
    return unique_queries_dict


# Ashley Kong November 17, 2020
# Running log of all levels and terms in a single iteration
def add_level_log(new_level, origin_initial_term, dictionary_rsi_terms):
    """Record one origin term's follow-up terms under a level of ``level_log``.

    Reads and mutates the notebook-level ``level_log`` dict.

    Args:
        new_level: key of the level to write to, e.g. 1.
        origin_initial_term: the previous-level term whose follow-up terms are
            being stored, e.g. 'abortion pill'.
        dictionary_rsi_terms: mapping stored for that origin term,
            e.g. {'abortion': ###, ...}.

    Returns:
        dict: the updated ``level_log``.
    """
    # Reuse the level's existing {origin term: {term: rsi}} mapping when there
    # is one; otherwise start a new mapping for this level.
    terms_for_level = level_log[new_level] if new_level in level_log else {}
    terms_for_level[origin_initial_term] = dictionary_rsi_terms
    level_log[new_level] = terms_for_level
    return level_log
    



In [10]:
"""
Run the simulation for all initial search terms 
Generates a dictionary mapping initial_search_term to its master list for all locations
Generates a dictionary mapping initial_serach_term to its relative search volumes for all locations
"""
# Accumulators keyed by initial search term; each is filled from one
# run_simulation() call per term.
all_master_lists = dict()
all_relative_search_volumes = dict()
all_top_queries_rsi = dict()
lst_initial_queries = []
for initial_search_term in initial_search_terms: 
    # Note: must specify explicit startDate & endDate unless wish to use default values
    master_list, relative_search_volumes, top_queries_rsi, initial_queries = run_simulation(initial_search_term)
    # run_simulation returns the master list as a set; store it as a list.
    all_master_lists[initial_search_term] = list(master_list)
    all_relative_search_volumes[initial_search_term] = relative_search_volumes
    all_top_queries_rsi[initial_search_term] = top_queries_rsi
    # Rebound on every pass (not appended to), so after the loop this holds
    # only the initial queries returned for the last search term.
    lst_initial_queries = initial_queries

[{'query': 'at home abortion', 'value': 100, 'level': 1, 'follow_up_terms': []}, {'query': 'abortion pill', 'value': 27, 'level': 1, 'follow_up_terms': []}, {'query': 'home remedies for abortion', 'value': 14, 'level': 1, 'follow_up_terms': []}, {'query': 'at home abortions', 'value': 11, 'level': 1, 'follow_up_terms': []}, {'query': 'how to have an abortion at home', 'value': 10, 'level': 1, 'follow_up_terms': []}, {'query': 'at home abortion methods', 'value': 2, 'level': 1, 'follow_up_terms': []}]


In [11]:
# Applies ONLY to the level 1 terms, i.e. the follow-up terms of the original
# initial search term.
# RUN THIS ***ONLY ONCE*** FOR THE ORIGINAL INITIAL SEARCH TERM
rsv = all_relative_search_volumes['home abortion']['US']

# Each entry is read as a one-item {query: relative search volume} dict.
queries = [list(entry.keys())[0] for entry in rsv]
rsvs = [list(entry.values())[0] for entry in rsv]

# One row per query, highest relative search volume first.
rsv_df = pd.DataFrame(
    list(zip(queries, rsvs)),
    columns=['Queries', 'Relative Search Volume'],
).sort_values('Relative Search Volume', ascending=False)
rsv_df

Unnamed: 0,Queries,Relative Search Volume
1,abortion pill,0.936026
0,at home abortion,0.039269
3,at home abortions,0.010861
2,home remedies for abortion,0.007403
4,how to have an abortion at home,0.005408
5,at home abortion methods,0.001033


In [159]:
# Save the volumes, highest first. The DataFrame index is written as the first
# (unnamed) CSV column because to_csv keeps the index by default.
rsv_df = rsv_df.sort_values(by='Relative Search Volume', ascending=False)
rsv_df.to_csv('homeabortion_rsv.csv', header=True)

In [12]:
all_top_queries_rsi

{'home abortion': {'US': [{'query': 'at home abortion', 'value': 100},
   {'query': 'abortion pill', 'value': 27},
   {'query': 'home remedies for abortion', 'value': 14},
   {'query': 'at home abortions', 'value': 11},
   {'query': 'how to have an abortion at home', 'value': 10},
   {'query': 'at home abortion methods', 'value': 2}]}}

## STEP 2 Standardize RSI's and Remove Duplicates

In [13]:
# RESTANDARDIZE RSI BY REMOVING DUPLICATES
# Need to specify initial search term or origin level search term
# Skip for level 1 queries there will be no duplicates
all_top_queries_rsi['home abortion']["US"]

[{'query': 'at home abortion', 'value': 100},
 {'query': 'abortion pill', 'value': 27},
 {'query': 'home remedies for abortion', 'value': 14},
 {'query': 'at home abortions', 'value': 11},
 {'query': 'how to have an abortion at home', 'value': 10},
 {'query': 'at home abortion methods', 'value': 2}]

In [14]:
def structure_rsi_dictionary(all_top_queries_rsi_lists_of_dictionaries):
    """Turn ``{'query': q, 'value': v}`` records into one-item ``{q: v}`` dicts.

    Args:
        all_top_queries_rsi_lists_of_dictionaries: iterable of dicts that each
            have a 'query' and a 'value' key; other keys are ignored.

    Returns:
        list[dict]: one ``{query: value}`` dict per record, in input order.
    """
    return [
        {record['query']: record['value']}
        for record in all_top_queries_rsi_lists_of_dictionaries
    ]

In [15]:
list_of_rsi = structure_rsi_dictionary(all_top_queries_rsi['home abortion']["US"])
d = list_of_rsi
d

[{'at home abortion': 100},
 {'abortion pill': 27},
 {'home remedies for abortion': 14},
 {'at home abortions': 11},
 {'how to have an abortion at home': 10},
 {'at home abortion methods': 2}]

In [16]:
running_log_of_relevant_terms_list

[]

In [17]:
def remove_dup(running_log_of_relevant_terms_list, list_of_rsi_dicts):
    """Collect the queries that are not yet in the running log.

    Unlike ``remove_duplicates``, the values are returned as-is (no
    re-standardization).

    Args:
        running_log_of_relevant_terms_list: list of query strings already
            seen. Mutated in place: each newly accepted query is appended.
        list_of_rsi_dicts: list of dicts; the first key of each dict is read
            as the query and its value as that query's index.

    Returns:
        dict: ``{query: value}`` for every query that was not in the log.
    """
    newly_seen = {}
    for rsi_entry in list_of_rsi_dicts:
        term = list(rsi_entry)[0]  # ex. 'abortion pill'
        already_logged = term in running_log_of_relevant_terms_list
        if not already_logged:
            running_log_of_relevant_terms_list.append(term)
            newly_seen[term] = rsi_entry[term]
    return newly_seen

In [18]:
running_log_of_relevant_terms_list

[]

In [19]:
#                     running_log_of_relevant_terms_list, lst_initial_queries[initial_search_term][location]
#d = remove_duplicates(running_log_of_relevant_terms_list, all_relative_search_volumes['abortion at home']['US'])
#d = remove_duplicates(running_log_of_relevant_terms_list, list_of_rsi)
d = remove_dup(running_log_of_relevant_terms_list, list_of_rsi)
d

{'at home abortion': 100,
 'abortion pill': 27,
 'home remedies for abortion': 14,
 'at home abortions': 11,
 'how to have an abortion at home': 10,
 'at home abortion methods': 2}

## STEP 3 Update running log for a single iteration

In [20]:
# Check this Step (the cell below) 
# to see what level we are at/ what terms to look at next

# NOTE: If a NoneType is received for an origin_initial_term we 
#       will not provide a dictionary for that origin_initial_term

In [21]:
# Repeat for each level term

# Arguments: level, origin (previous-level) search term, dict of unique queries -> RSI
add_level_log(1, 'home abortion', d)
# level_log[0] --> the initial search term itself (set when level_log is initialized)

# level_log[1] --> level 1 terms and their RSIs, keyed by the initial search term
#                  EX. {'home abortion': {'abortion pill': ###, etc.} }

# level_log[2] --> once added: level 2 terms and their RSIs
#                  (ie. the relevant search terms of level 1 terms), keyed by level 1 term
#                  EX. {'abortion pill': {'some term': ###, etc.} }


{0: 'home abortion',
 1: {'home abortion': {'at home abortion': 100,
   'abortion pill': 27,
   'home remedies for abortion': 14,
   'at home abortions': 11,
   'how to have an abortion at home': 10,
   'at home abortion methods': 2}}}

In [22]:
# Log of current unique queries
running_log_of_relevant_terms_list

['at home abortion',
 'abortion pill',
 'home remedies for abortion',
 'at home abortions',
 'how to have an abortion at home',
 'at home abortion methods']

## STEP 4 Store Iteration results 

In [5]:
import json

In [None]:
# We can see which levels ended 
# by looking at the column level ie. if there is NaN
df = pd.DataFrame(level_log)
df.head(20)

In [None]:
# Name initialsearchterm_iteration#.txt
# Use a context manager so the file is flushed and closed even if dumping fails.
# JSON object keys are always strings, so the integer level keys of level_log
# are written as "0", "1", ...
with open("homeabortion.txt", 'w') as level_log_file:
    json.dump(level_log, level_log_file)

## STEP 0 Initialization of logs (run this section first, before STEP 1)

In [1]:
#initialization at the beginning of each iteration
running_log_of_relevant_terms_list = []
rsv = []


In [2]:
# level_log = {0: initial search term}
# Level 0 holds the initial search term. add_level_log() later adds, for each
# level N >= 1: level_log[N] = {previous-level term: {term: rsi, ...}, ...}
level_log = {0:'home abortion'}