In [1]:
###### IMPORTANT NOTEBOOK FOR REPO X3

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
# Import needed packages
import csv
import search
import simulate_keywords
from google_client import GoogleClient

In [165]:
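 # !!! Remove the key value before uploading this notebook. Write the key without quotes: %env stores quotes literally (see the echoed value below).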
%env TRENDS_DEVELOPER_KEY=''

env: TRENDS_DEVELOPER_KEY=''


In [135]:
# Initial search terms to run the simulation for; edit this list to change them.
# NOTE(review): the RSV/RSI cells further down read only initial_search_terms[0].
initial_search_terms = ['over the counter pill']

In [136]:
def load_geolocations():
    """Load the geolocations the simulation is run for.

    Reads the CSV at ``simulate_keywords.Simulation.LOCATIONS_FILE`` and keeps
    the ``geo_code`` and ``description`` columns of every row.

    Returns:
        list[dict]: one ``{"code": ..., "description": ...}`` dict per CSV
        row, in file order.

    Raises:
        KeyError: if the CSV header lacks ``geo_code`` or ``description``.
    """
    # newline="" is what the csv module asks for: without it, line breaks
    # embedded in quoted fields are rewritten by universal-newline translation.
    with open(simulate_keywords.Simulation.LOCATIONS_FILE, "r", newline="") as csvfile:
        return [
            {"code": row["geo_code"], "description": row["description"]}
            for row in csv.DictReader(csvfile)
        ]

In [137]:
"""
Generate master list of top queries for all geolocations during the specified time period
Get relative search volume of top queries for initial search term
"""
def run_simulation(initial_search_term, startDateTrends='2019-11', endDateTrends='2020-11', startDateTimelines='2019-11-01', endDateTimelines='2020-11-01'):
    """Run the keyword simulation for one search term over every geolocation.

    For each location returned by ``load_geolocations()`` a
    ``simulate_keywords.Simulation`` is built and its ``generate_keywords``,
    ``get_relative_search_volumes`` and ``generate_simulation_csvs`` steps are
    called in that order (the last one presumably writes CSV files - confirm
    in ``simulate_keywords``).

    Args:
        initial_search_term: the term the simulation starts from.
        startDateTrends, endDateTrends: date range passed through to
            ``Simulation`` (defaults use ``YYYY-MM``).
        startDateTimelines, endDateTimelines: date range passed through to
            ``Simulation`` (defaults use ``YYYY-MM-DD``).

    Returns:
        tuple: ``(master_list, relative_search_volumes,
        top_queries_relative_search_index, initial_queries)`` where

        * ``master_list`` is the set of every ``query`` seen in any
          location's ``simulation.initial_queries``;
        * ``relative_search_volumes`` maps location code ->
          ``simulation.relative_search_volumes``;
        * ``top_queries_relative_search_index`` maps location code -> list of
          ``{'query': ..., 'value': ...}`` dicts;
        * ``initial_queries`` is ``simulation.initial_queries`` of the last
          location processed, or ``[]`` when there are no locations.
    """
    master_list = set()
    relative_search_volumes = dict()
    # Mapping of location code -> list of top queries and their relative search index
    top_queries_relative_search_index = dict()
    # Kept outside the loop so the return below is well-defined even when
    # load_geolocations() yields nothing (previously an UnboundLocalError).
    last_initial_queries = []
    for loc in load_geolocations():
        simulation = simulate_keywords.Simulation(initial_search_term, loc, startDateTrends, endDateTrends, startDateTimelines, endDateTimelines)
        simulation.generate_keywords()
        simulation.get_relative_search_volumes()
        simulation.generate_simulation_csvs()

        # Each entry holds the properties of one relevant search query
        # (the query text, its value, its level, ...).
        last_initial_queries = simulation.initial_queries
        query_rel_search_index = [
            {'query': q['query'], 'value': q['value']} for q in last_initial_queries
        ]
        master_list.update(q['query'] for q in last_initial_queries)
        relative_search_volumes[loc['code']] = simulation.relative_search_volumes
        top_queries_relative_search_index[loc['code']] = query_rel_search_index
    return master_list, relative_search_volumes, top_queries_relative_search_index, last_initial_queries

## STEP 1  

In [138]:
# Ashley Kong October 18, 2020
# Remove duplicate relevant search terms
# and return a dictionary mapping each
# unique query to its re-standardized relative search index
def remove_duplicates(running_log_of_relevant_terms_list, list_of_dictionaries):
    """Keep only the queries not yet present in the running log.

    Args:
        running_log_of_relevant_terms_list: list of query strings seen so
            far, e.g. ``['home abortion', ...]``. Newly seen queries are
            appended to it in place.
        list_of_dictionaries: iterable of dicts whose first key is the query,
            e.g. ``all_relative_search_volumes[initial search term]['US']``.

    Returns:
        dict: ``{query: value}`` for the newly seen queries, after being
        passed through ``restandardize_rsi``.
    """
    fresh_terms = {}
    for entry in list_of_dictionaries:
        term = list(entry)[0]  # e.g. 'abortion pill'
        if term in running_log_of_relevant_terms_list:
            continue
        running_log_of_relevant_terms_list.append(term)
        fresh_terms[term] = entry[term]
    return restandardize_rsi(fresh_terms)

# Ashley Kong November 17, 2020
# Restandardize the relative search index
# after obtaining unique list of relevant search terms
# for a given initial search term(can be a term from a specific level)

def restandardize_rsi(unique_queries_dict):
    """Rescale the values of a ``{query: value}`` dict so they sum to 1.

    The dict is modified in place and the same object is returned.

    Args:
        unique_queries_dict: mapping such as ``{'abortion pill': ###, ...}``.

    Returns:
        dict: ``unique_queries_dict`` with every value divided by the sum of
        all values. When that sum is 0 (empty dict or all-zero values) the
        dict is returned unchanged, since there is nothing to rescale.
    """
    total = sum(unique_queries_dict.values())
    if total == 0:
        # Avoid ZeroDivisionError when every value is zero.
        return unique_queries_dict
    for query in unique_queries_dict:
        unique_queries_dict[query] = unique_queries_dict[query] / total
    return unique_queries_dict


# Ashley Kong November 17, 2020
# Running log of all levels and terms in a single iteration
def add_level_log(new_level, origin_initial_term, dictionary_rsi_terms):
    """Record the relevant terms found for one origin term at one level.

    NOTE(review): relies on a module-level dict named ``level_log`` that must
    already exist; it is not defined in the visible part of this notebook.

    Args:
        new_level: level key, e.g. ``1``.
        origin_initial_term: the previous-level term the new terms belong to,
            e.g. ``'abortion pill'``.
        dictionary_rsi_terms: the terms found for it, e.g. ``{'abortion': ###}``.

    Returns:
        dict: the global ``level_log``, shaped
        ``{level: {origin term: {term: value, ...}}}``.
    """
    # Reuse the entry for this level when there is one, otherwise start fresh.
    terms_by_origin = level_log[new_level] if new_level in level_log else {}
    terms_by_origin[origin_initial_term] = dictionary_rsi_terms
    level_log[new_level] = terms_by_origin
    return level_log
    



In [139]:
# Run the simulation for all initial search terms and build, keyed by
# initial search term:
#   all_master_lists            -> its master list for all locations
#   all_relative_search_volumes -> its relative search volumes for all locations
#   all_top_queries_rsi         -> its top queries' relative search indices for all locations
all_master_lists, all_relative_search_volumes, all_top_queries_rsi = {}, {}, {}
lst_initial_queries = []
for initial_search_term in initial_search_terms:
    # Note: pass explicit start/end dates here unless the defaults are wanted.
    (master_list,
     relative_search_volumes,
     top_queries_rsi,
     initial_queries) = run_simulation(initial_search_term)
    all_top_queries_rsi[initial_search_term] = top_queries_rsi
    all_relative_search_volumes[initial_search_term] = relative_search_volumes
    all_master_lists[initial_search_term] = list(master_list)
    # Overwritten on every pass, so it ends up holding the last term's queries.
    lst_initial_queries = initial_queries

## STEP 2 Acquire initial RSV/RSI

In [140]:
# Ashley Kong January 27, 2021
# Applies ONLY to the level 1 terms, i.e. the follow-up terms of the original
# initial search term.
# RUN THIS ***ONLY ONCE*** FOR THE ORIGINAL INITIAL SEARCH TERM

#*** Start of the 30 iteration run for rsv and rsi
# Relative search volumes of the first initial search term in the 'US' location.
rsv = all_relative_search_volumes[initial_search_terms[0]]['US']

# Every entry of rsv is read as a one-item {query: volume} dict.
queries = [list(d)[0] for d in rsv]
rsvs = [list(d.values())[0] for d in rsv]
rsv_df = pd.DataFrame(
    {'Queries': queries, 'Relative Search Volume': rsvs}
).sort_values('Relative Search Volume', ascending=False)
rsv_df

Unnamed: 0,Queries,Relative Search Volume
3,yeast infection,0.679489
9,pill identifier,0.219542
8,plan b pill,0.06239
7,viagra pill,0.019676
0,yeast infection pill,0.01435
1,over the counter yeast infection pill,0.001388
2,sleeping pill over the counter,0.000871
4,over the counter water pill,0.000749
5,best over the counter diet pill,0.000696
6,best over the counter weight loss pill,0.000609


In [144]:
# Relative search index per top query for the first initial search term in the 'US' location.
rsi = {
    entry['query']: entry['value']
    for entry in all_top_queries_rsi[initial_search_terms[0]]["US"]
}
rsi

{'yeast infection pill': 100,
 'over the counter yeast infection pill': 95,
 'sleeping pill over the counter': 95,
 'yeast infection': 90,
 'over the counter water pill': 71,
 'best over the counter diet pill': 52,
 'best over the counter weight loss pill': 52,
 'viagra pill': 52,
 'plan b pill': 47,
 'pill identifier': 9,
 'abortion pill over the counter cvs': 9}

## STEP 3 Conduct 30 iterations for initial term search

In [142]:
# Ashley Kong January 27, 2021
# Calculate the average relative search volume for 30 iterations
# NOTE: RSV's will be calculated for each 30 iteration run, but we only want
#       the RSV's for the top queries. Do not use the RSV for level 2 terms and beyond.

# The process to get 30 iterations for the 'over the counter pill' search took: 3min. 2sec. 95 ms.
# The process to get 1 iteration for the 'over the counter pill' search took: 3min. 27sec. 34ms.
# If we were to have done 30 iterations using the old method it would have taken: 1.725 hours.
# With this procedure we have improved our search algorithm, it now takes 2.93% of the original time
# it would have to conduct a 30 iteration sample of a search for a given query.

# NOTE: this cell reads rsv_df and rsi (the single-run results from the cells
# above) and overwrites both with averages. Re-run those cells before
# re-running this one, otherwise already-averaged values are averaged again.

#### CODE FOR REPO
# Total sample size, counting the one run already stored in rsv_df / rsi.
N_ITERATIONS = 30

# Running totals per query, seeded with the first run's results.
rsv_totals = dict(zip(rsv_df['Queries'].tolist(),
                      rsv_df['Relative Search Volume'].tolist()))
rsi_totals = dict(rsi)

for _ in range(N_ITERATIONS - 1):
    for initial_search_term in initial_search_terms:
        master_list, relative_search_volumes, top_queries_rsi, initial_queries = run_simulation(initial_search_term)
        all_master_lists[initial_search_term] = list(master_list)
        all_relative_search_volumes[initial_search_term] = relative_search_volumes
        all_top_queries_rsi[initial_search_term] = top_queries_rsi
        lst_initial_queries = initial_queries

    # Accumulate once per iteration. The totals only track the first initial
    # search term, so doing this inside the per-term loop would count the same
    # run several times whenever more than one term is configured.

    # Query and RSV for this iteration
    rsv = all_relative_search_volumes[initial_search_terms[0]]['US']
    new_data = {list(d.keys())[0]: list(d.values())[0] for d in rsv}
    for query, volume in new_data.items():
        rsv_totals[query] = rsv_totals.get(query, 0) + volume

    # Relative search indexes for this iteration
    new_rsi = {d['query']: d['value'] for d in all_top_queries_rsi[initial_search_terms[0]]["US"]}
    for query, index_value in new_rsi.items():
        rsi_totals[query] = rsi_totals.get(query, 0) + index_value

# A query missing from some iterations is still divided by N_ITERATIONS,
# i.e. it counts as 0 for the runs in which it did not appear.
rsi = {query: total / N_ITERATIONS for query, total in rsi_totals.items()}
rsv_df = pd.DataFrame({
    "Queries": list(rsv_totals.keys()),
    "Relative Search Volume": [total / N_ITERATIONS for total in rsv_totals.values()],
}).sort_values('Relative Search Volume', ascending=False)
rsv_df


Unnamed: 0,Queries,Relative Search Volume
0,yeast infection,0.679489
1,pill identifier,0.219542
2,plan b pill,0.06239
3,viagra pill,0.019676
4,yeast infection pill,0.01435
5,over the counter yeast infection pill,0.001388
6,sleeping pill over the counter,0.000871
7,over the counter water pill,0.000749
8,best over the counter diet pill,0.000696
9,best over the counter weight loss pill,0.000609


In [145]:
# Tabulate the relative search index per query. The values come from `rsi`,
# so the column is labelled as an index rather than a volume.
rsi_df = pd.DataFrame({"Query": list(rsi.keys()), "Relative Search Index": list(rsi.values())})
rsi_df

Unnamed: 0,Query,Relative Search Volume
0,yeast infection pill,100
1,over the counter yeast infection pill,95
2,sleeping pill over the counter,95
3,yeast infection,90
4,over the counter water pill,71
5,best over the counter diet pill,52
6,best over the counter weight loss pill,52
7,viagra pill,52
8,plan b pill,47
9,pill identifier,9


## STEP 4 Save 30 Iteration RSV/RSI as .csv files 

In [75]:
# Save the relative search volume dataframe and the relative search index
# dataframe as .csv files.
rsv_df.to_csv('overthecounterpill_rsv.csv', header=True)
# The RSI file must be written from rsi_df; it previously received a second
# copy of rsv_df.
rsi_df.to_csv('ocp_abortionpill_abortionpillonline_rsi.csv', header=True)

In [146]:
# Show the mapping initial search term -> location code -> top queries with
# their relative search index, as stored by the most recent run_simulation call.
all_top_queries_rsi

{'over the counter pill': {'US': [{'query': 'yeast infection pill',
    'value': 100},
   {'query': 'over the counter yeast infection pill', 'value': 95},
   {'query': 'sleeping pill over the counter', 'value': 95},
   {'query': 'yeast infection', 'value': 90},
   {'query': 'over the counter water pill', 'value': 71},
   {'query': 'best over the counter diet pill', 'value': 52},
   {'query': 'best over the counter weight loss pill', 'value': 52},
   {'query': 'viagra pill', 'value': 52},
   {'query': 'plan b pill', 'value': 47},
   {'query': 'pill identifier', 'value': 9},
   {'query': 'abortion pill over the counter cvs', 'value': 9}]}}