In [18]:
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import *
import pandas as pd
import os
import csv
from scipy.stats import pearsonr
import numpy as np
import re
import pytrends
import bash
import random
import pdb
import calendar
import requests
from requests.exceptions import Timeout
from pytrends.exceptions import ResponseError
from torpy.circuit import  CellTimeoutError

In [19]:
def get_cookie(driver_path='/Users/liuy5/Documents/GitHub/GT-MVT-Annually-DMA/msedgedriver', wait_seconds=5):
    """Open Google Trends in Microsoft Edge and return the value of its ``NID`` cookie.

    Parameters
    ----------
    driver_path : str
        Location of the ``msedgedriver`` executable. Defaults to the path the
        notebook was originally written against.
    wait_seconds : int or float
        How long to wait after loading the page before reading the cookie.

    Returns
    -------
    The ``"value"`` entry of the cookie named ``NID``.

    Notes
    -----
    The browser is always closed, even if loading the page or reading the
    cookie raises.
    NOTE(review): if the ``NID`` cookie is absent, the subscript below will
    presumably fail on a missing cookie object - confirm the desired behaviour.
    """
    from selenium import webdriver
    from selenium.webdriver.edge.service import Service

    options = webdriver.EdgeOptions()
    service = Service(executable_path=driver_path)

    # Create a WebDriver instance for Microsoft Edge
    driver = webdriver.Edge(service=service, options=options)
    try:
        driver.get("https://trends.google.com/")
        time.sleep(wait_seconds)
        cookie = driver.get_cookie("NID")["value"]
    finally:
        # Release the browser process whether or not the cookie was obtained.
        driver.quit()
    return cookie

# Cookie header fragment ("NID=<value>") built from a freshly scraped browser cookie.
nid_cookie = "NID={}".format(get_cookie())


In [20]:
from pytrends.request import TrendReq

# Extra keyword arguments handed to the HTTP layer of the Trends client:
# a desktop-browser User-Agent plus the NID cookie string built earlier.
requests_args = dict(
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Cookie": format(nid_cookie),
    }
)
 



In [21]:
def init():
    """Send a single HTTP GET to ``ifconfig.me`` through a 3-hop Tor circuit.

    The response (up to 1024 bytes) is read and discarded. Always returns 0;
    connection failures propagate to the caller.
    """
    from torpy import TorClient

    hostname = 'ifconfig.me'  # It's possible use onion hostname here as well
    request = b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % hostname.encode()

    # Choose random guard node and create 3-hops circuit
    with TorClient() as tor, tor.create_circuit(3) as circuit:
        # Create tor stream to host; the reply is read but not used
        with circuit.create_stream((hostname, 80)) as stream:
            stream.send(request)
            stream.recv(1024)
    return 0

In [22]:
def column_name_to_mvt_and_remove_zero_values(df, year):
    """Rename the first column to ``MVT_GT_<year>`` and drop rows where it equals 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the values of interest.
    year
        Label interpolated into the new column name (callers in this notebook
        pass either a year or a geography name).

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched (the previous
        implementation renamed the caller's column in place).
    """
    mvt_column = f"MVT_GT_{year}"
    renamed = df.rename(columns={df.columns[0]: mvt_column})
    return renamed[renamed[mvt_column] != 0]

In [23]:
def gt_by_region_by_year(mvt_keyword, start_year, end_year, geo_level):
    df = pd.DataFrame()
    for i in range(start_year, end_year+1):
            try:
                pytrend.build_payload(mvt_keyword, cat=0, timeframe=f"{i}-01-01 {i}-12-31", geo = "US")
                temp_df = pytrend.interest_by_region(resolution=geo_level, inc_low_vol=True, inc_geo_code=False)
                print(f"{time.now().strf()} Obtaining GT MVT from {i}-01-01 to {i}-12-31 in {geo_level}")
                
            except (ResponseError, Timeout, ConnectionAbortedError, ConnectionError) as error:
                try:
                    print(f"GT Error Occured: {error}")
                    !pip install pytrends --upgrade
                    #init()
                    #rint(f"Let's Go, Onion!")
                    pytrend.build_payload(mvt_keyword, cat=0, timeframe=f"{i}-01-01 {i}-12-31", geo = "US")
                    temp_df = pytrend.interest_by_region(resolution=geo_level, inc_low_vol=True, inc_geo_code=False)
                    print(f"Obtaining GT MVT from {i}-01-01 to {i}-12-31 in {geo_level}")

                except (ResponseError, Timeout, ConnectionAbortedError, ConnectionError) as error:
                    print(f"GT Error Occured: {error}")
                    print(f"Let me sleep for 1 day, till {(datetime.now() + timedelta(days=1)).strftime('%m/%d/%Y, %H:%M:%S')}")
                    print("ZZzzzz...")
                    time.sleep(random.randint(60*60*24, 60*60*24+30,))
                    print("Was a nice sleep, now let me continue...")

                    pytrend.build_payload(mvt_keyword, cat=0, timeframe=f"{i}-01-01 {i}-12-31", geo = "US")
                    temp_df = pytrend.interest_by_region(resolution=geo_level, inc_low_vol=True, inc_geo_code=False)
                    print(f"Obtaining GT MVT from {i}-01-01 to {i}-12-31 in {geo_level}")
                
            temp_df = column_name_to_mvt_and_remove_zero_values(temp_df, i)
            df = pd.concat([df, temp_df.iloc[:, 0]], axis = 1)
            time.sleep(random.randint(600, 660))
    return df

In [24]:
def execute_init():
    """Call ``init()`` until it succeeds, pausing 10 seconds after each retryable failure.

    Only ``TimeoutError``, ``CellTimeoutError`` and ``ConnectionResetError``
    trigger a retry; any other exception propagates. Returns 0 once ``init()``
    has completed.
    """
    retryable = (TimeoutError, CellTimeoutError, ConnectionResetError)
    connected = False
    while not connected:
        try:
            init()
            print("init status completed")
            connected = True
        except retryable as error:
            print(f"Init Error Occured: {error}")
            print("Let me sleep for 10 seconds")
            print("ZZzzzz...")
            time.sleep(10)
            print("Was a nice sleep, now let me continue...")
    return 0

In [8]:
def GT_temp_df(start_date, end_date, geo_code, geo_name):
    temp_df = pd.DataFrame()
    while len(temp_df) == 0:
        try:
            pytrend.build_payload(mvt_keyword, cat=0, timeframe=f"{start_date} {end_date}", geo = "US")
            temp_df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=False)

        except (ResponseError, Timeout, ConnectionAbortedError, ConnectionError) as error:
            try:
                print(f"GT Error Occured: {error}")
                !pip install pytrends --upgrade
                #init()
                #rint(f"Let's Go, Onion!")
                pytrend.build_payload(mvt_keyword, cat=0, timeframe=f"{start_date} {end_date}", geo = "US")
                temp_df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=False)

            except (ResponseError, Timeout, ConnectionAbortedError, ConnectionError) as error:
                print(f"GT Error Occured: {error}")
                print(f"Let me sleep for 1 day, till {(datetime.now() + timedelta(days=1)).strftime('%m/%d/%Y, %H:%M:%S')}")
                print("ZZzzzz...")
                time.sleep(random.randint(60*60*24, 60*60*24+30,))
                print("Was a nice sleep, now let me continue...")


    print(f"Time {pd.Timestamp.now().strftime('%Y%m%d_%H_%M')} Obtaining GT MVT from {start_date} to {end_date} in {geo_name} metro area")
    time.sleep(random.randint(600, 660))
    temp_df = column_name_to_mvt_and_remove_zero_values(temp_df, geo_name)
    return temp_df

In [9]:
def combine_df_and_temp_df_and_rescale_by_overlap_factors(df, temp_df):
    """Append the first column of ``temp_df`` to ``df`` and merge the two onto one scale.

    When the combined frame has more than one column, the second column is
    multiplied by the mean ratio (first column / second column) taken over rows
    where no value is missing, and the row-wise mean of the frame is returned
    as a Series. With a single column the combined frame is returned unchanged.
    """
    merged = pd.concat([df, temp_df.iloc[:, 0]], axis = 1)
    if len(merged.columns) <= 1:
        return merged

    # Rows with no missing values are the overlap used to estimate the scale.
    overlap = merged.dropna(how = "any")
    ratios = overlap.iloc[:, 0]/overlap.iloc[:, 1]
    scale = ratios.mean()

    merged.iloc[:,1] = merged.iloc[:,1]*scale
    return merged.mean(axis = 1)

In [10]:
# One-off check of the Tor connection; the return value of init() (0) becomes the cell output.
init()

[Errno 61] Connection refused
ERROR:root:[ignored]
Traceback (most recent call last):
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/cell_socket.py", line 63, in connect
    self._socket.connect((self._router.ip, self._router.or_port))
  File "/Users/liuy5/anaconda3/lib/python3.11/ssl.py", line 1412, in connect
    self._real_connect(addr, False)
  File "/Users/liuy5/anaconda3/lib/python3.11/ssl.py", line 1399, in _real_connect
    super().connect(addr)
ConnectionRefusedError: [Errno 61] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/utils.py", line 79, in newfn
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/consesus.py", line 235, in renew
    raw_string = self.download_consensus(prev_hash)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


ERROR:torpy.cell_socket:[Errno 61] Connection refused
ERROR:root:[ignored]
Traceback (most recent call last):
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/cell_socket.py", line 63, in connect
    self._socket.connect((self._router.ip, self._router.or_port))
  File "/Users/liuy5/anaconda3/lib/python3.11/ssl.py", line 1412, in connect
    self._real_connect(addr, False)
  File "/Users/liuy5/anaconda3/lib/python3.11/ssl.py", line 1399, in _real_connect
    super().connect(addr)
ConnectionRefusedError: [Errno 61] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/utils.py", line 79, in newfn
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/liuy5/anaconda3/lib/python3.11/site-packages/torpy/consesus.py", line 235, in renew
    raw_string = self.download_consensus(prev_hash)
                 ^^^^^^^^^^^

0

In [11]:
# Shared Google Trends client used by the pull functions in this notebook:
# up to 3 retries per request, sent with the custom headers in `requests_args`.
pytrend = TrendReq(retries=3, requests_args = requests_args)

In [12]:
# Disabled keyword list kept for reference: wrapped in a string literal so that
# `kw_list` is NOT assigned here; the cell only displays the text.
'''
kw_list = ["car stolen+find stolen car+report police stolen car+insurance car stolen-dream-check",
    "I raped+raped me+raped report police+being raped+been raped - dream - kobe - trump - porn",
    "my stolen+find my stolen+theft report police+someone stolen my - car - heart - dream - number",
    "burglary+home burglarized+my house broken into+my home broken into+burglary report police - dream",
    "my committed a crime+police arrested my+my arrested+my in jail+my in prison - arrested development",
    "I hate family+parents hate me+I hate my parents+I hate school+I hate teacher+I hate dad+I hate mom"]
'''
#years = [i for i in range(2010, 2016)]
#print(years)
#months = [i for i in range(1, 13, 3)] 
#print(months)

'\nkw_list = ["car stolen+find stolen car+report police stolen car+insurance car stolen-dream-check",\n    "I raped+raped me+raped report police+being raped+been raped - dream - kobe - trump - porn",\n    "my stolen+find my stolen+theft report police+someone stolen my - car - heart - dream - number",\n    "burglary+home burglarized+my house broken into+my home broken into+burglary report police - dream",\n    "my committed a crime+police arrested my+my arrested+my in jail+my in prison - arrested development",\n    "I hate family+parents hate me+I hate my parents+I hate school+I hate teacher+I hate dad+I hate mom"]\n'

### GT Keywords (Test)

In [13]:
# Single combined search expression for motor-vehicle-theft queries, wrapped in a
# list because the pull functions pass it to build_payload as a keyword list.
mvt_keyword = [
    "car stolen+find stolen car+report police stolen car+insurance car stolen-dream-check",
]

In [14]:
#gt_by_region_by_year(mvt_keyword, 2011, 2014, "REGION")

In [15]:
#reversed_dict = {value: key for key, value in region_code_dict.items()}

### Dates (From, To)

In [16]:
# Inclusive range of calendar years to pull, and the region resolution to request.
start_year, end_year = 2011, 2022
geo_level = "DMA"

## Execute Pytrends to Pull Annual DMA-Level Data from Google Trends

In [17]:

# Repeat the full annual, per-region pull 100 times; every repetition is written
# to its own timestamped CSV, followed by a pause of roughly 3 hours.
for times in range(100):
    df = gt_by_region_by_year(mvt_keyword, start_year, end_year, geo_level)
    # Build the name once so the file written and the file reported can never
    # differ when the clock crosses a minute boundary between the two calls.
    out_name = f"{geo_level}_{start_year}_{end_year}_annually_{pd.Timestamp.now().strftime('%Y%m%d_%H_%M')}"
    df.to_csv(f"{out_name}.csv")
    print(f"{out_name} file saved")
    time.sleep(random.randint(10200, 10600))

Obtaining GT MVT from 2011-01-01 to 2011-12-31 in DMA
Obtaining GT MVT from 2012-01-01 to 2012-12-31 in DMA
Obtaining GT MVT from 2013-01-01 to 2013-12-31 in DMA
Obtaining GT MVT from 2014-01-01 to 2014-12-31 in DMA
Obtaining GT MVT from 2015-01-01 to 2015-12-31 in DMA
Obtaining GT MVT from 2016-01-01 to 2016-12-31 in DMA
Obtaining GT MVT from 2017-01-01 to 2017-12-31 in DMA
Obtaining GT MVT from 2018-01-01 to 2018-12-31 in DMA
Obtaining GT MVT from 2019-01-01 to 2019-12-31 in DMA
Obtaining GT MVT from 2020-01-01 to 2020-12-31 in DMA
Obtaining GT MVT from 2021-01-01 to 2021-12-31 in DMA
Obtaining GT MVT from 2022-01-01 to 2022-12-31 in DMA
DMA_2011_2022_annually_20240226_20_00 file saved
Obtaining GT MVT from 2011-01-01 to 2011-12-31 in DMA
Obtaining GT MVT from 2012-01-01 to 2012-12-31 in DMA
Obtaining GT MVT from 2013-01-01 to 2013-12-31 in DMA
Obtaining GT MVT from 2014-01-01 to 2014-12-31 in DMA
Obtaining GT MVT from 2015-01-01 to 2015-12-31 in DMA
Obtaining GT MVT from 2016-01-01 

RetryError: HTTPSConnectionPool(host='trends.google.com', port=443): Max retries exceeded with url: /trends/api/widgetdata/comparedgeo?req=%7B%22geo%22%3A+%7B%22country%22%3A+%22US%22%7D%2C+%22comparisonItem%22%3A+%5B%7B%22time%22%3A+%222011-01-01+2011-12-31%22%2C+%22complexKeywordsRestriction%22%3A+%7B%22keyword%22%3A+%5B%7B%22type%22%3A+%22BROAD%22%2C+%22value%22%3A+%22car+stolen%22%7D%2C+%7B%22type%22%3A+%22BROAD%22%2C+%22value%22%3A+%22find+stolen+car%22%7D%2C+%7B%22type%22%3A+%22BROAD%22%2C+%22value%22%3A+%22report+police+stolen+car%22%7D%2C+%7B%22type%22%3A+%22BROAD%22%2C+%22value%22%3A+%22insurance+car+stolen-dream-check%22%7D%5D%2C+%22operator%22%3A+%22OR%22%7D%7D%5D%2C+%22resolution%22%3A+%22DMA%22%2C+%22locale%22%3A+%22en-US%22%2C+%22requestOptions%22%3A+%7B%22property%22%3A+%22%22%2C+%22backend%22%3A+%22IZG%22%2C+%22category%22%3A+0%7D%2C+%22userConfig%22%3A+%7B%22userType%22%3A+%22USER_TYPE_SCRAPER%22%7D%2C+%22includeLowSearchVolumeGeos%22%3A+true%7D&token=APP6_UEAAAAAZd86w2fv-SZP28rOlNlhEpDIhsrwrJX-&tz=360 (Caused by ResponseError('too many 429 error responses'))

In [None]:
# NOTE(review): `test` is not defined in any visible cell - confirm where it comes from before running.
test.plot(figsize=(20, 12),  kind ='line')

In [None]:
# Print every value of the keyword column multiplied by 23/25.
# A plain loop replaces the list comprehension, which was used only for its side
# effect and made the cell display a list of None.
# NOTE(review): `test2` is not defined in any visible cell, and the meaning of
# the 25 and 23 constants is not documented - confirm.
for value in test2["car stolen+find stolen car+report police stolen car+insurance car stolen-dream-check"]:
    print(value/25*23)

In [None]:
kw_list = ['Facebook', 'Apple', 'Amazon', 'Netflix', 'Google']
# NOTE: this rebinds `pytrend` to a client with default settings, replacing any
# client configured earlier in the notebook.
pytrend = TrendReq()

# Search interest per region: build the request for the keywords
# (these could also be competitors).
pytrend.build_payload(kw_list, timeframe='today 1-m')

# Interest by Region
regiondf = pytrend.interest_by_region()

# Keep only rows where every keyword is non-zero, then drop rows that are null
# in all columns. Chained (no inplace=True) so the filtered slice is not
# modified in place.
regiondf = regiondf[(regiondf != 0).all(axis=1)].dropna(how='all', axis=0)

# Visualise
regiondf.plot(figsize=(20, 12), y=kw_list, kind ='bar')

In [None]:
# Hourly historical interest for the five keywords over calendar year 2020.
# `geo` takes a geography code such as 'US'; the previous value 'en-US' is a
# language/locale string, not a geography.
# NOTE(review): recent pytrends releases reportedly disable
# get_historical_interest - confirm against the installed version.
historicaldf = pytrend.get_historical_interest(['Facebook', 'Apple', 'Amazon', 'Netflix', 'Google'], 
                                               year_start=2020, month_start=1, day_start=1, hour_start=0, 
                                               year_end=2020, month_end=12, day_end=31, hour_end=0, 
                                               cat=0, geo='US', gprop='', sleep=0)

# Visualise
# Plot a time-series chart
historicaldf.plot(figsize=(20, 12))

# Plot separate graphs, one per provided keyword
historicaldf.plot(subplots=True, figsize=(20, 12))

In [None]:
def gt_func(start_date_list, end_date_list, resolution, sample_size, path):
    """Repeatedly pull DMA-level Google Trends data per date window and save one CSV per window.

    Parameters
    ----------
    start_date_list, end_date_list
        Parallel sequences of objects with ``strftime`` (window start / end).
        Windows are labelled Q1..Q4 cyclically in the order given.
    resolution
        Currently unused; requests are always made with ``resolution="DMA"``.
        NOTE(review): confirm whether this parameter should be forwarded.
    sample_size : int
        Number of complete passes over all windows.
    path : str
        Prefix concatenated directly in front of each output file name.

    Notes
    -----
    Uses the module-level ``pytrend`` client and ``kw_list``. The output frame
    is given six fixed column names, so ``kw_list`` must yield six columns.
    After a ``Timeout``/``ResponseError`` the function sleeps about 24 hours and
    retries that keyword once; the retry itself is not guarded.
    """
    import inflect

    # Gives ordinal names to numbers ("1st", "2nd", ...) for progress messages.
    p = inflect.engine()
    qt_list = ["Q1", "Q2", "Q3", "Q4"]

    def _pull_keyword(keyword, timeframe):
        """Fetch interest-by-region for one keyword expression, then pause 120-130 s."""
        # build_payload iterates over its first argument, so a bare string would
        # be split into single characters; wrap it in a list.
        payload = [keyword] if isinstance(keyword, str) else keyword
        pytrend.build_payload(payload, cat=0, timeframe=timeframe, geo='US', gprop='')
        region_df = pytrend.interest_by_region(resolution="DMA", inc_low_vol=False, inc_geo_code=False)
        time.sleep(random.randint(120, 130))
        return region_df

    for t in range(sample_size):
        #time.sleep(random.randint(21600,21668)) # 6 hrs
        print(f"The {p.ordinal(t+1)} pull from GT")
        for window_idx, (i, j) in enumerate(zip(start_date_list, end_date_list)):
            # Quarter label cycles Q1 - Q4; computed up front so it is always
            # defined, even when kw_list is empty.
            q = qt_list[window_idx % len(qt_list)]
            timeframe = f'{i.strftime("%Y-%m-%d")} {j.strftime("%Y-%m-%d")}'
            df = pd.DataFrame()
            for k in kw_list:
                try:
                    region_df = _pull_keyword(k, timeframe)

                except (Timeout, ResponseError) as e:
                    print(f'!!{timeframe}!! ReadTimeoutError')
                    print(f"Whoops! ReadTimeoutError, the {p.ordinal(t+1)} pull aborted, now we take a rest")
                    # Was `dt.datetime.now()`; `dt` is never imported, `datetime` is.
                    print(datetime.now().strftime("%Y/%m/%d_%H:%M:%S"))
                    time.sleep(random.randint(86668, 86888)) # 24 hrs
                    print(f"Rest enough, now we continue!")

                    # Continue working: one unguarded retry of the same keyword.
                    region_df = _pull_keyword(k, timeframe)

                df = pd.concat([df, region_df], axis = 1)

            df.columns = [f"MVT_{i.strftime('%Y')}_{q}", 
                          f"rape_{i.strftime('%Y')}_{q}", 
                          f"larceny_{i.strftime('%Y')}_{q}", 
                          f"burglary_{i.strftime('%Y')}_{q}", 
                          f"known_deliq_{i.strftime('%Y')}_{q}", 
                          f"low_social_cont_{i.strftime('%Y')}_{q}"]
            df.to_csv(f'{path}gt_crime_{i.strftime("%Y")}_{q}_{datetime.now().strftime("%Y%m%d_%H-%M-%S")}.csv')
        print(f"The {p.ordinal(t+1)} pull from GT Done!!")
        print(f'Time finished: {datetime.now().strftime("%Y/%m/%d, %H:%M")}')
        time.sleep(random.randint(10800,10868)) # 3 hrs


    print(f"Congratulations, Well Done!")