# Setup

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", IPython shows the value of every
# top-level expression statement in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
from tqdm import tqdm_notebook
import concurrent.futures
from multiprocessing import Pool

# GoogleSearch工具类

In [2]:
import requests
import re
import time
from lxml import etree
import random
import json

class GoogleSearch():
    """Small scraper that queries Google web search and parses the result page.

    Typical use is ``GoogleSearch().getResult_json(query)``, which fetches a
    session token (the ``kEI`` value embedded in the Google home page), requests
    the result page through the configured proxies, and extracts
    ``[title, link, summary]`` triples from the returned HTML.

    The XPath selectors in the parsing helpers are tied to specific CSS class
    names in Google's markup; pages that do not match simply yield no results.
    """

    # Timeout, in seconds, applied to every outgoing HTTP request.
    TIMEOUT_SECONDS = 4

    def __init__(self):
        self.gl_query = ""
        self.gl_proxies = {}
        # Last token returned by getCachedKey(); "" means none fetched yet.
        self.key = ""
        self.header = {"Referer":"https://www.google.com/",
                       'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
        # File that request() overwrites with the raw HTML when verbose=True.
        self.verbosePath = "googleSearchResult_request.html"

    def updateProxies(self,proxies):
        """Replace the proxy mapping passed to ``requests`` for search calls."""
        self.gl_proxies = proxies

    def checkCurrentIP(self):
        """Print the public IP seen without and with the configured proxies.

        The direct lookup is not guarded and propagates network errors; a
        failure of the proxied lookup is caught and reported on stdout.
        """
        print("\n\n"+">>> use proxy as follow: \n",self.gl_proxies)
        localIP = json.loads(requests.get("http://httpbin.org/ip",timeout=self.TIMEOUT_SECONDS).text.strip())
        print(">>> 本机ip:\n",localIP)
        try:
            proxyIP = json.loads(requests.get("http://httpbin.org/ip",proxies=self.gl_proxies,timeout=self.TIMEOUT_SECONDS).text.strip())
            print(">>> 代理ip:\n",proxyIP)
        except Exception as e:
            print(">>> 代理ip:\n",f"代理网络异常，Exception: {repr(e)}")

    @staticmethod
    def _extract_key(html):
        """Return the ``kEI`` token embedded in ``html``.

        Raises:
            ValueError: if the page does not contain a ``kEI:'...'`` token
                (previously this surfaced as an opaque ``AttributeError`` on
                ``None``).
        """
        match = re.search(r"kEI:\'(.*?)\'", html)
        if match is None:
            raise ValueError("kEI token not found in the Google home page response")
        return match.group(1)

    @staticmethod
    def getKey():
        """Fetch the Google home page and return a fresh ``kEI`` token.

        Always performs a network request. Kept static so that both
        ``GoogleSearch.getKey()`` and ``instance.getKey()`` work.

        Raises:
            ValueError: if no token can be found in the response.
        """
        # A timeout keeps an unresponsive host from hanging the caller forever.
        resp = requests.get("http://www.google.com", timeout=GoogleSearch.TIMEOUT_SECONDS)
        return GoogleSearch._extract_key(resp.text)

    def getCachedKey(self):
        """Return a token, reusing the previously fetched one part of the time.

        This is the instance-level variant that used to share the name
        ``getKey`` with the static method above and was therefore shadowed and
        never executed. When a token has already been fetched, it is reused
        when ``random.random() < 0.3`` and refreshed otherwise, so the request
        pattern is less regular.
        """
        if self.key == "" or random.random() >= 0.3:
            self.key = self.getKey()
        return self.key

    @staticmethod
    def _build_url(query, key):
        """Build the search URL with ``key`` and ``query`` percent-encoded."""
        from urllib.parse import urlencode
        # Encoding matters: a raw space, '&' or '#' in the query would otherwise
        # truncate or corrupt the request.
        return "https://www.google.com/search?" + urlencode({"source": "hp", "ei": key, "q": query})

    def request(self,query,key,verbose=False):
        """Fetch the result page for ``query`` and return its HTML text.

        Args:
            query: raw (not URL-encoded) search string.
            key: token obtained from ``getKey``/``getCachedKey``.
            verbose: when true, also write the HTML to ``self.verbosePath``.
        """
        url = self._build_url(query, key)
        resp1 = requests.get(url,headers=self.header,proxies=self.gl_proxies,timeout=self.TIMEOUT_SECONDS)
        if verbose:
            # Explicit encoding so the dump does not depend on the platform default.
            with open(self.verbosePath,"w",encoding="utf-8") as f:
                f.write(resp1.text)
        return resp1.text

    @staticmethod
    def _find_g(element):
        """Return the ``div.g`` children of ``element``.

        Falls back to the ``div.g`` children of the first ``div.srg`` child
        when ``element`` has no direct ``div.g`` child.
        """
        div_g = element.xpath("./div[@class='g']")
        if len(div_g) == 0:
            div_srg = element.xpath("./div[@class='srg']")
            if len(div_srg) > 0:
                div_g = div_srg[0].xpath("./div[@class='g']")
        return div_g

    @staticmethod
    def _find_result(element):
        """Extract ``[title, link, summary]`` from one ``div.g`` element.

        Raises ``IndexError``/``KeyError`` when the expected sub-elements are
        missing; ``_parse`` catches and reports those.
        """
        a_el = element.xpath(".//div[@class='r']")[0].xpath("a")[0]
        title = a_el.xpath(".//h3[@class='LC20lb']")[0].text
        link = a_el.attrib["href"]
        summary = element.xpath(".//div[@class='s']")[0].xpath("string(.)")
        return [title,link,summary]

    @staticmethod
    def _parse(html_inp):
        """Parse a result page into a list of ``[title, link, summary]`` lists.

        Returns an empty list when the page has no ``div#rso`` container
        instead of failing with ``IndexError``.
        """
        html = etree.HTML(html_inp)
        # Guard in case the parser yields no tree for the given input.
        if html is None:
            print("no parsable HTML in response.")
            return []
        div_rso = html.xpath("//div[@id='rso']")
        if len(div_rso) == 0:
            print("result container 'rso' not found in response.")
            return []
        # Collect every div with class "g" below the result container.
        div_g = []
        for el in div_rso[0].xpath("./div[@class='bkWMgd']"):
            div_g.extend(GoogleSearch._find_g(el))
        # Parse each div.g; a malformed entry is reported and skipped.
        result = []
        for i in div_g:
            try:
                result.append(GoogleSearch._find_result(i))
            except Exception as e:
                print(repr(e))
        return result

    def search(self,query,verbose=False):
        """Run a search and return the parsed ``[title, link, summary]`` lists."""
        key = self.getCachedKey()
        html_res = self.request(query,key,verbose)
        return GoogleSearch._parse(html_res)

    def getResult_json(self,query,verbose=False):
        """Run a search and return ``{query: [result dicts]}`` as a JSON string.

        Each result dict has the keys ``title``, ``link``, ``summary`` and
        ``query``.
        """
        result = self.search(query,verbose)
        result_dictArr = [dict(zip(["title","link","summary","query"],i+[query])) for i in result]
        resultJSON = json.dumps({query:result_dictArr})
        return resultJSON


# WikiSearch 工具类

In [3]:
import wikipediaapi
class WikiSearch():
    """Looks up a page through a ``wikipediaapi.Wikipedia('en')`` client."""

    def __init__(self):
        self.wiki_wiki = wikipediaapi.Wikipedia('en')

    def search(self,targetWord= "BJP"):
        """Look up ``targetWord`` and return ``(title, url, categories, status)``.

        ``status`` is ``"success"`` when the page exists and ``"fail"``
        otherwise; on failure the other fields are empty and a notice is
        printed. Category names have their ``"Category:"`` prefix removed.
        """
        page = self.wiki_wiki.page(targetWord)
        if not page.exists():
            print("page not exists")
            return ("", "", [], "fail")

        page_title = page.title
        page_url = page.canonicalurl
        category_names = []
        for category in page.categories.values():
            # Keep only the text following the "Category:" marker.
            category_names.append(category.title.split("Category:")[1])
        return (page_title, page_url, category_names, "success")

    def getResult_json(self,query,verbose=False):
        """Return the lookup result for ``query`` as a JSON object string.

        ``verbose`` is accepted but not used.
        """
        title, url, categories, status = self.search(query)
        payload = {"title":title,"url":url,"categories":categories,"status":status}
        return json.dumps(payload)
        

# Flask服务器

In [4]:
from flask import Flask,request,render_template
import json

# Module-level searcher instances shared by the /gsearch and /wsearch handlers.
g_searcher=GoogleSearch()
w_searcher = WikiSearch()

# NOTE(review): static_folder is a hardcoded absolute home directory and
# static_url_path is "", so static files from that directory appear to be
# served from the URL root. Together with app.run(host="0.0.0.0") this looks
# like it exposes the directory's contents to the network — confirm this is
# intended, or point static_folder at a dedicated, configurable directory.
app = Flask(__name__,static_folder="/home/zhoutong",static_url_path="")
@app.route("/")
def index():
    """Root endpoint; responds with a fixed placeholder string."""
    body = "index html page."
    return body
# POST | form-encoded request body
@app.route("/test", methods=['POST'])
def test():
    """Echo the form field ``abc`` (or ``"default"`` when absent) as text."""
    # The route only accepts POST, so no method check is needed here.
    field_value = request.form.get("abc","default")
    return str(field_value)

# POST | JSON request body
@app.route("/test_json", methods=['POST'])
def test_json():
    """Parse the raw request body as JSON and echo its Python ``str()`` form.

    Returns HTTP 400 with a short message when the body is not valid JSON,
    instead of failing with an unhandled exception.
    """
    print(request.headers)
    data = request.get_data()
    try:
        json_result = json.loads(data)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"request body is not valid JSON: {repr(e)}")
        return "request body is not valid JSON.",400
    print(json_result)
    return str(json_result)

# GET | parse query parameters, e.g. localhost:8080?params1=abc&params2=xyz
@app.route("/test_get",methods=['GET'])
def test_get():
    """Log the request headers and echo the query parameters as text."""
    # The route only accepts GET, so no method check is needed here.
    print(request.headers)
    query_pairs = list(request.args.items())
    print(query_pairs)
    return str(query_pairs)

@app.route("/gsearch",methods=['GET'])
def g_search():
    """Run a Google search for the ``query`` parameter and return JSON.

    Responds with HTTP 400 when ``query`` is missing or empty.
    """
    q = request.args.get("query")
    if not q:
        print("input param 'query' is empty.")
        return "input param 'query' is empty.",400
    # verbose=True: the raw result page is also written to the searcher's dump file.
    result = g_searcher.getResult_json(q,True)
    print(f"[query:] {q}\n[result_len:] {len(result)}\n[result_head100:] {result[:100]}")
    # The payload echoes the caller-supplied query, so serve it as JSON rather
    # than the default text/html to keep browsers from interpreting it as markup.
    return app.response_class(result, mimetype="application/json")

@app.route("/wsearch",methods=['GET'])
def w_search():
    """Look up the ``query`` parameter on Wikipedia and return JSON.

    Responds with HTTP 400 when ``query`` is missing or empty.
    """
    q = request.args.get("query")
    if not q:
        print("input param 'query' is empty.")
        return "input param 'query' is empty.",400
    result = w_searcher.getResult_json(q)
    print(f"[query:] {q}\n[result_len:] {len(result)}\n[result_head100:] {result[:100]}")
    # The body is a JSON document; declare it as such instead of the default text/html.
    return app.response_class(result, mimetype="application/json")
    
# Start Flask's built-in server on all interfaces; this call blocks the cell
# until the server is stopped. The port is passed as an int, its documented type.
app.run(host="0.0.0.0",port=12015)



 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:12015/ (Press CTRL+C to quit)
127.0.0.1 - - [04/Apr/2019 06:41:15] "GET /wsearch?query=BJP HTTP/1.1" 200 -


[query:] BJP
[result_len:] 1279
[result_head100:] {"title": "Bharatiya Janata Party", "url": "https://en.wikipedia.org/wiki/Bharatiya_Janata_Party", "
