# What to do when your code is slow

1. Make sure it works first, and then make it fast.
2. Use profilers:
    - https://julien.danjou.info/blog/2015/guide-to-python-profiling-cprofile-concrete-case-carbonara
3. Use multiple cores:
    - https://pymotw.com/2/multiprocessing/basics.html

In [1]:
from multiprocessing import Pool
from time import sleep, clock, time

def slow_function(e):
    """Stand-in for an expensive task.

    Blocks for two seconds, prints a completion message for ``e`` and
    returns ``e`` multiplied by ten.
    """
    sleep(2)
    print("Finished", e)
    result = e * 10
    return result

# Baseline: process the elements one after another in this process.
print('Normal loop')
elements = range(8)
time1 = time()
processed_elements = list(map(slow_function, elements))
time2 = time()
print('Took %0.3f s' % (time2-time1))
print(processed_elements)

# Parallel: hand the same work to a pool of 8 worker processes.
print('Fast loop')
elements = range(8)
# The context manager shuts the workers down even when map() raises,
# so no worker processes are left behind. Pool start-up is kept outside
# the timed section.
with Pool(8) as pool:
    time1 = time()
    processed_elements = pool.map(slow_function, elements)
    time2 = time()
print('Took %0.3f s' % (time2-time1))
print(processed_elements)

Normal loop
Finished 0
Finished 1
Finished 2
Finished 3
Finished 4
Finished 5
Finished 6
Finished 7
Took 16.018 s
[0, 10, 20, 30, 40, 50, 60, 70]
Fast loop
Finished 1
Finished 3
Finished 0
Finished 5
Finished 4
Finished 2
Finished 7
Finished 6
Took 2.024 s
[0, 10, 20, 30, 40, 50, 60, 70]


Exercise
-----------

1. Download the contents of the websites listed in `data/websites.csv` using the `requests` module (`pip install requests`):
    - write a function that accepts a URL as its parameter and returns the content of the website
2. How fast do you think you can download 2000 websites?

In [5]:
# Imported here so that this cell also works on a fresh kernel.
import pandas as pd

# Load the table of websites used by the exercise and display it.
fpath = "data/websites.csv"
df = pd.read_csv(fpath)
df

Unnamed: 0,url,label
0,printmanagement.com,1
1,gatorcountry.com,0
2,fordesign.com,1
3,mantrasds.com,1
4,lk-cs.com,0
5,7marketingmedia.com,1
6,knightagency.com,0
7,mediawhisper.com,1
8,jjjink.com,0
9,ocm.com,1


In [4]:
import requests

In [9]:
# Keep the first five rows as a small working sample and show its URL column.
sdf = df.head(5)
sdf.loc[:, "url"]

0    printmanagement.com
1       gatorcountry.com
2          fordesign.com
3          mantrasds.com
4              lk-cs.com
Name: url, dtype: object

In [14]:
# Fetch the first site of the sample; the table holds bare domains, so the
# scheme is prepended. iloc picks the first row by position.
this_url = sdf["url"].iloc[0]
# Without a timeout requests would wait indefinitely on an unresponsive server.
r = requests.get("http://" + this_url, timeout=10)

In [16]:
# Show the response body decoded to text.
r.text

'<html><head><META HTTP-EQUIV="refresh" CONTENT="0;URL=/cgi-sys/defaultwebpage.cgi"></head><body></body></html>\n'

In [49]:
from lxml import etree, html

# Parse the downloaded page into an lxml element tree and print it
# re-serialised with indentation.
document_root = html.fromstring(r.text)
pretty_html = etree.tostring(document_root, pretty_print=True, encoding='unicode')
print(pretty_html)

<html>
  <head>
    <meta http-equiv="refresh" content="0;URL=/cgi-sys/defaultwebpage.cgi"/>
  </head>
  <body/>
</html>



In [29]:
# Turn the URL column of the sample into a plain Python list and show it.
elements = sdf["url"].tolist()
elements

['printmanagement.com',
 'gatorcountry.com',
 'fordesign.com',
 'mantrasds.com',
 'lk-cs.com']

In [57]:
# NOTE(review): get_website is defined in a later cell of this notebook; that
# cell has to be run first, otherwise this loop raises NameError on a fresh kernel.
parsed_pages = map(get_website, elements)  # lazy: each page is fetched as the loop reaches it
for position, page_root in enumerate(parsed_pages):
    print(position, page_root)

0 <Element html at 0x7f28467476d8>
1 <Element html at 0x7f2846771098>
2 <Element html at 0x7f2846773818>
3 <Element html at 0x7f2846771098>
4 <Element html at 0x7f284674ebd8>


In [65]:
# Widen the sample to the first 20 rows and preview the first ten URLs.
sdf2 = df.iloc[:20]
elements = sdf2["url"].tolist()
elements[:10]

['printmanagement.com',
 'gatorcountry.com',
 'fordesign.com',
 'mantrasds.com',
 'lk-cs.com',
 '7marketingmedia.com',
 'knightagency.com',
 'mediawhisper.com',
 'jjjink.com',
 'ocm.com']

In [73]:
from multiprocessing import Pool
from time import sleep, clock, time

def get_website_simple(url, timeout=10):
    """Fetch ``http://<url>`` and return the raw ``requests`` response.

    Parameters
    ----------
    url : str
        Address without the scheme (e.g. ``"example.com"``); ``http://`` is
        prepended before the request is made.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a single unresponsive host would block its worker forever.

    Returns
    -------
    requests.Response
        The response exactly as returned by ``requests.get``. The HTTP
        status code is not checked, so error responses are returned too.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    return requests.get("http://" + url, timeout=timeout)


def get_website(url, timeout=10):
    """Fetch ``http://<url>`` and return the page parsed into an lxml tree.

    Parameters
    ----------
    url : str
        Address without the scheme; see ``get_website_simple``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    lxml.html.HtmlElement
        Root element of the parsed document.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or times out.
    """
    response = get_website_simple(url, timeout=timeout)
    # Parse the raw bytes rather than the decoded text: lxml raises ValueError
    # for str input that carries an XML encoding declaration, and with bytes
    # it can detect the document encoding itself.
    return html.fromstring(response.content)

# Download all selected sites in parallel with a pool of 8 worker processes.
print('Fast loop')
print("running over",", ".join(elements))
# The context manager shuts the workers down even when map() raises,
# so no worker processes are left behind. Pool start-up is kept outside
# the timed section.
with Pool(8) as pool:
    time1 = time()
    processed_elements = pool.map(get_website_simple, elements)
    time2 = time()
print('Took %0.3f s' % (time2-time1))
print(processed_elements)

Fast loop
running over printmanagement.com, gatorcountry.com, fordesign.com, mantrasds.com, lk-cs.com, 7marketingmedia.com, knightagency.com, mediawhisper.com, jjjink.com, ocm.com, email-ads.biz, appcogroupusa.com, brandcontent.com, lifemarketinginc.com, fishgroup.com.hk, pancom.com, consumercenters.com, lead411.com, fhrweb.com, knowledgehubnetworks.com
Took 8.418 s
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [429]>, <Response [200]>, <Response [200]>, <Response [429]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [403]>, <Response [200]>]
