In [1]:
from selenium import webdriver
from PIL import Image

from pyvirtualdisplay import Display
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd

from io import BytesIO

from selenium.webdriver.common.by import By

from pyvirtualdisplay import Display

from tqdm import tqdm



In [2]:
def highlight(driver, element, effect_time, color, border):
    """Highlights (blinks) a Selenium Webdriver element"""
    def apply_style(s):
        driver.execute_script("arguments[0].setAttribute('style', arguments[1]);",
                              element, s)
    original_style = element.get_attribute('style')
    apply_style(f"border: {border}px solid {color};")
    time.sleep(effect_time)

class Screenshotter():
    def __init__(self, save_dir):
        display = Display(visible=0, size=(1920, 1080))  
        display.start()

        self.service = Service('/home/veselovs/chromedriver')
        self.service.start()

        self.options = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(options = self.options)
        timeout = 25
        self.driver.set_page_load_timeout(timeout)
        
        self.save_dir = save_dir


    def extract_element(self, website_link, wiki):
        try:
            self.driver.get(website_link)
            time.sleep(3)
            element = self.driver.find_elements(By.XPATH, f'//a[contains(@href,"{wiki}")]')[0]
            parent = element.find_element(By.XPATH, "./..")
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'nearest'})", element)
            highlight(self.driver, element,5,"red",3)
            return element
        except:
            print("Didn't find element")
            parent = None
            return parent

    def take_screenshot(self,element, output_name):
        try:
            x = element.screenshot_as_png

            im = Image.open(BytesIO(x))
            im.save(self.save_dir + f'{output_name}.png')
#             self.driver.close()
            print("Saved image {}".format(output_name))
        except WebDriverException:
            print("Not wide enough")
            
        
    def iterate_over(self, url_links, wiki_links, output_names, element_screenshot = False):
        
        for url, wiki, output in tqdm(zip(url_links,wiki_links,output_names)):
            print(output)
            parent = self.extract_element(url, wiki)
            if parent != None:
                if element_screenshot == True:
                    self.take_screenshot(parent,output)
                else:
                    self.driver.save_screenshot(self.save_dir + f"{output}.png")
        self.driver.quit()

### Choose the subset of mentions we want to get links for. 

In [10]:
web_content = pd.read_csv("/scratch/venia/web2wiki/data/random_sample_html.csv").drop(columns = "Unnamed: 0")

In [None]:
web_content = pd.read_csv("/scratch/venia/web2wiki/data/1st_order.csv").drop(columns = "Unnamed: 0")

In [17]:
web_content["url"].iloc[0]

'http://simon.butcher.name/archives/2007/06/10/Coming-to-America'

In [5]:
web_contentt = web_content

In [6]:
url_links = web_contentt["url"].values
wiki_links = web_contentt["wiki_url"].values
names = web_contentt.index.values

In [266]:
sc = Screenshotter()
sc.iterate_over(url_links, wiki_links,names)

# Add text on image

In [367]:
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw, ImageOps 

IMG_DIR = "/scratch/venia/web2wiki/data/screenshots/1000_sample/"

def process_image(im_file, row, save_dir):
    image = Image.open(IMG_DIR + im_file + ".png")
    image = ImageOps.expand(image, border=50, fill=(255,255,255))

    draw = ImageDraw.Draw(image) 

    # specified font size
    font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 14) 

    text = f'url: {row["url"][0:80]}            \n parent 1: {row["a1"]}             parent 2: {row["a2"]}\
                     Wiki Title: {row["title"]}             Page rank: {row["#pr_pos"]}             Wiki links on URL: {row["num_wiki_on_url"]}'

    # drawing text size
    draw.text((0,0), text, font = font, align ="left",fill=(0,0,0)) 

    image.save(save_dir + im_file + "_clean.png")

In [341]:
import os
for (row_id, row) in web_contentt.iterrows():
    row_id = str(row_id)
    if os.path.exists(IMG_DIR + row_id + ".png"):
        process_image(row_id, row, IMG_DIR)


In [368]:
import os
IMG_DIR_no_wiki = "/scratch/venia/web2wiki/data/screenshots/1000_sample_no_wiki/"

for (row_id, row) in web_contentt.iterrows():
    row_id = str(row_id)
    if os.path.exists(IMG_DIR + row_id + ".png"):
        if not (("wiki" in row["url"]) | ("pedia" in row["url"])):
            process_image(row_id, row, IMG_DIR_no_wiki)


# Image categories

In [731]:
df = pd.read_parquet("/scratch/venia/web2wiki/data/web_content/iterative_coding_sample/tag_info.parquet")

In [732]:
df = df[df["wiki_links"] != "[]"]
for k in df.columns[1:]:
    df[k] = df[k].apply(lambda x: x[1:-1].split(", "))

cols = list(df.columns[1:])
df.drop_duplicates(subset = "url")


df.reset_index(drop = True,inplace =True)
df.index=df["url"]
df = df.drop(columns = "url")

In [761]:
df2 = pd.read_csv("/scratch/venia/web2wiki/data/tag_info_clean2.csv")

In [755]:
df2 = df2.sample(1000)

In [762]:
url_links = df2["url"]
wiki_links = df2["wiki_links"]
names = df2.index

In [None]:
sc = Screenshotter(save_dir = "/scratch/venia/web2wiki/data/screenshots/1000_sample2/")
sc.iterate_over(url_links, wiki_links,names)

0it [00:00, ?it/s]

0


1it [00:11, 11.26s/it]

1


2it [00:21, 10.69s/it]

2


3it [00:30, 10.07s/it]

3


4it [00:41, 10.27s/it]

4


5it [00:48,  9.07s/it]

Didn't find element
5


6it [00:57,  9.22s/it]

6


In [767]:
df2["0th_order"] = df2[["is_tag_footer","is_tag_header", "is_class_footer","is_class_header","is_class_sidebar"]].sum(axis=1)
df2["1st_order_evidence"] = df2[["is_tag_sup","is_tag_cite"]].sum(axis=1)

In [770]:
df2.to_csv("/scratch/venia/web2wiki/data/sample_compare.csv",index=False)

In [None]:
# CALCULATE THE DISTRIBUTION OVER 0TH ORDER 1ST ORDER 2ND ORDER

In [684]:
df2["is_more_than_40"] = df2["url_wiki_count"].apply(lambda x: 1 if x > 39 else 0)
df2["is_less_than_5"] = df2["url_wiki_count"].apply(lambda x: 1 if x < 5 else 0)


In [714]:
columns = ["is_blog","is_wiki","is_more_than_40","is_less_than_5","is_footer","is_header","is_tag_sup","is_class_comment","is_class_sidebar"]

df2["is_footer"] = df2.apply(lambda x: 1 if (x["is_tag_footer"] == 1) or (x["is_class_footer"] == 1) else 0,axis=1)
df2["is_header"] = df2.apply(lambda x: 1 if (x["is_tag_header"] == 1) or (x["is_class_header"] == 1) else 0,axis=1)


In [None]:

#### folders
# wiki or pedia in url
# is blog
# more than 30 mentions 
# less than 10 mentions
# is_tag or class footer
# is tag_sup
# is tag_cite
# is class_sidebar
# is class_comment




In [715]:
for col in columns[4:]:
    print(col)
    if not os.path.exists(f"/scratch/venia/web2wiki/data/screenshots/{col}/"):
        os.mkdir(f"/scratch/venia/web2wiki/data/screenshots/{col}/")
    sc = Screenshotter(save_dir = f"/scratch/venia/web2wiki/data/screenshots/{col}/")
    temp = df2[df2[col] == 1]
    temp = temp.drop_duplicates(subset = "url")
    if len(temp)> 100:
        temp = temp.sample(100)
    url_links = temp["url"].values
    wiki_links = temp["wiki_links"].values
    sc.iterate_over(url_links, wiki_links,names)
    

is_footer


0it [00:00, ?it/s]

0


1it [00:12, 12.81s/it]

1


2it [00:18,  8.64s/it]

Didn't find element
2


3it [00:30, 10.11s/it]

Didn't find element
3


4it [00:41, 10.51s/it]

4


5it [00:53, 10.94s/it]

5


6it [01:07, 11.99s/it]

6


7it [01:11,  9.39s/it]

Didn't find element
7


8it [01:21,  9.55s/it]

8


9it [01:24,  7.59s/it]

Didn't find element
9


10it [01:35,  8.63s/it]

10


11it [01:45,  8.99s/it]

11


12it [01:58, 10.29s/it]

12


13it [02:07, 10.04s/it]

13


14it [02:08,  7.10s/it]

Didn't find element
14


15it [02:16,  7.45s/it]

Didn't find element
15


16it [02:26,  8.30s/it]

16


17it [02:37,  8.94s/it]

17


18it [02:47,  9.44s/it]

18


19it [02:57,  9.65s/it]

19


20it [03:07,  9.58s/it]

20


21it [03:16,  9.46s/it]

21


22it [03:25,  9.34s/it]

22


23it [03:36,  9.89s/it]

23


24it [03:46,  9.93s/it]

24


25it [04:00, 11.06s/it]

25


26it [04:04,  9.08s/it]

Didn't find element
26


27it [04:11,  8.17s/it]

Didn't find element
27


28it [04:14,  6.88s/it]

Didn't find element
28


29it [04:25,  8.04s/it]

29


30it [04:35,  8.54s/it]

30


31it [04:46,  9.40s/it]

31


32it [04:50,  7.80s/it]

Didn't find element
32


33it [04:59,  8.08s/it]

33


34it [05:24, 13.16s/it]

Didn't find element
34


35it [05:28, 10.26s/it]

Didn't find element
35


36it [05:37, 10.13s/it]

36


37it [05:41,  8.06s/it]

Didn't find element
37


38it [05:56, 10.13s/it]

38


39it [05:59,  8.06s/it]

Didn't find element
39


40it [06:09,  8.61s/it]

40


41it [06:15,  7.85s/it]

Didn't find element
41


42it [06:21,  7.24s/it]

Didn't find element
42


43it [06:31,  8.05s/it]

43


44it [06:42,  9.01s/it]

44


45it [06:58, 11.22s/it]

45


46it [07:01,  8.82s/it]

Didn't find element
46


47it [07:06,  7.59s/it]

Didn't find element
47


48it [07:17,  8.69s/it]

48


49it [07:22,  7.55s/it]

Didn't find element
49


50it [07:31,  7.88s/it]

50


51it [07:41,  8.49s/it]

51


52it [08:01, 11.96s/it]

52


53it [08:01,  8.46s/it]

Didn't find element
53


54it [08:16, 10.37s/it]

54


55it [08:26, 10.16s/it]

55


56it [08:36, 10.20s/it]

56


57it [08:41,  8.64s/it]

Didn't find element
57


58it [08:51,  9.06s/it]

Didn't find element
58


59it [09:02,  9.60s/it]

59


60it [09:13, 10.06s/it]

60


61it [09:24, 10.27s/it]

61


62it [09:28,  8.55s/it]

Didn't find element
62


63it [09:40,  9.58s/it]

63


64it [09:43,  7.67s/it]

Didn't find element
64


65it [09:44,  5.45s/it]

Didn't find element
65


66it [09:55,  7.12s/it]

66


67it [10:05,  8.14s/it]

67


68it [10:24, 11.26s/it]

68


69it [10:30,  9.85s/it]

Didn't find element
69


70it [10:41,  9.95s/it]

70


71it [10:51, 10.15s/it]

71


72it [11:03, 10.57s/it]

72


73it [11:13, 10.34s/it]

73


74it [11:17,  8.54s/it]

Didn't find element
74


75it [11:20,  7.01s/it]

Didn't find element
75


76it [11:34,  9.08s/it]

76


77it [11:44,  9.23s/it]

77


78it [11:53,  9.36s/it]

78


79it [11:58,  7.80s/it]

Didn't find element
79


80it [12:15, 10.52s/it]

80


81it [12:27, 11.10s/it]

81


82it [12:39, 11.49s/it]

82


83it [12:49, 11.00s/it]

83


84it [12:49,  7.77s/it]

Didn't find element
84


85it [13:01,  9.01s/it]

85


86it [13:09,  8.54s/it]

Didn't find element
86


87it [13:12,  6.92s/it]

Didn't find element
87


88it [13:16,  6.14s/it]

Didn't find element
88


89it [13:30,  8.36s/it]

89


90it [13:35,  7.42s/it]

Didn't find element
90


91it [13:38,  6.19s/it]

Didn't find element
91


92it [13:48,  7.16s/it]

92


93it [13:57,  7.79s/it]

93


94it [14:08,  8.65s/it]

94


95it [14:11,  7.01s/it]

Didn't find element
95


96it [14:15,  6.26s/it]

Didn't find element
96


97it [14:26,  7.69s/it]

97


98it [14:38,  8.84s/it]

98


99it [14:50,  9.93s/it]

Didn't find element
99


100it [15:02,  9.03s/it]


is_header


0it [00:00, ?it/s]

0


1it [00:15, 15.30s/it]

1


2it [00:32, 16.17s/it]

2


3it [00:45, 14.86s/it]

3


4it [00:52, 11.92s/it]

Didn't find element
4


5it [01:10, 13.83s/it]

5


6it [01:35, 17.64s/it]

Didn't find element
6


7it [01:38, 13.09s/it]

Didn't find element
7


8it [01:44, 10.75s/it]

Didn't find element
8


9it [02:12, 16.02s/it]

9


10it [02:23, 14.44s/it]

10


11it [02:33, 13.29s/it]

11


12it [02:37, 10.40s/it]

Didn't find element
12


13it [02:41,  8.35s/it]

Didn't find element
13


14it [02:52,  9.35s/it]

14


15it [02:56,  7.70s/it]

Didn't find element
15


16it [03:03,  7.49s/it]

Didn't find element
16


17it [03:12,  8.01s/it]

17


18it [03:28, 10.34s/it]

18


19it [03:40, 10.74s/it]

19


20it [03:51, 10.85s/it]

20


21it [04:03, 11.13s/it]

21


22it [04:13, 11.02s/it]

22


23it [04:20,  9.62s/it]

Didn't find element
23


24it [04:30,  9.81s/it]

24


25it [04:42, 10.30s/it]

25


27it [04:46, 10.61s/it]

Didn't find element
26
Didn't find element
is_tag_sup



0it [00:00, ?it/s]

0


1it [00:13, 13.20s/it]

1


2it [00:23, 11.41s/it]

2


3it [00:36, 12.06s/it]

3


4it [00:46, 11.34s/it]

4


5it [00:56, 10.95s/it]

5


6it [01:06, 10.48s/it]

6


7it [01:15, 10.06s/it]

7


8it [01:25, 10.08s/it]

8


9it [01:38, 10.91s/it]

9


10it [01:48, 10.77s/it]

10


11it [02:01, 11.52s/it]

11


12it [02:13, 11.59s/it]

12


13it [02:23, 11.05s/it]

13


14it [02:33, 10.61s/it]

14


15it [02:58, 14.96s/it]

Didn't find element
15


16it [03:15, 15.68s/it]

16


17it [03:29, 15.10s/it]

17


18it [03:47, 15.90s/it]

18


19it [03:57, 14.29s/it]

19


20it [04:00, 10.94s/it]

Didn't find element
20


21it [04:05,  9.17s/it]

Didn't find element
21


22it [04:15, 11.63s/it]


is_tag_sup


0it [00:00, ?it/s]

0


1it [00:12, 12.02s/it]

1


2it [00:21, 10.26s/it]

2


3it [00:33, 11.18s/it]

3


4it [00:43, 10.81s/it]

4


5it [00:53, 10.54s/it]

5


6it [01:03, 10.16s/it]

6


7it [01:12,  9.79s/it]

7


8it [01:21,  9.81s/it]

8


9it [01:34, 10.73s/it]

9


10it [01:45, 10.62s/it]

10


11it [01:58, 11.40s/it]

11


12it [02:09, 11.48s/it]

12


13it [02:19, 10.94s/it]

13


14it [02:29, 10.56s/it]

14


15it [02:54, 14.92s/it]

Didn't find element
15


16it [03:10, 15.23s/it]

16


17it [03:24, 14.81s/it]

17


18it [03:35, 13.92s/it]

18


19it [03:45, 12.74s/it]

19


20it [03:49,  9.85s/it]

Didn't find element
20


21it [03:54,  8.43s/it]

Didn't find element
21


22it [04:04, 11.09s/it]


is_class_comment


0it [00:00, ?it/s]

0


1it [00:10, 10.43s/it]

1


2it [00:20, 10.32s/it]

2


3it [00:29,  9.86s/it]

3


4it [00:44, 11.51s/it]

4


5it [01:09, 16.38s/it]

Didn't find element
5


6it [01:15, 12.92s/it]

Didn't find element
6


7it [01:40, 16.88s/it]

Didn't find element
7


8it [01:54, 15.93s/it]

8


9it [02:19, 18.78s/it]

Didn't find element
9


10it [02:24, 14.76s/it]

Didn't find element
10


11it [1:12:28, 1296.62s/it]

Didn't find element
11


12it [1:12:39, 905.64s/it] 

12


13it [1:12:46, 633.54s/it]

Didn't find element
13


14it [1:12:58, 445.77s/it]

14


15it [1:13:08, 314.36s/it]

15


16it [1:13:15, 221.91s/it]

Didn't find element
16


17it [1:13:21, 156.81s/it]

Didn't find element
17


18it [1:13:32, 113.14s/it]

18


19it [1:13:50, 84.52s/it] 

19


20it [1:13:59, 61.80s/it]

20


21it [1:14:17, 48.77s/it]

21


22it [1:14:22, 35.55s/it]

Didn't find element
22


23it [1:14:34, 28.39s/it]

23


24it [1:14:58, 27.03s/it]

24


25it [1:15:08, 22.19s/it]

25


26it [1:15:09, 15.60s/it]

Didn't find element
26


27it [1:15:12, 12.04s/it]

Didn't find element
27


28it [1:15:30, 13.57s/it]

28


29it [1:15:34, 10.94s/it]

Didn't find element
29


30it [1:15:48, 11.63s/it]

30


31it [1:15:59, 11.55s/it]

31


32it [1:16:03,  9.17s/it]

Didn't find element
32


33it [1:16:14,  9.75s/it]

33


34it [1:16:18,  8.07s/it]

Didn't find element
34


35it [1:16:29,  9.09s/it]

35


36it [1:16:57, 14.67s/it]

36


37it [1:17:32, 20.67s/it]

37


38it [1:17:45, 18.54s/it]

38


39it [1:17:46, 13.11s/it]

Didn't find element
39


40it [1:17:56, 12.26s/it]

40


41it [1:18:05, 11.34s/it]

41


42it [1:18:15, 11.00s/it]

42


43it [1:18:27, 11.14s/it]

43


44it [1:18:40, 11.66s/it]

44


45it [1:18:50, 11.11s/it]

45


46it [1:18:59, 10.47s/it]

46


47it [1:19:10, 10.70s/it]

47


48it [1:19:25, 12.08s/it]

48


49it [1:19:37, 12.10s/it]

49


50it [1:19:47, 11.54s/it]

50


51it [1:20:00, 11.77s/it]

51


52it [1:20:10, 11.40s/it]

52


53it [1:20:22, 11.65s/it]

53


54it [1:20:34, 11.48s/it]

54


55it [1:20:49, 12.64s/it]

55


56it [1:21:00, 12.18s/it]

56


57it [1:21:11, 11.76s/it]

57


58it [1:21:23, 11.93s/it]

58


67it [1:23:26, 13.91s/it]

67


68it [1:23:36, 12.59s/it]

68


69it [1:23:47, 12.05s/it]

69


70it [1:24:00, 12.49s/it]

70


71it [1:24:11, 11.86s/it]

71


72it [1:24:22, 11.81s/it]

72


73it [1:24:33, 11.43s/it]

73


74it [1:24:43, 10.97s/it]

74


75it [1:24:53, 10.76s/it]

75


76it [1:25:03, 10.46s/it]

76


77it [1:25:12, 10.20s/it]

77


78it [1:25:22, 10.04s/it]

78


79it [1:25:23,  7.25s/it]

Didn't find element
79


80it [1:25:33,  8.09s/it]

80


81it [1:25:44,  9.00s/it]

81


82it [1:25:54,  9.17s/it]

82


83it [1:25:59,  8.21s/it]

Didn't find element
83


84it [1:26:12,  9.50s/it]

84


85it [1:26:22,  9.75s/it]

85


86it [1:26:41, 12.30s/it]

86


87it [1:26:54, 12.66s/it]

87


88it [1:27:06, 12.52s/it]

88


89it [1:27:16, 11.59s/it]

89


90it [1:27:16,  8.30s/it]

Didn't find element
90


91it [1:27:33, 10.82s/it]

91


92it [1:27:43, 10.66s/it]

92


93it [1:27:53, 10.42s/it]

93


94it [1:28:18, 14.80s/it]

Didn't find element
94


95it [1:28:30, 13.94s/it]

95


96it [1:28:50, 15.67s/it]

96


97it [1:29:00, 14.14s/it]

97


98it [1:29:10, 12.88s/it]

98


99it [1:29:22, 12.41s/it]

99


100it [1:29:33, 53.74s/it]


is_class_sidebar


0it [00:00, ?it/s]

0


1it [00:10, 10.50s/it]

1


2it [00:20, 10.41s/it]

2


3it [00:34, 12.06s/it]

3


4it [00:59, 17.18s/it]

Didn't find element
4


5it [01:13, 15.73s/it]

5


6it [01:20, 12.92s/it]

Didn't find element
6


7it [01:40, 15.30s/it]

7


8it [01:41, 10.53s/it]

Didn't find element
8


9it [01:51, 10.45s/it]

9


10it [02:00, 10.20s/it]

10


11it [02:14, 11.14s/it]

11


12it [02:24, 10.98s/it]

12


13it [02:35, 10.89s/it]

13


14it [02:48, 11.59s/it]

14


15it [02:55, 10.17s/it]

Didn't find element
15


16it [03:05, 10.19s/it]

16


17it [03:15, 10.17s/it]

17


18it [03:25,  9.90s/it]

18


19it [03:38, 10.84s/it]

19


20it [03:50, 11.27s/it]

20


21it [04:05, 12.39s/it]

21


22it [04:18, 12.60s/it]

22


23it [04:27, 11.60s/it]

23


24it [04:36, 10.71s/it]

24


25it [04:45, 10.09s/it]

25


26it [04:55, 10.02s/it]

26


27it [05:04, 10.00s/it]

27


28it [05:14,  9.99s/it]

Didn't find element
28


29it [05:25, 10.02s/it]

29


30it [05:35, 10.30s/it]

30


31it [05:39,  8.32s/it]

Didn't find element
31


32it [05:55, 10.59s/it]

32


33it [06:08, 11.23s/it]

33


34it [06:19, 11.29s/it]

34


35it [06:30, 11.06s/it]

35


36it [06:41, 11.14s/it]

36


37it [06:56, 12.14s/it]

37


38it [07:21, 16.00s/it]

Didn't find element
38


39it [07:31, 14.32s/it]

39


40it [07:41, 12.96s/it]

40


41it [07:50, 11.94s/it]

41


42it [08:00, 11.12s/it]

42


43it [08:14, 11.99s/it]

43


44it [08:26, 12.04s/it]

44


45it [08:36, 11.50s/it]

45


46it [08:46, 10.95s/it]

46


47it [08:58, 11.47s/it]

Didn't find element
47


48it [09:10, 11.55s/it]

48


49it [09:20, 10.98s/it]

49


50it [09:29, 10.53s/it]

50


51it [09:40, 10.60s/it]

51


52it [09:50, 10.30s/it]

52


53it [09:59, 10.09s/it]

53


54it [10:09,  9.98s/it]

54


55it [10:13,  8.28s/it]

Didn't find element
55


56it [10:18,  7.13s/it]

Didn't find element
56


57it [10:23,  6.53s/it]

Didn't find element
57


58it [10:32,  7.48s/it]

58


59it [10:42,  8.03s/it]

59


60it [10:56,  9.81s/it]

60


61it [11:07, 10.39s/it]

61


62it [11:18, 10.30s/it]

62


63it [11:24,  9.18s/it]

Didn't find element
63


64it [11:37, 10.19s/it]

64


65it [11:46,  9.81s/it]

Didn't find element
65


66it [12:03, 11.96s/it]

66


67it [12:19, 13.42s/it]

Didn't find element
67


68it [12:32, 13.07s/it]

68


69it [12:44, 12.82s/it]

69


70it [12:53, 11.84s/it]

70


71it [13:00, 10.11s/it]

Didn't find element
71


72it [13:10, 10.36s/it]

72


73it [13:22, 10.58s/it]

73


74it [13:26,  8.67s/it]

Didn't find element
74


75it [13:39, 10.06s/it]

75


76it [13:45,  8.77s/it]

Didn't find element
76


77it [13:55,  9.26s/it]

77


78it [13:59,  7.74s/it]

Didn't find element
78


79it [14:11,  8.74s/it]

79


80it [14:20,  9.10s/it]

80


82it [14:30,  6.48s/it]

81
Didn't find element
82


83it [14:39, 10.60s/it]


In [None]:
def process_image2(im_file, row, save_dir):
    IMG_DIR = "/scratch/venia/web2wiki/data/screenshots/"
    image = Image.open(save_dir+im_file + ".png")
    image = ImageOps.expand(image, border=50, fill=(255,255,255))

    draw = ImageDraw.Draw(image) 

    # specified font size
    font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 14) 

    text = f'url: {row["url"][0:80]}            \n Wiki link: {row["wiki_links"]}             is_wiki: {row["is_wiki"]}\
                     is_blog: {row["is_blog"]}         num_pages: {row["url_wiki_count"]}'

    # drawing text size
    draw.text((0,0), text, font = font, align ="left",fill=(0,0,0)) 

    image.save(save_dir + im_file + "_clean.png")