A tutorial on website scraping using the Requests and BeautifulSoup libraries.

In [1]:
import csv
import os

import pandas as pd
import requests  # Used to fetch the web pages we want to scrape.
from bs4 import BeautifulSoup

In [2]:
# Page of the ENSIAS school from which the names of the 2015 alumni are extracted.
url = 'http://ensias.um5.ac.ma/page/ing%C3%A9nieurs'

In [3]:
# Fetch the page. The timeout keeps an unresponsive server from blocking the kernel forever.
req = requests.get(url, timeout=30)
status = req.status_code  # HTTP status of the response; 200 means the request succeeded.
encoding = req.encoding  # Encoding requests uses to decode req.text.
text = BeautifulSoup(req.text, 'html.parser')  # Parse the HTML so it can be searched by tag.
print(status)

200


In [5]:
# Collect every <tr> tag of the page.
data = text.find_all('tr')
# Keep only the last one to see what a single row looks like.
# Indexing fails loudly (IndexError) when nothing matched, instead of leaving
# `item` unbound or pointing at a value left over from another cell.
item = data[-1]
item

<tr height="20"><td align="right" height="20" style="height:20px;">183</td>
<td align="left">ZRAIBI</td>
<td align="left">SALMA</td>
</tr>

In [14]:
# .contents gives the children of the tag as a list.
a = item.contents
a

[<td align="right" height="20" style="height:20px;">183</td>,
 '\n',
 <td align="left">ZRAIBI</td>,
 '\n',
 <td align="left">SALMA</td>,
 '\n']

In [23]:
print("List of students promo 2015: \n")
# Every row holds, in order: the row number, the family name and the given name.
# The header labels therefore have to be 'Last name' first, then 'First name'.
print(f"{'Last name':<20}{'First name':<15}\n")
for item in data:
    # Look at the <td> children directly rather than at fixed positions in
    # .contents, which also contains the '\n' strings between the cells.
    cells = item.find_all('td', recursive=False)
    if len(cells) < 3:
        continue  # Not a student row: it lacks the two name cells.
    print(f"{cells[1].text:<20}{cells[2].text:<15}")  # .text is the content of the tag as a string

List of students promo 2015: 

First name          Last name      

ABI                 YASSIR         
ABIDALLAH           YOUSSEF        
ACHAOUD             HANANE         
ADDAD               ABDELHALIM     
AFIF                MOHAMMED       
AIT EL HAJ          KHADIJA        
AIT EL HARRAJ       AMINE          
AIT MANSOUR         YOUSSEF        
AITOUNA             MOHAMED        
AL MAACH            WAHIBA         
ALAOUI MRANI        SOUKAINA       
ALLAOU              MARIAME        
ALMAMOUN            ZAKARIAE       
AMARA               HALA           
AMCHICH             IMANE          
AMENCHAR            NABIL          
AMMARI              KHALID         
AMNAS               ASMAE          
AOULADLAHCEN        MOHAMMED       
AOUNI               HAMZA          
AOURAGH             YOUNESSE       
AOUTIL              AHMED          
AQQA                MILOUD         
BAHADOU             HIND           
BARBARE             MOHAMMED AMINE 
BELAKHAL            HAMZA       

For the next task, we will also retrieve images. We will use the Jumia website for this.

In [24]:
# Jumia laptops listing, used for the image-scraping example.
url = 'https://www.jumia.com.ng/laptops/'

In [25]:
# Same steps as before: send the request and parse the response.
# The timeout keeps an unresponsive server from blocking the kernel forever.
req = requests.get(url, timeout=30)
status = req.status_code  # 200 means the request succeeded.
encoding = req.encoding  # Encoding requests uses to decode req.text.
text = BeautifulSoup(req.text, 'html.parser')
print(status)

200


In [70]:
# Inspecting the HTML of the page tells us which tags hold the data we want to scrape.
# Here the <a> tag with the 'link' class contains the product data: image, name, brand, etc.
# So that is what we look up with find_all. The last match is kept as a sample to study
# its structure; indexing raises IndexError if nothing matched.
data = text.find_all('a', {'class': 'link'})
item = data[-1]
item

<a class="link" href="https://www.jumia.com.ng/envy-13-8th-gen-intel-core-i7-1.8-up-to-4.0ghz-8gb256gb-ssd-1tb-external-hdd13.3-inch-wins-10-hp-mpg340803.html"> <div class="top"> </div> <div class="image-wrapper default-state"><noscript><img class="image" height="220" src="https://ng.jumia.is/ZvPUy7w_tDOu6EzuF01LXxJgs0g=/fit-in/220x220/filters:fill(white):sharpen(1,0,false):quality(100)/product/57/428872/1.jpg?5409" width="220"/></noscript></div> <h2 class="title"><span class="brand">Hp </span> <span clas

In [86]:
# Stripped text of the first <span> of each class inside the sample product tag.
brand, name, price = (
    item.find_all('span', {'class': span_class})[0].text.strip()
    for span_class in ('brand', 'name', 'price')
)
print('Mark: ', brand, '\nNom: ', name, '\nPrix: ', price)

Mark:  Hp 
Nom:  Envy 13 8th Gen Intel Core I7-1.8 Up To 4.0Ghz (8GB,256GB SSD + 1TB External HDD)13.3-inch Wins 10 
Prix:  ₦ 380,000


In [93]:
# Find the <img> tag carrying the 'lazy image' class inside the sample product,
# then read the image address from its 'data-src' attribute.
img = item.find('img', {'class': 'lazy image'})
img_src = img.get('data-src')
img_src

'https://ng.jumia.is/ZvPUy7w_tDOu6EzuF01LXxJgs0g=/fit-in/220x220/filters:fill(white):sharpen(1,0,false):quality(100)/product/57/428872/1.jpg?5409'

In [103]:
def get_img(url_img, id, out_dir="E:/"):
    """Download an image and save it as ``<out_dir>/<id>.jpg``.

    Parameters
    ----------
    url_img : str
        Address of the image to download.
    id : object
        Value converted with ``str`` and used as the file name (without the
        extension). The name shadows the ``id`` builtin; it is kept so that
        existing keyword calls continue to work.
    out_dir : str, optional
        Directory receiving the file. Defaults to ``"E:/"``, the location
        that was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    None
        Nothing is written when the response status is not 200.
    """
    # The timeout keeps a stalled server from blocking the notebook forever.
    r = requests.get(url_img, timeout=30)
    if r.status_code == 200:
        img_container = os.path.join(out_dir, str(id) + '.jpg')
        with open(img_container, 'wb') as f:
            f.write(r.content)

In [104]:
# Save the sample product picture under the file name '1.jpg'.
get_img(url_img=img_src, id=1)

![](1.jpg)

In [150]:
# Empty table with one column for the language and one for its number of articles.
DataFrame = pd.DataFrame(
    columns=['Lang', 'Num Artic'],
)

In [113]:
# Wikipedia landing page; the languages featured on it are scraped below.
url = 'https://www.wikipedia.org/'

In [114]:
# Send the request and parse the response.
# The timeout keeps an unresponsive server from blocking the kernel forever.
req = requests.get(url, timeout=30)
status = req.status_code  # 200 means the request succeeded.
encoding = req.encoding  # Encoding requests uses to decode req.text.
text = BeautifulSoup(req.text, 'html.parser')
print(status)

200


In [140]:
# Look up the <div> tags with the 'central-featured' class and keep the last match.
data = text.find_all('div', {'class': 'central-featured'})
item = data[-1]
# Child 7 of that <div> is the first language entry (the extraction loop further
# down starts at the same index); its first <bdi> tag holds the article count.
# The original `item.contents[]` had no index and was a syntax error.
item.contents[7].find_all('bdi')[0].text

'5\xa0935\xa0000+'

In [153]:
# The language entries are read from every 4th child of the featured <div>,
# starting at child 7 and ending at child 43.
for i in range(7, 44, 4):
    entry = item.contents[i]
    # First <strong> tag: language name; first <bdi> tag: number of articles.
    l = [entry.find_all('strong')[0].text.strip(), entry.find_all('bdi')[0].text.strip()]
    DataFrame.loc[i - 7] = l  # Row label is the offset from the first entry: 0, 4, 8, ...

In [154]:
# Display the table filled by the loop above (language name and number of articles).
DataFrame

Unnamed: 0,Lang,Num Artic
0,English,5 935 000+
4,Español,1 546 000+
8,日本語,1 169 000+
12,Deutsch,2 345 000+
16,Русский,1 569 000+
20,Français,2 141 000+
24,Italiano,1 554 000+
28,中文,1 074 000+
32,Português,1 014 000+
36,Polski,1 360 000+


In [20]:
def getData(text):
    """Generator yielding, one by one, every ``<td>`` tag found in ``text``.

    ``text`` must provide Beautiful Soup's ``find_all`` method (the parsed
    page built in the cells above does). ``find_all`` is the documented
    spelling and the one the rest of this notebook uses.
    """
    yield from text.find_all('td')

In [21]:
# Print only the first three <td> tags produced by the generator.
# (The original cell contained a stray `pas`, which raises NameError.)
# NOTE: the recorded output shows cells of the ENSIAS student table, so `text`
# must hold that page's parsed HTML when this cell is run.
for i, item in enumerate(getData(text)):
    if i >= 3:
        break
    print(item)

<td align="right" height="20" style="height:20px;width:87px;">1</td>
<td align="left" style="width:157px;"><span data-scayt_word="ABI" data-scaytid="1">ABI</span></td>
<td align="left" style="width:143px;"><span data-scayt_word="YASSIR" data-scaytid="2">YASSIR</span></td>
