In [2]:
import requests
import pandas as pd

In [3]:
# Simple GET request on a demo web page.
# A timeout is passed explicitly because requests never times out on its own;
# without it an unresponsive server would block this cell forever.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
page

<Response [200]>

In [4]:
# Show the raw response body (.content is the undecoded body of the response)
raw_body = page.content
print(raw_body)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [5]:
# formatting: cut the decoded HTML into one list entry per "\n"-separated line
html_text = page.text
html_text.split("\n")

['<!DOCTYPE html>',
 '<html>',
 '    <head>',
 '        <title>A simple example page</title>',
 '    </head>',
 '    <body>',
 '        <p>Here is some simple content for this page.</p>',
 '    </body>',
 '</html>']

In [6]:
# Attempt to read from a file; create it with a default message when missing.
# The encoding is stated explicitly so reading and writing do not depend on
# the platform's default text encoding.
try:
    with open('example.txt', 'r', encoding='utf-8') as file:
        content = file.read()
    # the file is already closed here; only the text is needed for printing
    print(content)
except FileNotFoundError:
    # If the file does not exist, create it and write a default message
    with open('example.txt', 'w', encoding='utf-8') as file:
        file.write("This is a new file.")
    print("File 'example.txt' was not found and has been created.")


File 'example.txt' was not found and has been created.


In [7]:
# Read the file back. utf-8 is requested explicitly instead of relying on the
# platform default, matching the explicit-encoding read further down.
with open('example.txt', 'r', encoding='utf-8') as file:
    content = file.read()
# printing does not need the file handle, so it happens after the file is closed
print(content)

This is a new file.


In [10]:
import os

# List all entries of the current working directory (os.listdir() defaults
# to '.'). No machine-specific absolute path is used, so the cell runs
# wherever the notebook is started; sorted() makes the order deterministic,
# since os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir()):
    print(file)

.ipynb_checkpoints
example.txt
scrapping_data_with_python.ipynb


In [11]:
# Read the whole file as utf-8 text, then print it once the file is closed
with open('example.txt', mode='r', encoding='utf-8') as file:
    content = file.read()
print(content)

This is a new file.


In [13]:
import json

# (name, department, salary) for each employee record
employee_rows = [
    ("Alice Brown", "Marketing", 70000),
    ("Bob Smith", "Sales", 65000),
    ("Carol Jones", "IT", 75000),
]

# one dict per employee, keys in name / department / salary order
data = [
    {"name": name, "department": department, "salary": salary}
    for name, department, salary in employee_rows
]

# write this variable inside a json file, pretty-printed with 4-space indents
with open('output.json', mode='w') as json_file:
    json.dump(data, json_file, indent=4)

In [14]:
# then read the data back from the json file and show the parsed objects
with open('output.json', mode='r') as json_file:
    data = json.load(json_file)
print(data)

[{'name': 'Alice Brown', 'department': 'Marketing', 'salary': 70000}, {'name': 'Bob Smith', 'department': 'Sales', 'salary': 65000}, {'name': 'Carol Jones', 'department': 'IT', 'salary': 75000}]


In [15]:
import time
import requests
import csv
from io import StringIO

In [16]:
%time
def csv_reader(file_content):
    """Return a ``csv.reader`` over CSV text that is already in memory.

    Args:
        file_content: the complete CSV document as a ``str``.

    Returns:
        The ``csv.reader`` object; iterating over it gives each row as a
        list of strings.
    """
    # newline='' hands line endings to the csv module untouched, as the csv
    # documentation asks for any file object given to csv.reader. Without it
    # a document using bare "\r" line endings is seen as one single line and
    # csv.reader raises an error.
    return csv.reader(StringIO(file_content, newline=''))

# Fetch the file content from the URL
url = 'https://gist.githubusercontent.com/bdallard/d4a3e247e8a739a329fd518c0860f8a8/raw/82fb43adc5ce022797a5df21eb06dd8e755145ea/data-json.csv'
# Bound the wait with a timeout and fail on an HTTP error status, so an error
# page is never parsed as if it were CSV data.
response = requests.get(url, timeout=10)
response.raise_for_status()
file_content = response.text

tmp = 0
# perf_counter() is the clock intended for measuring short durations
start_time = time.perf_counter()
csv_data = csv_reader(file_content)
for row in csv_data:
    if not row or not row[0]:
        continue  # blank line or empty first column: nothing to add
    tmp += int(row[0][-1])  # some dummy operation: last character of the first column as an int
end_time = time.perf_counter()

print("Traditional approach took:", end_time - start_time, "seconds")

CPU times: total: 0 ns
Wall time: 0 ns
Traditional approach took: 0.015854358673095703 seconds


In [17]:
%time
def csv_reader_gen(file_content):
    """Yield the rows of CSV text one at a time.

    Args:
        file_content: the complete CSV document as a ``str``.

    Yields:
        Each row as a list of strings, in document order.
    """
    # newline='' hands line endings to the csv module untouched, as the csv
    # documentation asks for any file object given to csv.reader. Without it
    # a document using bare "\r" line endings is seen as one single line and
    # csv.reader raises an error.
    yield from csv.reader(StringIO(file_content, newline=''))

# Fetch the file content from the URL
url = "https://gist.githubusercontent.com/bdallard/d4a3e247e8a739a329fd518c0860f8a8/raw/82fb43adc5ce022797a5df21eb06dd8e755145ea/data-json.csv"
# Bound the wait with a timeout and fail on an HTTP error status, so an error
# page is never parsed as if it were CSV data.
response = requests.get(url, timeout=10)
response.raise_for_status()
file_content = response.text

tmp = 0
# perf_counter() is the clock intended for measuring short durations
start_time = time.perf_counter()
csv_gen = csv_reader_gen(file_content)
for row in csv_gen:
    if not row or not row[0]:
        continue  # blank line or empty first column: nothing to add
    tmp += int(row[0][-1])  # some dummy operation: last character of the first column as an int
end_time = time.perf_counter()

print("Generator approach took:", end_time - start_time, "seconds")

CPU times: total: 0 ns
Wall time: 0 ns
Generator approach took: 0.01596522331237793 seconds


In [18]:
# Ask httpbin which address the request came from (no proxy is used here).
# The timeout keeps the cell from hanging if the service does not answer.
response = requests.get('http://httpbin.org/ip', timeout=10)
# print(response.json()['origin'])  # would print your personal ip

In [20]:
!pip install free-proxy

Collecting free-proxy
  Downloading free_proxy-1.1.1.tar.gz (5.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: free-proxy
  Building wheel for free-proxy (setup.py): started
  Building wheel for free-proxy (setup.py): finished with status 'done'
  Created wheel for free-proxy: filename=free_proxy-1.1.1-py3-none-any.whl size=5663 sha256=b4aacfaa4fdca28babed3740ef843102156f8aa4e1377b45c654768a5f358d84
  Stored in directory: c:\users\bertr\appdata\local\pip\cache\wheels\c6\7f\3f\b764995ae2502d8642977764577198043d3b6c6738534f5ffe
Successfully built free-proxy
Installing collected packages: free-proxy
Successfully installed free-proxy-1.1.1


In [21]:
from fp.fp import FreeProxy
import requests
from bs4 import BeautifulSoup

In [23]:
# Ask FreeProxy for one proxy, restricting the search to country id 'FR',
# then display the address it returned
proxy = FreeProxy(country_id=['FR']).get()
proxy

'http://82.64.77.30:80'

In [24]:
# Repeat the same lookup three times to collect a small pool of proxies,
# then display the pool
proxy_list = [
    FreeProxy(country_id=['FR']).get()
    for _ in range(3)
]
proxy_list

['http://159.65.77.168:8585',
 'http://66.45.246.194:8888',
 'http://51.89.14.70:80']

In [26]:
# Route the request through the second proxy of the pool. requests picks the
# proxy by URL scheme, so this 'http' entry applies to http:// URLs only.
proxies = {'http': proxy_list[1]}
# A proxy may be slow or unreachable; the timeout bounds the wait.
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.json()['origin'])  # our proxy !!

66.45.246.194


In [27]:
# httpbin echoes back the headers it received, which shows what requests
# sends by default. The timeout keeps the cell from hanging indefinitely.
response = requests.get('http://httpbin.org/headers', timeout=10)
print(response.json()['headers'])

{'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, br', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-66014e54-1704ed6638ecc9b338673c66'}


In [28]:
!curl http://httpbin.org/headers

{
  "headers": {
    "Accept": "*/*", 
    "Host": "httpbin.org", 
    "User-Agent": "curl/8.1.1", 
    "X-Amzn-Trace-Id": "Root=1-66014e68-41ba3dbd18b865dc4bc96c77"
  }
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   172  100   172    0     0    660      0 --:--:-- --:--:-- --:--:--   666


In [29]:
# try a custom user-agent: the headers dict overrides the default User-Agent
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
# The timeout keeps the cell from hanging if httpbin does not answer.
response = requests.get('http://httpbin.org/headers', headers=headers, timeout=10)
print(response.json()['headers']['User-Agent'])  # Mozilla/5.0 ...

Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36


In [30]:
# a larger pool of user-agent strings, one of which is picked at random below
import random
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (iPad; CPU OS 13_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/604.1.34 (KHTML, like Gecko) Edge/90.0.818.56',
    'Mozilla/5.0 (Linux; Android 10; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Mobile Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
]

# pick one user-agent at random and send it as the only custom header
user_agent = random.choice(user_agents)
headers = {'User-Agent': user_agent}
# The timeout keeps the cell from hanging if httpbin does not answer.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
print(response.json()['headers']['User-Agent'])
# prints whichever entry was drawn, e.g. Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) ...

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/604.1.34 (KHTML, like Gecko) Edge/90.0.818.56


In [31]:
def _browser_headers(accept, accept_language, user_agent, client_hints=None):
    """Assemble one browser-like header set with a fixed key order.

    Args:
        accept: value of the ``Accept`` header.
        accept_language: value of the ``Accept-Language`` header.
        user_agent: value of the ``User-Agent`` header.
        client_hints: optional extra headers inserted right after ``Host``.

    Returns:
        A new dict holding the given values plus the entries every header
        set in this list has in common.
    """
    header_set = {
        "Accept": accept,
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": accept_language,
        "Host": "httpbin.org",
    }
    if client_hints:
        header_set.update(client_hints)
    header_set.update({
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": user_agent,
    })
    return header_set


# Five complete header sets; they differ only in Accept, Accept-Language,
# User-Agent and (first entry only) the Sec-Ch-Ua client hints.
headers_list = [
    _browser_headers(
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        accept_language="en-US,en;q=0.9",
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        client_hints={
            "Sec-Ch-Ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Google Chrome\";v=\"92\"",
            "Sec-Ch-Ua-Mobile": "?0",
        },
    ),
    _browser_headers(
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_language="en-US,en;q=0.5",
        user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    ),
    _browser_headers(
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_language="en-US,en;q=0.5",
        user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    ),
    _browser_headers(
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        accept_language="en-GB,en;q=0.5",
        user_agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
    ),
    _browser_headers(
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        accept_language="en-US,en;q=0.9",
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15",
    ),
]

# pick one complete header set at random
headers = random.choice(headers_list)
# requests selects the proxy by URL scheme and the proxies mapping only has
# an 'http' entry, so the URL must be http:// for the request to actually go
# through the proxy; an https:// URL would not match that entry.
# The timeout bounds the wait in case the proxy is slow or unreachable.
response = requests.get('http://httpbin.org/headers', headers=headers, proxies=proxies, timeout=10)
print(response.json()['headers'])

{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.5', 'Host': 'httpbin.org', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-User': '?1', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1', 'X-Amzn-Trace-Id': 'Root=1-66014ebd-7dfee7171714fb801716d755'}


In [32]:
import threading

#define a function that must be executed using threads
def thread_function(name):
    """Print a greeting that identifies the caller by ``name``."""
    greeting_parts = ("Hello from thread", name)
    print(*greeting_parts)

# Create a thread for our function and run it to completion.
# target is the callable to run; args is the tuple of positional arguments
# passed to it.
thread = threading.Thread(args=("Thread 1",), target=thread_function)
thread.start()  # begin executing thread_function in the new thread
thread.join()  # wait here until that thread has finished

Hello from thread Thread 1
