### Exercice 2: _Extraction de Données en Ligne_
**Objectif:** Écrire des scripts d'extraction de données depuis différentes plateformes.

In [7]:
from urllib.request import urlopen

import pandas as pd
import praw
import wikipediaapi
from bs4 import BeautifulSoup
from dotenv import dotenv_values
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googlesearch import search
from requests import get

# Settings (API keys and tokens used by the cells below) read from the local .env file.
config = dotenv_values(dotenv_path=".env")

1. **Amazon:** Web scraping avec BeautifulSoup: Produits, prix, avis

In [2]:
# Product page to scrape.
url = "https://www.amazon.com/dp/B0CP22DQQS?th=1"

# Parse inside a `with` block so the HTTP response is closed once it has been
# read, and bound the wait with a timeout so a stalled connection cannot hang
# the notebook.
with urlopen(url, timeout=30) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Locate the three elements first: when one of them is absent from the page,
# `find` returns None and calling `.get_text()` on it would fail with an
# opaque AttributeError. Report which element is missing instead.
title_tag = soup.find(id="productTitle")
price_tag = soup.find('span', {'class': 'a-price'})
rating_tag = soup.find(class_="a-icon-alt")

missing = [
    label
    for label, tag in (
        ("productTitle", title_tag),
        ("a-price", price_tag),
        ("a-icon-alt", rating_tag),
    )
    if tag is None
]
if missing:
    raise RuntimeError(f"Éléments introuvables dans la page: {', '.join(missing)}")

title = title_tag.get_text(strip=True)

# Keep the text that sits between the first '$' and the next one (if any).
price_parts = price_tag.get_text(strip=True).split('$')
if len(price_parts) < 2:
    raise RuntimeError("Prix introuvable: aucun symbole '$' dans le texte du prix.")
price = price_parts[1]

rating = rating_tag.get_text()
print(f"Produit: {title}\nPrix: {price}$\nAvis: {rating}")

Produit: Marsail Ergonomic Office Chair: Office Computer Desk Chair with High Back Mesh and Adjustable Lumbar Support Rolling Work Swivel Task Chairs with Wheel 3D Armrests and Headrest
Prix: 105.58$
Avis: 4.4 out of 5 stars


 2. **Twitter:** Utilisation de l'API Twitter v2: Tweets, likes, retweets

In [4]:
# Base URL of the Twitter/X API v2.
x_url = "https://api.twitter.com/2"

X_BEARER_TOKEN = config.get("X_BEARER_TOKEN")

# Fail fast: check the token before it is used, so that an
# "Authorization: Bearer None" header is never built.
if not X_BEARER_TOKEN:
    raise Exception("Bearer token non trouvé. Merci de fournir un valeur à la variable d'environnement X_BEARER_TOKEN.")

# Authentication header sent with every request to the API.
headers = {
    'Authorization': f"Bearer {X_BEARER_TOKEN}"
}

# Tweet fields requested from the API for each returned tweet.
params = {
    'tweet.fields': 'created_at,public_metrics',
}

def get_tweets(username: str):
    """Return the recent tweets found by the search query ``from:<username>``.

    Calls the ``tweets/search/recent`` endpoint under ``x_url`` using the
    module-level ``headers`` and ``params``.

    Args:
        username: Account name inserted into the ``from:`` search operator.

    Returns:
        The value stored under the ``data`` key of the JSON response, or
        ``None`` when the response has no such key.

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    # Send the search query through `params` so that requests URL-encodes it,
    # and build a new dict so the shared `params` is not mutated.
    query_params = {**params, 'query': f"from:{username}"}
    response = get(
        f"{x_url}/tweets/search/recent",
        headers=headers,
        params=query_params,
        timeout=30,  # never wait forever on a stalled connection
    )
    if response.status_code != 200:
        raise Exception(f"Erreur {response.status_code}: {response.text}")
    return response.json().get("data")

# Load the tweets returned for one account into a DataFrame and preview it.
tweets = get_tweets("DylanCalluy")
df = pd.DataFrame(tweets)
print(df.head(5))

Exception: Erreur 429: {"title":"Too Many Requests","detail":"Too Many Requests","type":"about:blank","status":429}

 3. **Instagram:** API Instagram Graph: Captions, likes, images *(TODO : extraction non implémentée dans ce notebook)*

4. **YouTube:** API YouTube Data: Titres, vues, commentaires

In [9]:
# The YouTube client is only built when an API key is present in the
# configuration; otherwise the cell stops with an explicit error.
YOUTUBE_API_KEY = config.get("YOUTUBE_API_KEY")

if YOUTUBE_API_KEY:
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
else:
    raise Exception("API Key non trouvé. Merci de fournir un valeur à la variable d'environnement YOUTUBE_API_KEY.")

def get_videos(channel_id: str):
    """Collect title, view count and comments for the videos of a channel.

    Runs a search on the channel (at most 10 results), keeps the results whose
    kind is a video, then requests each video's details and up to 10 comment
    threads through the module-level ``youtube`` client.

    Args:
        channel_id: Identifier of the YouTube channel to search.

    Returns:
        A list of dicts with the keys ``title``, ``views`` and ``comments``.
        ``views`` is ``None`` when the details carry no ``viewCount``;
        ``comments`` holds the top-level comment texts and is empty when
        comments are disabled on the video. Videos for which the details
        request returns no item are left out.

    Raises:
        HttpError: For any API failure other than disabled comments.
    """
    search_response = youtube.search().list(
        part="snippet", channelId=channel_id, maxResults=10
    ).execute()

    videos = []
    for item in search_response['items']:
        # Search results whose kind is not a video are skipped.
        if item['id']['kind'] != 'youtube#video':
            continue
        video_id = item['id']['videoId']

        video_details = youtube.videos().list(part="snippet,statistics", id=video_id).execute()
        if not video_details['items']:
            # No details came back for this id: nothing to report for it.
            continue
        video_info = video_details['items'][0]

        try:
            comments_response = youtube.commentThreads().list(
                part="snippet", videoId=video_id, maxResults=10
            ).execute()
        except HttpError as error:
            # A video with comments turned off makes this request fail with
            # the "commentsDisabled" reason. Treat that as "no comments" rather
            # than aborting the whole extraction; re-raise anything else.
            if b"commentsDisabled" not in (error.content or b""):
                raise
            comment_items = []
        else:
            comment_items = comments_response['items']

        videos.append({
            'title': video_info['snippet']['title'],
            # The view count may be absent from the statistics.
            'views': video_info.get('statistics', {}).get('viewCount'),
            'comments': [
                thread['snippet']['topLevelComment']['snippet']['textDisplay']
                for thread in comment_items
            ],
        })
    return videos

# Extract the videos of one channel and preview them as a table.
channel_id = "UCWedHS9qKebauVIK2J7383g"
videos = get_videos(channel_id)
df = pd.DataFrame(data=videos)
print(df.head(5))

                                               title    views  \
0  Vidéo complète sur la chaîne ! Backstage de no...    13627   
1  Vidéo complète sur la chaîne ! Q&A exclusive :...  1629471   
2            Le rôle abject d’un flic sur le Darkweb   825384   
3  La trouvaille scandaleuse d'un hacker sur un d...   383540   
4  La trouvaille scandaleuse d'un physicien sur u...   607457   

                                            comments  
0  [Le futur hugo décrypte ? 😉😉, il a un appareil...  
1  [Formé à l’esprit critique mais bien encadré p...  
2  [Christophe coulons, Il a décroché le job car ...  
3  [Depuis quand tu prends autant de drogue d’un ...  
4  [Super vidéo comme d&#39;habitude mais je me p...  


 5. **Google Search:** Scraper avec googlesearch (Résultats de recherche)

In [2]:
def google_search(query, num_results=10):
    """Run a Google search and return its results as a list.

    Args:
        query: Search expression passed to ``search``.
        num_results: Value forwarded to ``search`` as ``num_results``.

    Returns:
        A list holding every item yielded by ``search``, in order.
    """
    hits = search(query, num_results=num_results)
    return list(hits)

# Search expression combining the site:, filetype: and intext: operators.
query = 'Data Science Course site:*.edu filetype:pdf intext:"Book"'

# Collect the results, then preview them as a one-column table.
results = google_search(query=query)
df = pd.DataFrame(data=results)
print(df.head(5))

                                                   0
0            https://www.cs.cornell.edu/jeh/book.pdf
1  https://www.webpages.uidaho.edu/~stevel/517/Th...
2  https://cims.nyu.edu/~cfgranda/pages/stuff/pro...
3  https://digital.library.ncat.edu/cgi/viewconte...
4  https://conocer.cide.edu/default.aspx/libweb/4...


6. **Reddit:** API Reddit (PRAW): Posts, votes, commentaires

In [3]:
# Credentials for the Reddit client, read from the .env configuration
# (a missing key yields None).
reddit_credentials = {
    "client_id": config.get("REDDIT_CLIENT_ID"),
    "client_secret": config.get("REDDIT_CLIENT_SECRET"),
    "user_agent": config.get("REDDIT_USER_AGENT"),
}
reddit = praw.Reddit(**reddit_credentials)

def get_reddit_posts(subreddit_name, limit=10):
    """Return the hot posts of a subreddit with their first comments.

    Uses the module-level ``reddit`` client.

    Args:
        subreddit_name: Name of the subreddit to read.
        limit: Value forwarded to ``subreddit.hot`` as ``limit``.

    Returns:
        A list of dicts with the keys ``title``, ``score``, ``num_comments``
        and ``comments`` (the bodies of up to 10 top-level comments).
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []
    for post in subreddit.hot(limit=limit):
        # A comment forest can contain "load more comments" placeholders
        # (MoreComments) that have no `body` attribute and would break the
        # comprehension below. `replace_more(limit=0)` removes them all.
        post.comments.replace_more(limit=0)
        posts.append({
            'title': post.title,
            'score': post.score,
            'num_comments': post.num_comments,
            'comments': [comment.body for comment in post.comments[:10]],
        })
    return posts

# Subreddit to read; its hot posts are loaded into a DataFrame and previewed.
subreddit_name = "learnpython"
posts = get_reddit_posts(subreddit_name=subreddit_name)
df = pd.DataFrame(data=posts)
print(df.head(5))

                                          title  score  num_comments  \
0           Ask Anything Monday - Weekly Thread     10            28   
1           How do you actually learn by doing?    101            49   
2         Build a to-do-list program (beginner)      7             2   
3  Why are some types made immutable in Python?      2             0   
4  What’s the best application to learn python?      5            17   

                                            comments  
0  [Have you ever developed a "great new idea" in...  
1  [As someone who has used Python for more than ...  
2  [Next improvement would be to save & load list...  
3                                                 []  
4  [I liked the 100 days of code course on udemy,...  


7. **Wikipedia:** API Wikipédia (Contenu d'articles).

In [14]:
def get_wikipedia_content(page_name):
    """Fetch the title, summary and full text of a Wikipedia page.

    Args:
        page_name: Name of the page to look up.

    Returns:
        A dict with the keys ``title``, ``summary`` and ``full_content``, or
        ``None`` when the page does not exist.
    """
    client = wikipediaapi.Wikipedia(user_agent='data_science_exo')
    article = client.page(page_name)

    # Guard clause: nothing to return for a page that does not exist.
    if not article.exists():
        return None

    extracted = {}
    extracted['title'] = article.title
    extracted['summary'] = article.summary
    extracted['full_content'] = article.text
    return extracted

# Look up one article; report a missing page, otherwise preview the result as
# a one-row table.
page_name = "Dunning–Kruger_effect"
content = get_wikipedia_content(page_name)

if not content:
    print(f"Page '{page_name}' does not exist.")
else:
    df = pd.DataFrame([content])
    print(df.head(5))

                   title                                            summary  \
0  Dunning–Kruger effect  The Dunning–Kruger effect is a cognitive bias ...   

                                        full_content  
0  The Dunning–Kruger effect is a cognitive bias ...  
