### Exercice 2: _Extraction de Données en Ligne_
**Objectif:** Écrivons des scripts d'extraction de données depuis différentes plateformes.

In [15]:
from urllib.request import urlopen

import pandas as pd
import praw
import wikipediaapi
from bs4 import BeautifulSoup
from dotenv import dotenv_values
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googlesearch import search
from requests import get

# Load the key/value pairs of the local ".env" file; the cells below read
# their API credentials from this mapping with config.get(...).
config = dotenv_values(".env")

TypeError: 'type' object is not subscriptable

1. **Amazon:** Web scraping avec BeautifulSoup: Produits, prix, avis

In [4]:
url = "https://www.amazon.com/dp/B0CP22DQQS?th=1"
# Parse the page while the connection is open; the `with` block closes the
# HTTP response afterwards instead of leaving it open for the kernel's lifetime.
# The timeout keeps the cell from hanging forever on an unresponsive server.
with urlopen(url, timeout=30) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Look the three elements up first so that a missing one produces an explicit
# error rather than "'NoneType' object has no attribute 'get_text'".
title_tag = soup.find(id="productTitle")
price_tag = soup.find('span', {'class': 'a-price'})
rating_tag = soup.find(class_="a-icon-alt")

missing = [
    label
    for label, tag in (
        ("productTitle", title_tag),
        ("a-price", price_tag),
        ("a-icon-alt", rating_tag),
    )
    if tag is None
]
if missing:
    raise Exception(f"Éléments introuvables dans la page Amazon: {', '.join(missing)}.")

title = title_tag.get_text(strip=True)
# Keep the segment that follows the first '$' in the price text.
price = price_tag.get_text(strip=True).split('$')[1]
rating = rating_tag.get_text()
print(f"Produit: {title}\nPrix: {price}$\nAvis: {rating}")

Produit: Marsail Ergonomic Office Chair: Office Computer Desk Chair with High Back Mesh and Adjustable Lumbar Support Rolling Work Swivel Task Chairs with Wheel 3D Armrests and Headrest
Prix: 107.78$
Avis: 4.4 out of 5 stars


 2. **Twitter:** Utilisation de l'API Twitter v2: Tweets, likes, retweets

In [6]:
# Base URL of the API; get_tweets appends the endpoint path to it.
x_url = "https://api.twitter.com/2"

X_BEARER_TOKEN = config.get("X_BEARER_TOKEN")

# Fail before the token is formatted into the Authorization header, so a
# missing token never leaves a "Bearer None" header in the kernel namespace.
if not X_BEARER_TOKEN:
    raise Exception("Bearer token non trouvé. Merci de fournir un valeur à la variable d'environnement X_BEARER_TOKEN.")

# Sent with every request made by get_tweets.
headers = {
    'Authorization': f"Bearer {X_BEARER_TOKEN}"
}

# Extra tweet fields requested in addition to the default ones.
params = {
    'tweet.fields': 'created_at,public_metrics',
}

def get_tweets(username: str):
    """Fetch the recent tweets of one account through the recent-search endpoint.

    Args:
        username: Account handle; it is used in a ``from:<username>`` query.

    Returns:
        The value of the ``"data"`` key of the JSON response, or ``None``
        when the response has no such key.

    Raises:
        Exception: If the HTTP status code is not 200. The message carries
            the status code and the response body.
    """
    # Pass the query through `params` so requests URL-encodes it, instead of
    # splicing the raw username into the URL. A copy is used so the
    # module-level `params` dict is left untouched.
    query_params = {**params, 'query': f"from:{username}"}
    response = get(
        f"{x_url}/tweets/search/recent",
        headers=headers,
        params=query_params,
        timeout=30,  # without a timeout, requests can block indefinitely
    )
    if response.status_code != 200:
        raise Exception(f"Erreur {response.status_code}: {response.text}")
    return response.json().get("data")

# Put the tweets returned for this account in a DataFrame and preview the first rows.
df = pd.DataFrame(get_tweets("DylanCalluy"))
print(df.head())

                 created_at edit_history_tweet_ids  \
0  2025-04-01T18:21:19.000Z  [1907136208083316864]   
1  2025-04-01T17:13:36.000Z  [1907119167032959004]   
2  2025-04-01T17:09:16.000Z  [1907118077122097595]   
3  2025-04-01T13:13:51.000Z  [1907058833001722033]   
4  2025-04-01T09:38:04.000Z  [1907004527099670577]   

                                      public_metrics  \
0  {'retweet_count': 0, 'reply_count': 0, 'like_c...   
1  {'retweet_count': 0, 'reply_count': 1, 'like_c...   
2  {'retweet_count': 0, 'reply_count': 0, 'like_c...   
3  {'retweet_count': 10, 'reply_count': 1, 'like_...   
4  {'retweet_count': 0, 'reply_count': 0, 'like_c...   

                                                text                   id  
0                       @ryanadrift It’s crying haha  1907136208083316864  
1  I wish that I had this reach split across all ...  1907119167032959004  
2                           @Farjads_Shots Thanks 🙏🏽  1907118077122097595  
3  Early mornings in Tokyo 🇯🇵 http

 3. **Instagram:** API Instagram Graph: Captions, likes, images

In [5]:
# Read the Instagram token from `config` and stop the cell early when it is
# missing or empty, before any request is attempted.
INSTAGRAM_ACCESS_TOKEN = config.get("INSTAGRAM_ACCESS_TOKEN")
if not INSTAGRAM_ACCESS_TOKEN:
    raise Exception("Access token not found. Please provide a value for the environment variable INSTAGRAM_ACCESS_TOKEN.")

def get_instagram_posts(user_id, limit=10):
    """Fetch media entries of an Instagram user.

    Args:
        user_id: Identifier inserted into the ``/{user_id}/media`` URL.
        limit: Value sent as the ``limit`` query parameter (default 10).

    Returns:
        A list of dicts with the keys ``caption``, ``media_url`` and
        ``like_count``. A field missing from an entry is ``None``. The list is
        empty when the response has no ``"data"`` key.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    url = f"https://graph.instagram.com/{user_id}/media"
    params = {
        'fields': 'id,caption,media_url,like_count',
        'access_token': INSTAGRAM_ACCESS_TOKEN,
        'limit': limit
    }
    # A timeout keeps the call from blocking indefinitely.
    response = get(url, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")
    # Tolerate a payload without "data" instead of raising KeyError.
    media_items = response.json().get('data', [])
    return [
        {
            'caption': item.get('caption'),
            'media_url': item.get('media_url'),
            'like_count': item.get('like_count')
        }
        for item in media_items
    ]

# Fetch the posts of one hard-coded user id and preview them as a DataFrame.
user_id = "1090336554"
posts = get_instagram_posts(user_id)
df = pd.DataFrame(posts)
print(df.head())

Exception: Access token not found. Please provide a value for the environment variable INSTAGRAM_ACCESS_TOKEN.

4. **YouTube:** API YouTube Data: Titres, vues, commentaires

In [7]:
# Read the YouTube API key from `config` and stop early when it is missing or empty.
YOUTUBE_API_KEY = config.get("YOUTUBE_API_KEY")
if not YOUTUBE_API_KEY:
   raise Exception("API Key non trouvé. Merci de fournir un valeur à la variable d'environnement YOUTUBE_API_KEY.")

# Client used by get_videos for its search, videos and commentThreads calls.
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

def get_videos(channel_id: str):
    """Collect title, view count and top-level comments for a channel's videos.

    Args:
        channel_id: Channel identifier passed to the search request.

    Returns:
        A list of dicts with the keys ``title``, ``views`` and ``comments``
        (up to 10 comment texts), in the order of the search results.
        Search results that are not videos are ignored, as are videos for
        which no details are returned.

    Raises:
        HttpError: For any API failure other than comments being disabled
            on a video; in that case the video gets an empty comment list.
    """
    search_response = youtube.search().list(
        part="snippet", channelId=channel_id, maxResults=10
    ).execute()
    video_ids = [
        item['id']['videoId']
        for item in search_response['items']
        if item['id']['kind'] == 'youtube#video'
    ]
    if not video_ids:
        return []

    # One batched request for all videos instead of one request per video.
    details_response = youtube.videos().list(
        part="snippet,statistics", id=",".join(video_ids)
    ).execute()
    # Index by id so the search-result order is kept and a video without
    # details is skipped rather than raising IndexError.
    details_by_id = {item['id']: item for item in details_response['items']}

    videos = []
    for video_id in video_ids:
        video_info = details_by_id.get(video_id)
        if video_info is None:
            continue
        try:
            comments_response = youtube.commentThreads().list(
                part="snippet", videoId=video_id, maxResults=10
            ).execute()
        except HttpError as err:
            # Only the "commentsDisabled" error is tolerated; anything else
            # (quota, bad key, ...) is re-raised unchanged.
            if b"commentsDisabled" not in (err.content or b""):
                raise
            comments_response = {'items': []}
        comments = [
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in comments_response['items']
        ]
        videos.append({
            'title': video_info['snippet']['title'],
            'views': video_info['statistics']['viewCount'],
            'comments': comments
        })
    return videos

# Fetch the videos of one hard-coded channel id and preview them as a DataFrame.
videos = get_videos("UCWedHS9qKebauVIK2J7383g")
df = pd.DataFrame(videos)
print(df.head())

                                               title    views  \
0  Vidéo complète sur la chaîne ! Backstage de no...    13883   
1  Vidéo complète sur la chaîne ! Q&A exclusive :...  1709213   
2            Le rôle abject d’un flic sur le Darkweb   825636   
3  Le rôle abject d'un garde d'Obama sur le dark web   417989   
4  La trouvaille scandaleuse d'un hacker sur un d...   383774   

                                            comments  
0  [Le futur hugo décrypte ? 😉😉, il a un appareil...  
1  [Je trouve qu il lui fait des yeux doux😂, On e...  
2  [Christophe coulons, Il a décroché le job car ...  
3  [C&#39;est débile..<br><br>Pourquoi le mec est...  
4  [Depuis quand tu prends autant de drogue d’un ...  


 5. **Google Search:** Scraper avec googlesearch (Résultats de recherche)

In [8]:
def google_search(query, num_results=10):
    """Run `query` through `search` and return everything it yields as a list.

    Args:
        query: Search string forwarded unchanged to `search`.
        num_results: Forwarded as the `num_results` keyword (default 10).

    Returns:
        A list holding the items produced by `search`, in iteration order.
    """
    return list(search(query, num_results=num_results))

query = 'Data Science Course site:*.edu filetype:pdf intext:"Book"'
results = google_search(query)
# Name the single column explicitly; without it the frame gets the default label 0.
df = pd.DataFrame(results, columns=["url"])
print(df.head())

                                                   0
0            https://www.cs.cornell.edu/jeh/book.pdf
1  https://www.webpages.uidaho.edu/~stevel/517/Th...
2  https://digital.library.ncat.edu/cgi/viewconte...
3  https://cims.nyu.edu/~cfgranda/pages/stuff/pro...
4  https://www.cs.umd.edu/class/fall2018/cmsc641/...


6. **Reddit:** API Reddit (PRAW): Posts, votes, commentaires

In [9]:
# Build the PRAW client from the REDDIT_* entries of `config`;
# get_reddit_posts uses this module-level object.
reddit = praw.Reddit(
    client_id=config.get("REDDIT_CLIENT_ID"),
    client_secret=config.get("REDDIT_CLIENT_SECRET"),
    user_agent=config.get("REDDIT_USER_AGENT")
)

def get_reddit_posts(subreddit_name, limit=10):
    """Collect the "hot" posts of a subreddit with their first top-level comments.

    Args:
        subreddit_name: Name passed to ``reddit.subreddit``.
        limit: Maximum number of posts requested from the hot listing (default 10).

    Returns:
        A list of dicts with the keys ``title``, ``score``, ``num_comments``
        and ``comments`` (bodies of up to 10 top-level comments).
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []
    for post in subreddit.hot(limit=limit):
        # Remove the "load more comments" placeholders from the comment forest:
        # they have no `.body`, so reading it would raise AttributeError.
        # limit=0 drops them without fetching the comments they stand for.
        post.comments.replace_more(limit=0)
        posts.append({
            'title': post.title,
            'score': post.score,
            'num_comments': post.num_comments,
            'comments': [comment.body for comment in post.comments[:10]]
        })
    return posts

# Fetch the hot posts of one subreddit and preview them as a DataFrame.
subreddit_name = "learnpython"
posts = get_reddit_posts(subreddit_name)
df = pd.DataFrame(posts)
print(df.head())

                                               title  score  num_comments  \
0                Ask Anything Monday - Weekly Thread      4            11   
1  Can I use Qt with Python (PyQt) for a non-comm...      7             7   
2               Should I refer to a book or a course      6             6   
3  How to dynamically set logging level in this e...      6             2   
4         Best file format for external data storage      3             4   

                                            comments  
0  [Looking for some places to get started. I hav...  
1  [The licensing is a bit iffy. I would recommen...  
2  [There are only 2 choices.\n\nJust choose one....  
3  [for starters, you do not need the `log_levels...  
4  [All of those options are fine. CSV is probabl...  


7. **Wikipedia:** API Wikipédia (Contenu d'articles).

In [14]:
def get_wikipedia_content(page_name):
    """Return title, summary and full text of a Wikipedia page.

    Args:
        page_name: Page name handed to the client's ``page`` method.

    Returns:
        A dict with the keys ``title``, ``summary`` and ``full_content``,
        or ``None`` when ``exists()`` reports that the page is absent.
    """
    client = wikipediaapi.Wikipedia(user_agent='data_science_exo')
    article = client.page(page_name)
    # Guard clause: nothing to extract from a page that does not exist.
    if not article.exists():
        return None
    return {
        'title': article.title,
        'summary': article.summary,
        'full_content': article.text
    }

# Look up one page; show it as a one-row DataFrame, or report that it was not found.
page_name = "Dunning–Kruger_effect"
content = get_wikipedia_content(page_name)
if content:
    # Wrap the dict in a list so it becomes a single row.
    df = pd.DataFrame([content])
    print(df.head())
else:
    print(f"Page '{page_name}' does not exist.")

                   title                                            summary  \
0  Dunning–Kruger effect  The Dunning–Kruger effect is a cognitive bias ...   

                                        full_content  
0  The Dunning–Kruger effect is a cognitive bias ...  
