In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# We are going to scrape data from worldinparis.com about upcomingh strikes in transport
import requests
from bs4 import BeautifulSoup

url = 'https://worldinparis.com/transport-in-france-strike-news-tips-for-traveling-to-paris'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# View object
soup

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- This site is optimized with the Yoast SEO plugin v20.8 - https://yoast.com/wordpress/plugins/seo/ -->
<title>Upcoming Transportation Strikes in France 2023: Info &amp; Best Tips | World In Paris</title>
<meta content="All about Paris Strikes June 2023 &amp; France Strikes June 2023. Best tips for traveling during transportation strikes in Paris &amp; Strikes in France." name="description">
<link href="https://worldinparis.com/transport-in-france-strike-news-tips-for-traveling-to-paris" rel="canonical"/>
<meta content="en_US" property="og:locale"/>
<meta content="article" property="og:type"/>
<meta content="Upcoming Transportation Strikes in France 2023: Info &amp; Best Tips | World In Paris" property="og:title"/>
<meta content="All about P

In [5]:
# View the title of the webpage
soup.title

<title>Upcoming Transportation Strikes in France 2023: Info &amp; Best Tips | World In Paris</title>

In [6]:
# View the text of the webpage
soup.get_text()



'\n\n\n\n\n\n\nUpcoming Transportation Strikes in France 2023: Info & Best Tips | World In Paris\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content \n\n\n\n\n\n \n\nTransportation Strikes 2023\n \n\n\n\n\n\nMenu \nAREAS\nSEE & DO\n\nParis Guide\nTop Things to Do in Paris\nItineraries & Walks\nParis Attractions\nHidden Gems\nVersailles\nDisneyland Paris\nBest Day Trips\n\n\nFOOD & WINE\n\nParis Food Guide\nFrench Wine\nEat & Drink\nRestaurants\n\n\nPARIS STORIES\nPLAN\n\nParis Trip Planner\nBest Time to Visit Paris\nParis by Month\nParis Checklist\nPacking Lists\nBudget Calculator\nPrepaid SIM Cards\nFREE Paris Planning Challenge\n\n\nSTAY\nTIPS & TRICKS\n\nSkip the Line in Paris\nParis Pass Comparison\nParis Deals\nMetro Guide\nTransportation Strikes 2023\n\n\nFRANCE\n \n\n\n\n\n\n\n\n\nUpcoming Transportation Strikes in France 2023: Info & Best Tips \n\n\n1. French Transit Strikes &

In [7]:
# View all the links in the webpage
soup.find_all('a')

[<a class="screen-reader-text skip-link" href="#content" title="Skip to content">Skip to content</a>,
 <a href="https://worldinparis.com/" rel="home">
 <img alt="World In Paris" class="header-image is-logo-image" height="100" src="https://worldinparis.com/wp-content/uploads/2023/05/worldinparis-logo-500px.jpg" width="500"/>
 </a>,
 <a class="gb-button gb-button-8b7cac02 gb-button-text" href="https://worldinparis.com/transport-in-france-strike-news-tips-for-traveling-to-paris">Transportation Strikes 2023</a>,
 <a href="https://worldinparis.com/arrondissements-of-paris">AREAS</a>,
 <a>SEE &amp; DO<span class="dropdown-menu-toggle" role="presentation"><span class="gp-icon icon-arrow"><svg aria-hidden="true" height="1em" viewbox="0 0 330 512" width="1em" xmlns="http://www.w3.org/2000/svg"><path d="M305.913 197.085c0 2.266-1.133 4.815-2.833 6.514L171.087 335.593c-1.7 1.7-4.249 2.832-6.515 2.832s-4.815-1.133-6.515-2.832L26.064 203.599c-1.7-1.7-2.832-4.248-2.832-6.514s1.132-4.816 2.832-6.515l

In [8]:
# Get the text of the first paragraph
soup.find_all('p')[0].get_text()

'Transport in France and Paris is usually quite good, but unfortunately, sometimes it is affected by strikes.'

In [9]:
# Check the number of paragraphs in the webpage
len(soup.find_all('p'))

145

In [10]:
# Obtain the text of all the paragraphs
text = [p.get_text() for p in soup.find_all('p')]
text

['Transport in France and Paris is usually quite good, but unfortunately, sometimes it is affected by strikes.',
 'However, we locals have learned to deal with French strikes over time. How? With the right information, a little bit of flexibility in our everyday life, and lots of patience.',
 'For example, if there’s a Paris strike today, some people living on the outskirts of Paris may decide to take a day off from work or work online from the house to avoid the transportation hassle of this Paris transit strike.',
 'Last update: 04 June 2023',
 'Planning a trip to Paris soon? Go on reading this article on Transportation Strikes in France 2023, with the upcoming transportation strikes in Paris and France and our best tips to deal with them.',
 'Related content',
 'This article on transportation strikes is extensive and covers many topics. Learn what you can find in this article and what you won’t find:',
 'This article does not cover the strikes by foreign companies traveling to Franc

In [11]:
# Get the headers of the webpage
headers = [h.get_text() for h in soup.find_all('h2')]
headers

['1. French Transit Strikes & Paris Transit Strikes',
 '2. Content of this Article',
 '3.\xa0Calendar France Strikes & Paris Strikes 2023',
 '4.\xa0Traveling to France during an SNCF Train Strike in France',
 '5. Traveling to France during Airport Strikes /Air Strikes in France',
 '6. Paris Metro Strikes and Other RATP Services',
 '7. Gilets Jaunes (Yellow Vests) in Paris Strikes',
 'About WORLD IN PARIS',
 'Quirky Parisian explorers with a preference for lesser-known sights, we are continuously looking for new ideas and tips to bring you the best of the City of Light.']

In [12]:
# Get the text from each header
text = [h.get_text() for h in soup.find_all('h2')]
text

['1. French Transit Strikes & Paris Transit Strikes',
 '2. Content of this Article',
 '3.\xa0Calendar France Strikes & Paris Strikes 2023',
 '4.\xa0Traveling to France during an SNCF Train Strike in France',
 '5. Traveling to France during Airport Strikes /Air Strikes in France',
 '6. Paris Metro Strikes and Other RATP Services',
 '7. Gilets Jaunes (Yellow Vests) in Paris Strikes',
 'About WORLD IN PARIS',
 'Quirky Parisian explorers with a preference for lesser-known sights, we are continuously looking for new ideas and tips to bring you the best of the City of Light.']

In [13]:
# Get the text from each header separately
# for example, the first header
text[0]

'1. French Transit Strikes & Paris Transit Strikes'

In [15]:
# French transit strikes & Paris Transit Strikes
header_one_text = soup.find_all('h2')[0].get_text()
header_one_text

'1. French Transit Strikes & Paris Transit Strikes'

In [16]:
# Find the index of the first header
text.index(header_one_text)

0

In [19]:
# Obtain the text of all the paragraphs in the first header
text = [p.get_text() for p in soup.find_all('h2')[0].find_next_siblings('p')]
text

['Transport in France and Paris is usually quite good, but unfortunately, sometimes it is affected by strikes.',
 'However, we locals have learned to deal with French strikes over time. How? With the right information, a little bit of flexibility in our everyday life, and lots of patience.',
 'For example, if there’s a Paris strike today, some people living on the outskirts of Paris may decide to take a day off from work or work online from the house to avoid the transportation hassle of this Paris transit strike.',
 'Last update: 04 June 2023']

In [20]:
# Now we can obtain the text of all the headers and paragraphs
# We will use a for loop to do this
headers = [h.get_text() for h in soup.find_all('h2')]
headers

# Obtain the text of all the paragraphs in each header
text = []
for i in range(len(headers)):
    text.append([p.get_text() for p in soup.find_all('h2')[i].find_next_siblings('p')])
text

# Create a dataframe with the headers and paragraphs
df = pd.DataFrame({'headers': headers, 'text': text})
df

Unnamed: 0,headers,text
0,1. French Transit Strikes & Paris Transit Strikes,[Transport in France and Paris is usually quit...
1,2. Content of this Article,[This article on transportation strikes is ext...
2,3. Calendar France Strikes & Paris Strikes 2023,[]
3,4. Traveling to France during an SNCF Train St...,[]
4,5. Traveling to France during Airport Strikes ...,[]
5,6. Paris Metro Strikes and Other RATP Services,[GOOD TO KNOW: The company RATP is responsible...
6,7. Gilets Jaunes (Yellow Vests) in Paris Strikes,"[, GILETS JAUNES DEMONSTRATIONS DESCRIPTION: n..."
7,About WORLD IN PARIS,[]
8,Quirky Parisian explorers with a preference fo...,[]
