# Web Scraping com Beautiful Soup

In [1]:
import requests
from bs4 import BeautifulSoup

# Download the forecast page from forecast.weather.gov.
# The timeout stops the cell from hanging indefinitely when the server does
# not answer, and raise_for_status() aborts on an HTTP error response instead
# of silently parsing an error page.
page = requests.get(
    "http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168",
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The forecast is looked up by the element id "seven-day-forecast".
seven_day = soup.find(id="seven-day-forecast")
if seven_day is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError("Element with id 'seven-day-forecast' not found in the page")

# Each forecast period is an element with class "tombstone-container".
forecast_items = seven_day.find_all(class_="tombstone-container")
if not forecast_items:
    raise RuntimeError("No 'tombstone-container' elements found in the forecast")

# Inspect the markup of the first forecast entry.
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Overnight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Overnight: Partly cloudy, with a low around 60. West wind 9 to 15 mph, with gusts as high as 20 mph. " class="forecast-icon" src="newimages/medium/nsct.png" title="Overnight: Partly cloudy, with a low around 60. West wind 9 to 15 mph, with gusts as high as 20 mph. "/>
 </p>
 <p class="short-desc">
  Partly Cloudy
 </p>
 <p class="temp temp-low">
  Low: 60 °F
 </p>
</div>


In [3]:
# Read the text of the period name, short description and temperature from
# the first forecast entry, then print them one per line.
period, short_desc, temp = (
    tonight.find(class_=css_class).get_text()
    for css_class in ("period-name", "short-desc", "temp")
)

print(period, short_desc, temp, sep="\n")

Overnight
Partly Cloudy
Low: 60 °F


In [4]:
# Select every period-name element nested inside a forecast entry and keep
# only its text.
period_tags = seven_day.select(".tombstone-container .period-name")
periods = list(map(lambda tag: tag.get_text(), period_tags))
print(periods)

['Overnight', 'Saturday', 'SaturdayNight', 'Sunday', 'SundayNight', 'Monday', 'MondayNight', 'Tuesday', 'TuesdayNight']


In [5]:
# Collect the matching elements for each field of the forecast entries.
short_desc_tags = seven_day.select(".tombstone-container .short-desc")
temp_tags = seven_day.select(".tombstone-container .temp")
img_tags = seven_day.select(".tombstone-container img")

# Short descriptions and temperatures come from the element text; the long
# description is stored in the "title" attribute of each image.
short_descs = [tag.get_text() for tag in short_desc_tags]
temps = [tag.get_text() for tag in temp_tags]
descs = [tag["title"] for tag in img_tags]

print(short_descs, temps, descs, sep="\n")

['Partly Cloudy', 'Sunny thenSunny andBreezy', 'Mostly Clearand Breezythen PartlyCloudy', 'Mostly Sunny', 'Mostly Clear', 'Sunny thenSunny andBreezy', 'Clear andBreezy thenClear', 'Sunny', 'Clear']
['Low: 60 °F', 'High: 77 °F', 'Low: 58 °F', 'High: 69 °F', 'Low: 54 °F', 'High: 71 °F', 'Low: 56 °F', 'High: 74 °F', 'Low: 55 °F']
['Overnight: Partly cloudy, with a low around 60. West wind 9 to 15 mph, with gusts as high as 20 mph. ', 'Saturday: Sunny, with a high near 77. Breezy, with a west wind 7 to 12 mph increasing to 18 to 23 mph in the afternoon. Winds could gust as high as 30 mph. ', 'Saturday Night: Increasing clouds, with a low around 58. Breezy, with a west wind 17 to 23 mph, with gusts as high as 30 mph. ', 'Sunday: Mostly sunny, with a high near 69. West southwest wind 10 to 17 mph, with gusts as high as 22 mph. ', 'Sunday Night: Mostly clear, with a low around 54. Northwest wind 9 to 17 mph, with gusts as high as 22 mph. ', 'Monday: Sunny, with a high near 71. Breezy. ', 'Mon

In [6]:
import pandas as pd
# Assemble the scraped lists into a single table, one column per field.
df_tempo = pd.DataFrame(
    dict(
        period=periods,
        short_desc=short_descs,
        temp=temps,
        desc=descs,
    )
)
# Show the first rows as the cell output.
df_tempo.head()

Unnamed: 0,period,short_desc,temp,desc
0,Overnight,Partly Cloudy,Low: 60 °F,"Overnight: Partly cloudy, with a low around 60..."
1,Saturday,Sunny thenSunny andBreezy,High: 77 °F,"Saturday: Sunny, with a high near 77. Breezy, ..."
2,SaturdayNight,Mostly Clearand Breezythen PartlyCloudy,Low: 58 °F,"Saturday Night: Increasing clouds, with a low ..."
3,Sunday,Mostly Sunny,High: 69 °F,"Sunday: Mostly sunny, with a high near 69. Wes..."
4,SundayNight,Mostly Clear,Low: 54 °F,"Sunday Night: Mostly clear, with a low around ..."


In [7]:
# Extract the digits from each temperature label (e.g. "Low: 60 °F" -> "60").
# The pattern is a raw string so that \d reaches the regex engine unchanged;
# in a regular string literal "\d" is an invalid escape sequence, which Python
# reports with a warning.
temp_nums = df_tempo["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
# The extracted values are strings, so convert them before storing the
# numeric column.
df_tempo["temp_num"] = temp_nums.astype('int')
print(temp_nums)

0    60
1    77
2    58
3    69
4    54
5    71
6    56
7    74
8    55
Name: temp_num, dtype: object


In [None]:
# Print the whole table first, then the average of the numeric temperatures.
print(df_tempo)

mean_temp = df_tempo['temp_num'].mean()
print(mean_temp)

          period                                short_desc         temp  \
0          Today   Mostly Cloudythen MostlySunny andBreezy  High: 65 °F   
1        Tonight  Mostly Cloudyand Breezythen MostlyCloudy   Low: 52 °F   
2       Saturday                              Partly Sunny  High: 65 °F   
3  SaturdayNight                             Partly Cloudy   Low: 51 °F   
4         Sunday                                     Sunny  High: 65 °F   
5    SundayNight                 Clear andBreezy thenClear   Low: 50 °F   
6    MemorialDay                                     Sunny  High: 67 °F   
7    MondayNight                                     Clear   Low: 51 °F   
8        Tuesday                                     Sunny  High: 70 °F   

                                                desc  temp_num  
0  Today: Cloudy, then gradually becoming mostly ...        65  
1  Tonight: Mostly cloudy, with a low around 52. ...        52  
2  Saturday: Partly sunny, with a high near 65. S...  

In [8]:
# Add a new column indicating whether the period is at night.
# A period is treated as night when its temperature entry is the "Low" one.
df_tempo["noite"] = is_night = df_tempo["temp"].str.contains("Low")
print(df_tempo)

          period                               short_desc         temp  \
0      Overnight                            Partly Cloudy   Low: 60 °F   
1       Saturday                Sunny thenSunny andBreezy  High: 77 °F   
2  SaturdayNight  Mostly Clearand Breezythen PartlyCloudy   Low: 58 °F   
3         Sunday                             Mostly Sunny  High: 69 °F   
4    SundayNight                             Mostly Clear   Low: 54 °F   
5         Monday                Sunny thenSunny andBreezy  High: 71 °F   
6    MondayNight                Clear andBreezy thenClear   Low: 56 °F   
7        Tuesday                                    Sunny  High: 74 °F   
8   TuesdayNight                                    Clear   Low: 55 °F   

                                                desc  temp_num  noite  
0  Overnight: Partly cloudy, with a low around 60...        60   True  
1  Saturday: Sunny, with a high near 77. Breezy, ...        77  False  
2  Saturday Night: Increasing clouds, with 

In [9]:
# Save the dataset to a CSV file for later use.
# (It could also be stored in a database.)

# Writing to Google Drive needs authorisation, so the drive is mounted first.
from google.colab import drive

drive.mount('/content/drive')

csv_path = '/content/drive/My Drive/Colab Notebooks/MADSAD BIGDATA/PyTrigo-V2-6-WebScraping.csv'
df_tempo.to_csv(csv_path)


Mounted at /content/drive


In [None]:
# Save to a file in the working directory instead (currently disabled)
#df_tempo.to_csv('PyTrigo-V2-5-WebScraping.csv')