In [23]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
# Download the Wikipedia list of U.S. national parks and parse it with lxml.
url = 'https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [4]:
# table = soup.find('table',{'class': re.compile('wikitable sortable')})
# Collect every <tr> element in the parsed document. The search runs on the whole
# soup, so it is not restricted to one table (the scoped lookup above is disabled).
table_rows = soup.find_all('tr')

In [5]:
# Build one list per <tr>, holding the whitespace-stripped text of each <td> in it.
# Rows without any <td> yield an empty list.
data = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in table_rows
]

In [6]:
# Let pandas parse the page's HTML tables directly; header=0 takes the first
# row of each table as its column names.
page_tables = pd.read_html(url, header=0)
# The table at index 1 is the one used as the raw data set.
raw_data = page_tables[1]
raw_data.head()

Unnamed: 0,Name,Image,Location,Date established as park[5][10],Area (2019)[11],Recreation visitors (2018)[8],Description
0,Acadia,,Maine44°21′N 68°13′W﻿ / ﻿44.35°N 68.21°W,"February 26, 1919","49,076.63 acres (198.6 km2)",3537575,Covering most of Mount Desert Island and other...
1,American Samoa,,American Samoa14°15′S 170°41′W﻿ / ﻿14.25°S 170...,"October 31, 1988","8,256.67 acres (33.4 km2)",28626,The southernmost national park is on three Sam...
2,Arches,,Utah38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,"November 12, 1971","76,678.98 acres (310.3 km2)",1663557,"This site features more than 2,000 natural san..."
3,Badlands,,South Dakota43°45′N 102°30′W﻿ / ﻿43.75°N 102.50°W,"November 10, 1978","242,755.94 acres (982.4 km2)",1008942,"The Badlands are a collection of buttes, pinna..."
4,Big Bend,,Texas29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,"June 12, 1944","801,163.21 acres (3,242.2 km2)",440091,Named for the prominent bend in the Rio Grande...


In [7]:
# Work on a copy so the cleaning steps below do not modify raw_data.
df = raw_data.copy()

In [8]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Name                                object
Image                              float64
Location                            object
Date established as park[5][10]     object
Area (2019)[11]                     object
Recreation visitors (2018)[8]        int64
Description                         object
dtype: object

In [9]:
# List the column labels exactly as they were scraped.
df.columns.values

array(['Name', 'Image', 'Location', 'Date established as park[5][10]',
       'Area (2019)[11]', 'Recreation visitors (2018)[8]', 'Description'],
      dtype=object)

In [10]:
# Column labels as scraped from the page, kept as a plain Python list.
columns = [
    'Name',
    'Image',
    'Location',
    'Date established as park[5][10]',
    'Area (2019)[11]',
    'Recreation visitors (2018)[8]',
    'Description',
]

In [11]:
# Check the dtype of the Location column before applying string operations to it.
df['Location'].dtype

dtype('O')

In [12]:
# Step 1 of cleaning Location: remove every digit from the coordinate suffix.
# The pattern is a raw string so that \d is not treated as an (invalid) string
# escape, and regex=True is passed explicitly because recent pandas versions
# default to literal matching and reject a compiled pattern otherwise.
df['Location'] = df['Location'].str.replace(re.compile(r'\d'), '', regex=True)
df['Location']

0                       Maine°′N °′W﻿ / ﻿.°N .°W
1              American Samoa°′S °′W﻿ / ﻿.°S .°W
2                        Utah°′N °′W﻿ / ﻿.°N .°W
3                South Dakota°′N °′W﻿ / ﻿.°N .°W
4                       Texas°′N °′W﻿ / ﻿.°N .°W
                         ...                    
57               South Dakota°′N °′W﻿ / ﻿.°N .°W
58                     Alaska°′N °′W﻿ / ﻿.°N .°W
59    Wyoming, Montana, Idaho°′N °′W﻿ / ﻿.°N .°W
60                 California°′N °′W﻿ / ﻿.°N .°W
61                       Utah°′N °′W﻿ / ﻿.°N .°W
Name: Location, Length: 62, dtype: object

In [13]:
# Step 2 of cleaning Location: remove the degree signs.
# A single literal character needs no regular expression, so a plain string
# with regex=False is used; this behaves the same on every pandas version.
df['Location'] = df['Location'].str.replace('°', '', regex=False)
df['Location']

0                       Maine′N ′W﻿ / ﻿.N .W
1              American Samoa′S ′W﻿ / ﻿.S .W
2                        Utah′N ′W﻿ / ﻿.N .W
3                South Dakota′N ′W﻿ / ﻿.N .W
4                       Texas′N ′W﻿ / ﻿.N .W
                       ...                  
57               South Dakota′N ′W﻿ / ﻿.N .W
58                     Alaska′N ′W﻿ / ﻿.N .W
59    Wyoming, Montana, Idaho′N ′W﻿ / ﻿.N .W
60                 California′N ′W﻿ / ﻿.N .W
61                       Utah′N ′W﻿ / ﻿.N .W
Name: Location, Length: 62, dtype: object

In [14]:
# Step 3 of cleaning Location: remove the residue left by north/west coordinates
# once digits and degree signs are gone (prime marks, hemisphere letters, slash).
# - \W+ spans the separator between the two coordinate notations; it contains
#   invisible zero-width characters around the slash, which are easy to lose when
#   typed literally.
# - The dots are escaped so they only match a literal '.'.
# - regex=True is required for a compiled pattern on recent pandas versions.
north_west_residue = re.compile(r'′N\s′W\W+\.N\s\.W')
df['Location'] = df['Location'].str.replace(north_west_residue, '', regex=True)
df['Location']

0                             Maine
1     American Samoa′S ′W﻿ / ﻿.S .W
2                              Utah
3                      South Dakota
4                             Texas
                  ...              
57                     South Dakota
58                           Alaska
59          Wyoming, Montana, Idaho
60                       California
61                             Utah
Name: Location, Length: 62, dtype: object

In [15]:
# Step 4 of cleaning Location: remove the residue left by south/west coordinates
# once digits and degree signs are gone (prime marks, hemisphere letters, slash).
# - \W+ spans the separator between the two coordinate notations; it contains
#   invisible zero-width characters around the slash, which are easy to lose when
#   typed literally.
# - The dots are escaped so they only match a literal '.'.
# - regex=True is required for a compiled pattern on recent pandas versions.
south_west_residue = re.compile(r'′S\s′W\W+\.S\s\.W')
df['Location'] = df['Location'].str.replace(south_west_residue, '', regex=True)
df['Location']

0                       Maine
1              American Samoa
2                        Utah
3                South Dakota
4                       Texas
               ...           
57               South Dakota
58                     Alaska
59    Wyoming, Montana, Idaho
60                 California
61                       Utah
Name: Location, Length: 62, dtype: object

In [16]:
# Preview the frame after the Location clean-up.
df.head()

Unnamed: 0,Name,Image,Location,Date established as park[5][10],Area (2019)[11],Recreation visitors (2018)[8],Description
0,Acadia,,Maine,"February 26, 1919","49,076.63 acres (198.6 km2)",3537575,Covering most of Mount Desert Island and other...
1,American Samoa,,American Samoa,"October 31, 1988","8,256.67 acres (33.4 km2)",28626,The southernmost national park is on three Sam...
2,Arches,,Utah,"November 12, 1971","76,678.98 acres (310.3 km2)",1663557,"This site features more than 2,000 natural san..."
3,Badlands,,South Dakota,"November 10, 1978","242,755.94 acres (982.4 km2)",1008942,"The Badlands are a collection of buttes, pinna..."
4,Big Bend,,Texas,"June 12, 1944","801,163.21 acres (3,242.2 km2)",440091,Named for the prominent bend in the Rio Grande...


In [17]:
# Drop the 'Image' column, which is not needed, then list the remaining labels.
df = df.drop(columns=['Image'])
df.columns.values


array(['Name', 'Location', 'Date established as park[5][10]',
       'Area (2019)[11]', 'Recreation visitors (2018)[8]', 'Description'],
      dtype=object)

In [18]:
# Preview the frame now that the 'Image' column has been dropped.
df.head()

Unnamed: 0,Name,Location,Date established as park[5][10],Area (2019)[11],Recreation visitors (2018)[8],Description
0,Acadia,Maine,"February 26, 1919","49,076.63 acres (198.6 km2)",3537575,Covering most of Mount Desert Island and other...
1,American Samoa,American Samoa,"October 31, 1988","8,256.67 acres (33.4 km2)",28626,The southernmost national park is on three Sam...
2,Arches,Utah,"November 12, 1971","76,678.98 acres (310.3 km2)",1663557,"This site features more than 2,000 natural san..."
3,Badlands,South Dakota,"November 10, 1978","242,755.94 acres (982.4 km2)",1008942,"The Badlands are a collection of buttes, pinna..."
4,Big Bend,Texas,"June 12, 1944","801,163.21 acres (3,242.2 km2)",440091,Named for the prominent bend in the Rio Grande...


In [19]:
# Replace the scraped headers with short snake_case names.
# The assignment is positional: the list must match the current column order
# and length.
clean_names = [
    'name',
    'location',
    'date_established',
    'area',
    'recreation_visitors',
    'description',
]
df.columns = clean_names

In [20]:
# Preview the frame with the renamed columns.
df.head()

Unnamed: 0,name,location,date_established,area,recreation_visitors,description
0,Acadia,Maine,"February 26, 1919","49,076.63 acres (198.6 km2)",3537575,Covering most of Mount Desert Island and other...
1,American Samoa,American Samoa,"October 31, 1988","8,256.67 acres (33.4 km2)",28626,The southernmost national park is on three Sam...
2,Arches,Utah,"November 12, 1971","76,678.98 acres (310.3 km2)",1663557,"This site features more than 2,000 natural san..."
3,Badlands,South Dakota,"November 10, 1978","242,755.94 acres (982.4 km2)",1008942,"The Badlands are a collection of buttes, pinna..."
4,Big Bend,Texas,"June 12, 1944","801,163.21 acres (3,242.2 km2)",440091,Named for the prominent bend in the Rio Grande...


In [27]:
# NOTE(review): type() on a single-column selection reports the container class
# (a pandas Series), not the type of the values it holds; if the element type
# was the goal, inspect .dtype instead — confirm intent.
type(df['recreation_visitors'])

pandas.core.series.Series