In [4]:
import pandas as pd
import requests

In [5]:
import wikipedia as wp

In [40]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.5.0 tenacity-8.0.1


## Load the data and do some simple cleaning

In [18]:
# Download the rendered HTML of the Wikipedia article as UTF-8 bytes.
page = wp.page("List of states and territories of the United States by population")
html = page.html().encode("UTF-8")

# Parse every <table> in the page and keep the first one.
# NOTE(review): this relies on the population table being the first table on the page.
tables = pd.read_html(html)
df = tables[0]

In [19]:
# Work on a copy: a plain `dat = df` only aliases the frame, so flattening the header
# would also rewrite `df` and a second run of this cell would start from flat labels.
dat = df.copy()

# Collapse each multi-level header tuple into one space-joined label (the rename keys
# used further down, e.g. 'State or territory State or territory', have this shape).
# Labels that are already plain strings pass through unchanged; joining a string would
# otherwise put a space between every character.
dat.columns = [
    ' '.join(map(str, col)).strip() if isinstance(col, tuple) else col
    for col in dat.columns.values
]


In [25]:
# Give the three columns used below short snake_case names. The keys are the flattened
# Wikipedia headers (footnote markers included) and must match them exactly; labels
# that are not found are silently left as they are.
dat.rename(
    columns={
        'State or territory State or territory': 'state_name',
        'Census population[7][a] April 1, 2020': 'population_2020',
        'Census population[7][a] April 1, 2010': 'population_2010',
    },
    inplace=True,
)

In [26]:
# Lookup table read from data/state_codes.csv.
# NOTE(review): presumably it maps state_name to state_code — confirm against the file.
codes = pd.read_csv('data/state_codes.csv')

# Left join: every row of `dat` is kept, and rows with no match get NaN in the columns
# coming from `codes`. No `on=` is given, so pandas joins on the column names the two
# frames have in common.
dat = dat.merge(right=codes, how='left')

In [27]:
# Keep only the rows that have a state_code (rows left without one by the left join are
# dropped). Take an explicit copy so that adding columns to `dat` afterwards operates on
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
dat = dat.loc[dat.state_code.notnull()].copy()

In [36]:
# Derived columns. All three are stored as fractions (0.05 means 5 %), not scaled by 100.
#   percent_2020 / percent_2010: each row's share of the summed population for that census
#   percent_increase: growth from 2010 to 2020 relative to the 2010 baseline
# `assign` returns a new frame, so nothing is written into a possibly sliced `dat`.
dat = dat.assign(
    percent_2020=dat.population_2020 / dat.population_2020.sum(),
    percent_2010=dat.population_2010 / dat.population_2010.sum(),
    # Growth is measured against the starting (2010) population, not the ending one.
    percent_increase=(dat.population_2020 - dat.population_2010) / dat.population_2010,
)

## Visualization 

In [41]:
# plotly's offline module; note that the alias `py` is what the plotting cells below call.
import plotly.offline as py
# Set up plotly for use inside the notebook.
# NOTE(review): connected=True is understood to load plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
py.init_notebook_mode(connected=True)

This is a US map in which each state is colored by its population growth.

In [42]:
import plotly.graph_objs as go

# Rows to plot. percent_increase is stored as a fraction, so a cutoff of -5 means -500 %
# and in practice keeps every row.
# NOTE(review): if a -5 % cutoff was intended the threshold should be -0.05 — confirm.
dat_grow = dat.loc[dat.percent_increase > -5]

# Sequential light-to-dark purple scale as [position in 0..1, color] pairs.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)']
]

data = [go.Choropleth(
    colorscale = scl,
    autocolorscale = False,
    # 'USA-states' location mode matches on the values of the state_code column.
    locations = dat_grow['state_code'],
    # The color bar is titled "% increase", so convert the stored fraction to percent.
    z = dat_grow.percent_increase.astype(float) * 100,
    locationmode = 'USA-states',
    text = dat_grow['state_name'],
    # White state borders.
    marker = go.choropleth.Marker(
        line = go.choropleth.marker.Line(
            color = 'rgb(255,255,255)',
            width = 2
        )),
    colorbar = go.choropleth.ColorBar(
        title = "% increase")
)]

layout = go.Layout(
    title = go.layout.Title(
        text = 'Population Change'
    ),
    geo = go.layout.Geo(
        scope = 'usa',
        projection = go.layout.geo.Projection(type = 'albers usa'),
        showlakes = True,
        lakecolor = 'rgb(255, 255, 255)'),
)

fig3 = go.Figure(data = data, layout = layout)
# Writes the figure to a standalone HTML file; the call returns the file name.
py.plot(fig3, filename = 'd3-cloropleth-map.html')

'd3-cloropleth-map.html'

This is a per-state breakdown of the 2010 and 2020 census populations.

In [43]:
import math

# Scatter of population per state with one marker trace per census year. The trace name
# is built from the same year string as the column it plots, so the legend cannot
# drift away from the data (the 2020 trace used to be labelled 'population2018').
fig = {
    'data': [
        {
            'x': dat.state_name,
            'y': dat['population_' + year],
            'text': dat.state_name,
            'mode': 'markers',
            'marker': {
                'color': color
            },
            'name': 'population' + year,
        }
        for year, color in (('2020', 'green'), ('2010', 'red'))
    ],
    'layout': {
        'xaxis': {'title': 'State name'},
        # Fixed y range from 0 to 42 million.
        'yaxis': {'title': "population", 'range': [0, 4.2e7]}
    }
}

# Write the figure to a standalone HTML file and keep the value py.plot returns.
# (py.iplot(fig) would render it inline in the notebook instead.)
url = py.plot(fig, filename = 'population.html')

In [48]:
#import plotly.io as pio
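# Disabled: static image export does not work here because of an install issue (plotly's write_image needs an extra image-export backend such as kaleido; TODO confirm that is what is missing).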
#pio.write_image(fig, 'data/population.png' )