In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import plotly.express as px

We compiled the US States (and Washington DC) tax rates into one dataset from 2000-2023 based on the available information from the US Government. Then we added added it as a pandas DataFrame and rounded the numbers. 

In [28]:
df = pd.read_csv("taxrates.csv")
df = df.rename(columns={"State":"Name", "STUSPS": "State"}).round(2)
df

Unnamed: 0,Name,State,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Alabama,AL,5.0,5.0,6.0,5.0,6.5,6.5,6.5,6.5,...,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.5
1,Alaska,AK,5.2,5.2,5.44,5.44,5.44,5.44,5.44,5.44,...,5.44,5.34,5.34,5.34,5.34,5.34,5.34,5.34,5.34,5.34
2,Arizona,AZ,8.0,7.97,6.97,6.97,6.97,6.97,6.97,6.97,...,6.5,5.5,5.5,4.9,4.9,4.9,4.9,4.9,4.9,4.9
3,Arkansas,AR,3.75,3.75,3.92,3.92,3.92,3.92,3.92,3.92,...,3.92,3.92,3.92,3.92,3.92,3.92,3.92,3.87,3.65,3.55
4,California,CA,8.84,8.84,8.84,8.84,8.84,8.84,8.84,8.84,...,8.84,8.84,8.84,8.84,8.84,8.84,8.84,8.84,8.84,8.84
5,Colorado,CO,4.75,4.63,4.63,4.63,4.63,4.63,4.63,4.63,...,4.63,4.63,4.63,4.63,4.63,4.63,4.63,4.55,4.55,4.4
6,Connecticut,CT,8.5,7.5,7.5,7.5,7.5,7.5,7.5,7.5,...,9.0,9.0,9.0,9.0,8.25,7.5,7.5,7.5,7.5,7.5
7,Delaware,DE,8.7,8.7,8.7,8.7,8.7,8.7,8.7,8.7,...,8.7,8.7,8.7,8.7,8.7,8.7,8.7,8.7,8.7,8.7
8,Florida,FL,5.5,5.5,5.5,5.5,5.5,5.5,5.5,5.5,...,5.5,5.5,5.5,5.5,5.5,5.5,4.46,4.46,5.5,5.5
9,Georgia,GA,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,5.75,5.75,5.75,5.75,5.75


In [22]:
df = df.sort_values(by=['Name'])
df = df.reset_index(drop= True)

Let us create another dataframe that calculates the changes in the tax rate from 2005-2023

In [None]:
df_change = df[["NAME", "STATE"]]
df_change.head()
df_change["change"] = df["2023"] - df["2005"]
df_change.head()

To get an idea of the changes in the tax rates of each state, we transpose the data and plot a line graph using matplotlib.

In [None]:
df_t = df.T 
df_t.columns=df_t.iloc[0]
df_t = df_t.tail(-2)
df_t.head()


In [None]:
# plot the data
ax = df_t.plot(figsize=(40,15))

# set x-axis label and values
ax.set_xlabel("Time")
ax.set_ylabel("Tax rate(%)")

plt.show()

In [None]:
shapefile = 's_08mr23/s_08mr23.shp'
gdf = gpd.read_file(shapefile)
gdf

In [None]:
#drop Puerto rico, Fed States of Micronesia, American Samoa, US Virgin Islands, Guam, Commonwealth of the northern mariana islands from file
gdf = gdf.drop(index=[2,37,44,52,54,55,56,57])
gdf = gdf.reset_index(drop=True)

In [None]:
len(gdf)

Creating the merged dataset that will include the Tax rate as a column

In [None]:
df_test = df_change[['STATE','change']].copy()
df_test.head()

In [None]:
gdf.plot()

We are going to clip Hawaii and Alaska and add them separately. 

In [None]:
alaska_gdf = gdf[gdf.STATE=='AK']
alaska_gdf = alaska_gdf.merge(df_test, on="STATE")
#clipping some of the western islands to make it fit better
polygon = Polygon([(-170,50),(-170,72),(-140, 72),(-140,50)])
alaska_gdf = alaska_gdf.clip(polygon)
alaska_gdf
alaska_gdf.plot()

In [None]:
polygon = Polygon([(-170,50),(-170,72),(-140, 72),(-140,40)])
gdf_tester = gdf.clip(polygon)
gdf.plot()

In [None]:
hawaii_gdf = gdf[gdf.STATE=='HI']
hawaii_gdf = hawaii_gdf.merge(df_test, on="STATE")
hawaii_gdf

Removing Alaska and Hawaii from the table due to mapping errors. Will add back later separately.

In [None]:
gdf_test = gdf.drop(gdf[gdf['STATE'] == 'AK'].index)
gdf_test = gdf_test.drop(gdf_test[gdf_test['STATE'] == 'HI'].index)
len(gdf_test)

In [None]:
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar

In [None]:
gdf_new = gdf_test.merge(df_test, on="STATE")
gdf_new

Plotting the values on a geographical map we get a heatmap:

In [None]:
# Print the map
# Set the range for the choropleth
title = 'Changes in the corporate state tax from 2005-2023'
col = 'change'
vmin = gdf_new[col].min()
vmax = gdf_new[col].max()
cmap = 'RdYlGn_r'
# Create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(20, 8))
# Remove the axis
ax.axis('off')
gdf_new.plot(column=col, ax=ax, edgecolor='0.8', linewidth=0.5, cmap=cmap, legend=True)
# Add a title
ax.set_title(title, fontdict={'fontname': 'Times New Roman','fontsize': '25', 'fontweight': '3'})
# Create an annotation for the data source

#Add Alaska
akax = fig.add_axes([0.1, 0.17, 0.2, 0.19])   
akax.axis('off')
alaska_gdf.plot(column=alaska_gdf["change"], cmap="RdYlGn_r", ax=akax, linewidth=0.5, edgecolor='0.8')

#Add Hawaii
hiax = fig.add_axes([.28, 0.20, 0.1, 0.1])      
hiax.axis('off')
hawaii_gdf.plot(column=hawaii_gdf["change"], cmap="RdYlGn_r", ax=hiax, linewidth=0.5, edgecolor='0.8')



#### NOtes: 
1. if the state is 0- it should be white indicating no change. Currently it is confusing since its a redder hue
2. 

In [None]:
df_change

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Sample data
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
business_applications = [100, 120, 140, 160, 180, 200, 220]
tax_rates = [10, 9, 8, 7, 6, 5, 4]

# Create a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the data
ax.scatter(years, business_applications, tax_rates, c='b', marker='o')

# Set labels for the axes
ax.set_xlabel('Year')
ax.set_ylabel('Number of Business Applications')
ax.set_zlabel('Tax Rate')

plt.title('3D Graph: Business Applications vs. Year vs. Tax Rate')

plt.show()





In [None]:
import plotly.express as px

# Sample data
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
business_applications = [100, 120, 140, 160, 180, 200, 220]
tax_rates = [10, 9, 8, 7, 6, 5, 4]

# Create a DataFrame with the data
import pandas as pd
df = pd.DataFrame({'Year': years, 'Business Applications': business_applications, 'Tax Rate': tax_rates})

# Create an interactive 3D scatter plot
fig = px.scatter_3d(df, x='Year', y='Business Applications', z='Tax Rate', title='3D Scatter Plot')

# Show the plot
fig.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample data
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
number_of_businesses = [100, 120, 140, 160, 180, 200, 220]
tax_rate = [10, 9, 8, 7, 6, 5, 4]

# Create a DataFrame
df = pd.DataFrame({'Year': years, 'Number of Businesses': number_of_businesses, 'Tax Rate': tax_rate})

# Calculate the correlation
correlation = df['Number of Businesses'].corr(df['Tax Rate'])

# Create the bubble plot
plt.figure(figsize=(10, 6))
plt.scatter(df['Number of Businesses'], df['Tax Rate'], s=correlation * 1000, alpha=0.6, c='blue', label='Correlation Bubble Plot')
plt.title('Bubble Plot: Number of Businesses vs. Tax Rate')
plt.xlabel('Number of Businesses')
plt.ylabel('Tax Rate')

# Annotate the correlation value
plt.annotate(f'Correlation: {correlation:.2f}', xy=(150, 7.5), fontsize=12, color='red')

plt.show()


#### Idea

ANOUSHKA'S PENDING WORK: 

Bubble chart - 
<br>size of bubble - GDP or population of the state
<br>x axis is tax rate
<br>y axis is # of businesses 

ALEX: (Graph 1)
First chart - 
<br>change the absolute values to share of # of business applications and see change in the shares for top 5 states
<br>keep existing charts as well so we can see which is giving better result

In [None]:
df.head()

In [None]:
pip install plotly==5.18.0

In [None]:
import plotly.express as px

df = px.data.gapminder()

fig = px.scatter(df.query("year==2007"), x="gdpPercap", y="lifeExp",
	         size="pop", color="continent",
                 hover_name="country", log_x=True, size_max=60)
fig.show()
df.head()

In [None]:
df.head()

for bubble chart, i would need a new df that is state, tax rate, population, # of business applications where 
x= tax rate
y = # of business registrations 
size = population 
color = 1 color or can decide 
download population 

make another that is all change - 2005-2021 

In [46]:
df_temp = df[['Name', 'State', '2022']]
df_temp.head()

Unnamed: 0,Name,State,2022
0,Alabama,AL,6.5
1,Alaska,AK,5.34
2,Arizona,AZ,4.9
3,Arkansas,AR,3.65
4,California,CA,8.84


In [59]:
df_biz = pd.read_csv('biz_apps_new.csv')

# Drop columns excluded from analyses (2022)
# These years were selected to take different datasets into account
df_biz.dtypes

Name     object
State    object
2005      int64
2006      int64
2007      int64
2008      int64
2009      int64
2010      int64
2011      int64
2012      int64
2013      int64
2014      int64
2015      int64
2016      int64
2017      int64
2018      int64
2019      int64
2020      int64
2021      int64
2022      int64
dtype: object

In [58]:
df_temp = df_temp.rename(columns={"2022":"tax_rate"})
df_temp["business_apps"] = df_biz['2022']
df_temp.head()

Unnamed: 0,Name,State,tax_rate,business_apps,pop
0,Alabama,AL,6.5,70750,5074296
1,Alaska,AK,5.34,8456,733583
2,Arizona,AZ,4.9,115080,7359197
3,Arkansas,AR,3.65,37562,3045637
4,California,CA,8.84,485954,39029342


In [64]:
df_pop = pd.read_csv("NST-EST2022-POP.csv")
df_pop = df_pop.drop(index=0)
df_pop = df_pop.reset_index()
df_pop.head()

Unnamed: 0,index,State,2005,2020,2021,2022
0,1,.Alabama,4569805,5031362,5049846,5074296
1,2,.Alaska,666946,732923,734182,733583
2,3,.Arizona,5839077,7179943,7264877,7359197
3,4,.Arkansas,2781097,3014195,3028122,3045637
4,5,.California,35827943,39501653,39142991,39029342


In [50]:
df_temp["pop"] = df_pop['2022']

In [51]:
df_temp['business_apps'] = df_temp['business_apps'].astype(int) 

In [52]:
df_temp.dtypes

Name              object
State             object
tax_rate         float64
business_apps      int64
pop                int64
dtype: object

In [53]:
fig = px.scatter(df_temp, x="tax_rate", y="business_apps", size="pop", color="Name", size_max=60,
                 title="State-wise business applications and tax rates in 2021", 
                 labels={
                     "tax_rate": "Tax Rate",
                     "business_apps": "Business Apps",
                     "Name": "States"})

fig.show()

In [55]:
#creating a bubble chart to show change- 

df_change = df[["Name", "State"]]
df_change.head()
df_change["change"] = df["2022"] - df["2005"]
df_change.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Name,State,change
0,Alabama,AL,0.0
1,Alaska,AK,-0.1
2,Arizona,AZ,-2.07
3,Arkansas,AR,-0.27
4,California,CA,0.0


In [65]:
df_change = df_change.rename(columns={"change":"tax_rate"})
df_change["business_apps"] = df_biz['2022'] - df_biz['2005']
df_change["pop"] = df_pop['2022'] - df_pop['2005']
df_change.head()

Unnamed: 0,Name,State,tax_rate,business_apps,pop
0,Alabama,AL,0.0,39672,504491
1,Alaska,AK,-0.1,3726,66637
2,Arizona,AZ,-2.07,58533,1520120
3,Arkansas,AR,-0.27,18429,264540
4,California,CA,0.0,214318,3201399


In [74]:
df_change['pop']

0      504491
1       66637
2     1520120
3      264540
4     3201399
5     1208038
6      119249
7      173246
8      104667
9     4402785
10    1986954
11     147467
12     510792
13     -27871
14     554421
15     236063
16     191851
17     329568
18      13613
19      66553
20     572281
21     578684
22     -17024
23     597586
24      34114
25     387657
26     182765
27     206426
28     745629
29      96739
30     609725
31     181070
32     544541
33    1993566
34     133172
35     292738
36     471203
37     626935
38     522018
39      25818
40    1012484
41     134331
42    1060282
43    7251449
44     923081
45      25849
46    1106514
47    1528481
48     -45336
49     346373
50      67224
Name: pop, dtype: int64

In [70]:
df_change.dtypes

Name              object
State             object
tax_rate         float64
business_apps      int64
pop                int64
dtype: object

In [72]:
fig = px.scatter(df_change, x="tax_rate", y="business_apps", size="pop", color="Name", size_max=60,
                 title="State-wise business applications and tax rates in 2021", 
                 labels={
                     "tax_rate": "Tax Rate",
                     "business_apps": "Business Apps",
                     "Name": "States"})

fig.show()

ValueError: 
    Invalid element(s) received for the 'size' property of scatter.marker
        Invalid elements include: [-27871]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above

In [75]:
df_gdp = pd.read_csv("GDP_bystate.csv")
df_gdp.head()

Unnamed: 0,State,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Alabama,158846.8,166469.0,172975.2,174526.3,170930.9,177249.2,183916.6,189245.5,194786.9,197406.9,202372.4,207368.4,214606.3,223859.3,231561.9,230892.1,254109.7,277817.5
1,Alaska,40356.6,45094.1,49583.7,55122.5,49957.8,53331.6,56896.3,58283.6,57247.7,56484.9,51490.9,50727.7,53301.5,54899.6,54728.2,50475.2,57349.4,63618.0
2,Arizona,227915.9,245957.0,261392.0,262926.0,246424.3,251153.0,260915.7,271440.0,278591.6,287666.6,299393.3,313081.4,332001.8,351879.5,372393.5,382072.3,420026.7,458949.8
3,Arkansas,90887.7,95875.1,98381.6,99706.8,97508.1,101486.5,105768.1,108492.1,113227.3,116139.4,117786.8,119152.4,122466.7,127535.7,131578.3,133969.1,148676.1,165220.6
4,California,1698560.4,1812210.0,1898902.0,1944695.3,1890165.9,1954092.7,2023500.0,2113096.4,2220389.9,2335286.5,2473555.9,2569634.0,2728743.1,2897200.7,3042694.1,3020173.4,3373240.7,3598102.7
