In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [None]:
# Relative path to the input CSV that is loaded into total_df
filepath = "Resources/final_data.csv"

In [None]:
# Read the source CSV into the main working DataFrame
total_df = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
# Let wide frames show up to 50 columns before pandas truncates the display
pd.options.display.max_columns = 50

In [None]:
# Preview the first rows only; dumping the whole frame bloats the notebook output
total_df.head()

## First, we calculate how much the U.S. as a whole is growing with and without migration

In [None]:
# Start the national totals table with one row per year, 2010 through 2019
us_totals_df = pd.Series(range(2010, 2020), name='Year').to_frame()

In [None]:
# National population per year: add up every row's POPESTIMATE<year> column, 2010-2019
us_total_population = total_df[
    [f'POPESTIMATE{year}' for year in range(2010, 2020)]
].sum()
total_population = us_total_population.tolist()

In [None]:
# National house count per year: add up every row's HUESTIMATE<year> column, 2010-2019
us_houses_by_year = total_df[
    [f'HUESTIMATE{year}' for year in range(2010, 2020)]
].sum()
total_houses = us_houses_by_year.tolist()

In [None]:
# National net migration per year: add up every row's NETMIG<year> column, 2010-2019
us_migrants_by_year = total_df[
    [f'NETMIG{year}' for year in range(2010, 2020)]
].sum()
us_migrants = us_migrants_by_year.tolist()

In [None]:
# Yearly housing cost for the country, taken as the plain (unweighted) average of
# every row's '<year> Mean Housing' value
us_mean_house_cost_by_year = total_df[
    [f'{year} Mean Housing' for year in range(2010, 2020)]
].mean()
mean_house_value = us_mean_house_cost_by_year.tolist()

In [None]:
# Attach the yearly aggregates to the totals table. The base population is derived
# from two of the new columns, so it is computed lazily once those columns exist.
us_totals_df = us_totals_df.assign(**{
    'Total Population per Year': total_population,
    'Total Houses per Year': total_houses,
    'Total Migrants per Year': us_migrants,
    'Total Base Population per Year': lambda totals: (
        totals['Total Population per Year'] - totals['Total Migrants per Year']
    ),
    'Mean Housing Cost per Year': mean_house_value,
})

In [None]:
#us_totals_df

In [None]:

# Houses available per person in each year
us_totals_df['Houses per Capita'] = us_totals_df['Total Houses per Year'].div(
    us_totals_df['Total Population per Year']
)

In [None]:
#us_totals_df

In [None]:
# Average U.S. family size for 2010-2019, hand-entered from
# https://www.statista.com/statistics/183657/average-size-of-a-family-in-the-us/
# (the last five years, 2015-2019, all share the value 3.14)
us_totals_df['Average Family Size'] = [3.16, 3.18, 3.13, 3.12, 3.13] + [3.14] * 5

In [None]:
# Houses available per average-sized family: houses per person scaled by family size
us_totals_df['Houses per Family'] = us_totals_df['Houses per Capita'].mul(
    us_totals_df['Average Family Size']
)
# Index the totals table by year
us_totals_df = us_totals_df.set_index('Year')

In [None]:
# Display the national totals table
us_totals_df

In [None]:
# Empty table of year-over-year changes, one row per year from 2010 through 2019
us_changes_df = pd.DataFrame(list(range(2010, 2020)), columns=['Year'])

In [None]:
# Fill the changes table: each column is the absolute year-over-year difference of
# one column of the totals table (the first row is NaN, as 2010 has no prior year)
us_changes_df = us_changes_df.assign(**{
    change_label: us_totals_df[totals_label].diff().tolist()
    for change_label, totals_label in [
        ('Base Population Change', 'Total Base Population per Year'),
        ('Change in Houses per Year', 'Total Houses per Year'),
        ('Change in Houses Cost per Year', 'Mean Housing Cost per Year'),
        ('Change in Migration per Year', 'Total Migrants per Year'),
        ('Change in Houses per Capita', 'Houses per Capita'),
    ]
})

In [None]:
# Display the table of year-over-year absolute changes
us_changes_df

In [None]:
# Empty table of year-over-year percentage changes, one row per year 2010-2019
us_percent_change_df = pd.DataFrame(data={'Year': list(range(2010, 2020))})

In [None]:
# Fill the table: each column is the fractional year-over-year change of one column
# of the totals table (the first row is NaN, as 2010 has no prior year)
us_percent_change_df = us_percent_change_df.assign(**{
    change_label: us_totals_df[totals_label].pct_change().tolist()
    for change_label, totals_label in [
        ('Base Population Change', 'Total Base Population per Year'),
        ('Change in Houses per Year', 'Total Houses per Year'),
        ('Change in Houses Cost per Year', 'Mean Housing Cost per Year'),
        ('Change in Migration per Year', 'Total Migrants per Year'),
        ('Change in Houses per Capita', 'Houses per Capita'),
    ]
})

In [None]:
# Display the table of year-over-year percentage changes
us_percent_change_df

## Look at the changes from year to year for each state and compare to the U.S. as a whole

In [None]:
# Per-state percentage-change table, seeded with one row per year 2010-2019
percent_change_df = pd.Series(range(2010, 2020), name='Year').to_frame()

In [None]:
# Population estimates for 2010-2019, one row per state, indexed by state name
delta_pop = total_df[
    ['State'] + [f'POPESTIMATE{year}' for year in range(2010, 2020)]
].set_index('State')

In [None]:
# Show the table flipped: years as rows, states as columns
delta_pop.transpose()

In [None]:
# Year-over-year fractional population change for every state, computed in one
# vectorised pass. The original loop re-transposed delta_pop on every iteration and
# inserted one column at a time; it also failed if a state label appeared twice.
# Rows are re-indexed 0..9 so they line up positionally with the 'Year' column.
state_pct_change = (
    delta_pop.T
    .pct_change()
    .reset_index(drop=True)
    .rename_axis(columns=None)
)
# Rebuild from the 'Year' column only, so re-running the cell never duplicates columns
percent_change_df = pd.concat([percent_change_df[['Year']], state_pct_change], axis=1)

In [None]:
# Display the per-state year-over-year percentage changes
percent_change_df

## Data Plots

In [None]:
# Turn 'Year' back into an ordinary column so it can be selected for plotting.
# Guarded on the index name: an unconditional reset_index() prepends a spurious
# 'index' / 'level_0' column each time the cell is re-run and eventually raises.
if us_totals_df.index.name == 'Year':
    us_totals_df = us_totals_df.reset_index()
us_totals_df

In [None]:
# Year-indexed view holding just the two series compared in the next plot
us_totals = us_totals_df.set_index("Year")[
    ["Total Base Population per Year", "Total Migrants per Year"]
]
us_totals

In [None]:
# Stacked line chart: how much of the population growth is from migration.
# Migrants are stacked on top of the base population, so the upper line is the
# sum of the two columns.
us_totals.plot.line(
    stacked=True,
    title="Total Popluation vs Total Migrations per year ",
)
plt.tight_layout()
plt.show()

In [None]:
# Total Population vs Total Migrations per State

In [None]:
# Per-state table of yearly population, net migration, housing units and mean
# housing cost, plus one summary column for each of population, migration and cost.
state_years = range(2010, 2020)
Pop_list = [f'POPESTIMATE{year}' for year in state_years]
Mig_list = [f'NETMIG{year}' for year in state_years]
House_list = [f'HUESTIMATE{year}' for year in state_years]
# Housing *cost* columns. The original averaged House_list (HUESTIMATE housing-unit
# counts) into 'Mean_House_Cost_Per_State', so the "cost" was really a unit count.
Cost_list = [f'{year} Mean Housing' for year in state_years]

Us_State = total_df[['State'] + Pop_list + Mig_list + House_list + Cost_list].set_index("State")

# NOTE(review): this sums the ten annual population estimates, i.e. each resident is
# counted once per year -- confirm that a 2010-2019 total (not a mean) is intended.
Us_State['Total_Population_Per_State'] = Us_State[Pop_list].sum(axis=1)
Us_State['Total_Migration_Per_State'] = Us_State[Mig_list].sum(axis=1)
Us_State['Mean_House_Cost_Per_State'] = Us_State[Cost_list].mean(axis=1)


In [None]:
# Turn 'State' back into an ordinary column. Guarded on the index name so that
# re-running the cell does not prepend a spurious 'index' column.
if Us_State.index.name == "State":
    Us_State = Us_State.reset_index()

In [None]:
# State-indexed view of the two totals compared in the next bar chart
us_states_totals = Us_State.set_index("State")[
    ["Total_Population_Per_State", "Total_Migration_Per_State"]
]

In [None]:
# Grouped bar chart: total population and total migration for each state
us_states_totals.plot.bar(
    figsize=(20, 5),
    title="Total Popluation and Total Migrations per State ",
)
plt.tight_layout()
plt.show()

In [None]:
# Make sure 'State' is an ordinary column before it is selected in the next cell.
# 'State' was already restored by an earlier cell, so an unconditional reset_index()
# here only prepended a junk 'index' column; reset only while 'State' is the index.
if Us_State.index.name == "State":
    Us_State = Us_State.reset_index()

In [None]:
# State-indexed view of total migration and mean house cost for the next bar chart
us_states_house_costs = Us_State.set_index("State")[
    ["Total_Migration_Per_State", "Mean_House_Cost_Per_State"]
]

In [None]:
# Grouped bar chart: total migration and mean house cost for each state
us_states_house_costs.plot.bar(
    figsize=(20, 5),
    title="House Costs vs Migrations per State ",
)
plt.tight_layout()
plt.show()

In [None]:
# Rebind us_totals to a year-indexed view of migration and mean housing cost
us_totals = us_totals_df.set_index("Year")[
    ["Total Migrants per Year", "Mean Housing Cost per Year"]
]

In [None]:
# Line chart of yearly migration against mean housing cost.
# The two series have different units (people vs. currency), so they are not stacked:
# stacking drew the cost line as migrants + cost, which has no meaning. Housing cost
# gets its own right-hand axis so both trends stay readable.
migration_ax = us_totals.plot(kind="line", secondary_y=["Mean Housing Cost per Year"])

# Title and axis labels so the figure stands on its own
migration_ax.set_title("Total Migrations per year Vs  Housing Costs per year ")
migration_ax.set_xlabel("Year")
migration_ax.set_ylabel("Total Migrants per Year")
migration_ax.right_ax.set_ylabel("Mean Housing Cost per Year")
plt.tight_layout()
plt.show()

In [None]:
# The first row (2010) has no prior year, so .diff() left it as NaN. Drop that row
# rather than zero-filling it: a fabricated (0, 0) observation would be counted as
# real data and bias the correlation and regression computed from this table.
us_changes_df = us_changes_df.dropna()

In [None]:
# Pull out the two yearly-change series that are compared below:
# housing cost on the x-axis, migration on the y-axis
House_Costs_per_year, Migration_per_year = (
    us_changes_df.loc[:, "Change in Houses Cost per Year"],
    us_changes_df.loc[:, "Change in Migration per Year"],
)

In [None]:
# Scatter plot of yearly housing-cost change against yearly migration change
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(
    House_Costs_per_year,
    Migration_per_year,
    marker="o",
    facecolors="red",
    edgecolors="black",
)
scatter_ax.set_title("House Costs per year vs Migrations per year")
scatter_ax.set_xlabel("Change in Houses Cost per Year")
scatter_ax.set_ylabel("Change in Migration per Year")
#plt.savefig("../Images/Scattereplot.png")
plt.show()

In [None]:
# Pearson correlation between the yearly housing-cost change and the yearly
# migration change; element 0 of the result is the coefficient r
correlation = stats.pearsonr(House_Costs_per_year, Migration_per_year)
print("The correlation between both factors is " + str(round(correlation[0], 2)))

In [None]:
# Least-squares line: migration change as a function of housing-cost change
slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    House_Costs_per_year, Migration_per_year
)
# Fitted y value for every observed x, and the equation as display text
regress_values = slope * House_Costs_per_year + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

In [None]:
# Scatter plot of the yearly changes with the fitted regression line overlaid
plt.scatter(House_Costs_per_year, Migration_per_year)
plt.plot(House_Costs_per_year, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (top-left corner). The original
# pinned it at data point (0, 500000), which can fall outside the plotted range and
# either hide the label or stretch the axes.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.title("House Costs per year vs Migrations per year")
plt.xlabel("Change in Houses Cost per Year")
plt.ylabel("Change in Migration per Year")
#plt.savefig("../Images/RegressionScattereplot.png")
plt.show()