In [1]:
%matplotlib notebook

# load dependencies
import json
import requests
from matplotlib import pyplot as plt
from scipy.stats import linregress
from pprint import pprint
import pandas as pd
import scipy.stats as st

In [2]:
# Load the two source tables: median home values and burglary counts per state.
home_value = pd.read_csv('Resources/median_house_value.csv')
burglaries = pd.read_csv('Resources/burglaries_state.csv')

# Combine them on the shared 'State' column; the outer join keeps states that
# appear in only one of the two files.
house_df = burglaries.merge(home_value, how='outer', on='State')

# Preview the first rows to confirm the load and merge worked.
house_df.head()

Unnamed: 0,State,Burglaries,Abbreviation,Average FICO Score,Median Home Value,Average Outstanding Mortgage Amount,Difference Between Value and Mortgage Balance,Unnamed: 8
0,California,152555,CA,708,550800,363891,186909,
1,Texas,113902,TX,680,199900,177924,21976,
2,Florida,63396,FL,694,237900,188223,49677,
3,North Carolina,54447,NC,694,189900,162520,27380,
4,Ohio,43894,OH,705,142600,122939,19661,


In [3]:
# Keep only the columns used by the analysis below.
relevant_columns = ['State', 'Abbreviation', 'Median Home Value', 'Burglaries']
house_burglaries = house_df.loc[:, relevant_columns]

# Preview the trimmed frame to check for errors.
house_burglaries.head()

Unnamed: 0,State,Abbreviation,Median Home Value,Burglaries
0,California,CA,550800,152555
1,Texas,TX,199900,113902
2,Florida,FL,237900,63396
3,North Carolina,NC,189900,54447
4,Ohio,OH,142600,43894


In [4]:
# create scatter plot for median home value and total burglaries

# create dataframe for just median home value and burglary counts, indexed by state
house_b = house_burglaries[["State", "Median Home Value", "Burglaries"]].set_index("State")

# draw the scatter on the axes pandas returns, so the title is attached to this
# figure explicitly rather than to whichever figure pyplot considers current
ax = house_b.plot(kind='scatter', x='Median Home Value', y='Burglaries', color='purple')

# trailing semicolon suppresses the Text(...) repr in the cell output
ax.set_title('Median Home Value vs Burglary Counts');


<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Median Home Value vs Burglary Counts')

In [5]:
# Fit a least-squares line of burglary counts against median home value
x_values = house_burglaries['Median Home Value']
y_values = house_burglaries['Burglaries']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Draw on a dedicated figure so nothing is added to a previously active plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, color='purple')
# The fitted equation is shown as the legend entry for the regression line
ax.plot(x_values, regress_values, "r-", label=line_eq)
ax.set_xlabel('Median Home Value')
ax.set_ylabel('Total Burglaries')
ax.set_title('Median Home Value vs Burglary Counts')
ax.legend()

# print r-squared
print(f"The r-squared is: {rvalue**2}")

# save fig as output BEFORE showing it: on some backends plt.show() releases the
# figure, and a later savefig would write an empty image
fig.savefig('Image Output/house_value_burglaries_scatter')
plt.show()

The r-squared is: 0.014373952814003221


In [6]:
# Pull out the two series being compared and compute their Pearson correlation.
# st.pearsonr returns the coefficient first, followed by the p-value.
house_value, total_burglaries = (
    house_burglaries[column] for column in ('Median Home Value', 'Burglaries')
)
corr = st.pearsonr(house_value, total_burglaries)

In [7]:
# corr[0] is the Pearson coefficient between median home value and burglary counts
# (the message previously said "murders", which is not what was correlated)
print(f"The correlation coefficient between median home value and total burglaries is {round(corr[0], 2)}")

The correlation coefficient between median home value and total murders is 0.12


In [8]:
# BONUS: Generate the correlation matrix and find the strongest positive and negative correlations

# house_burglaries also holds text columns (State, Abbreviation). Restrict to numeric
# columns explicitly: recent pandas versions raise on non-numeric data in corr()
# instead of silently dropping it.
hm_corr = house_burglaries.select_dtypes(include='number').corr()

# Flatten the matrix to (column, column) pairs, ordered weakest to strongest
hm_corr.unstack().sort_values()

Median Home Value  Burglaries           0.119891
Burglaries         Median Home Value    0.119891
Median Home Value  Median Home Value    1.000000
Burglaries         Burglaries           1.000000
dtype: float64

In [9]:
# correlation map for fun

# Only numeric columns can be correlated; selecting them explicitly keeps this cell
# working on pandas versions where corr() no longer drops text columns itself.
full_corr = house_df.select_dtypes(include='number').corr()

plt.matshow(full_corr)
# Label rows/columns and add a colour scale so the matrix can be read on its own
plt.xticks(range(len(full_corr.columns)), full_corr.columns, rotation=90)
plt.yticks(range(len(full_corr.columns)), full_corr.columns)
plt.colorbar()
plt.show()

<IPython.core.display.Javascript object>