In [47]:
# Dependencies and Setup
import csv
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress 
import seaborn as sns 
from scipy import stats
import scipy.stats as sts


Alicia's analysis 

(Disclaimer: Zillow Home Value Index (ZHVI): A smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. The raw version of that mid-tier ZHVI time series is also available.)

* Zillow also publishes ZHVI for all single-family residences (\$, typical value for all single-family homes in a given region), for condo/coops (\$), for all homes with 1, 2, 3, 4 and 5+ bedrooms (\$), and the ZHVI per square foot (\$, typical value of all homes per square foot calculated by taking the estimated home value for each home in a given region and dividing it by the home’s square footage).


In [48]:
# Relative path to the home-price CSV that is read into `real_estate_data`
# (resolved against the notebook's working directory).
real_estate_path = "../Data/Housing_Price/State_Home_Price_Single_Family.csv"

In [49]:
# Read the Zillow real-estate data into a wide DataFrame:
# one row per region, one column per month-end date.
# NOTE(review): the columns displayed below run from 1996-01-31 to 2020-12-31,
# not 2015-2020 as this comment used to say — confirm the intended date range.
real_estate_data = pd.read_csv(real_estate_path)
real_estate_data.head()

# NOTE(review): an earlier note here said "914 rows × 75 columns"; that does not
# match the 1996-2020 monthly columns shown below — re-check with real_estate_data.shape.

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,1996-05-31,...,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
0,9,0,California,State,CA,161546.0,161317.0,161169.0,160880.0,160685.0,...,572347.0,575413.0,577325.0,579140.0,582941.0,588980.0,596054.0,603158.0,610386.0,617633.0
1,54,1,Texas,State,TX,96983.0,97008.0,97036.0,97161.0,97297.0,...,209515.0,210311.0,211154.0,212057.0,213125.0,214406.0,215951.0,217863.0,219982.0,222237.0
2,43,2,New York,State,NY,138426.0,138113.0,137928.0,137649.0,137546.0,...,326694.0,327749.0,328801.0,330170.0,331930.0,334164.0,336900.0,340092.0,343510.0,346973.0
3,14,3,Florida,State,FL,100264.0,100360.0,100471.0,100681.0,100882.0,...,251098.0,252395.0,253659.0,254911.0,256194.0,257962.0,260077.0,262410.0,264768.0,267296.0
4,21,4,Illinois,State,IL,141172.0,140917.0,140595.0,140218.0,139699.0,...,207548.0,207938.0,208362.0,208898.0,209820.0,211074.0,212769.0,214659.0,216645.0,218622.0


In [52]:
# Collect the labels of the monthly price columns: every column of the wide
# frame that is not one of the region-identifier columns listed below.
column_names = ["RegionID", "SizeRank", "RegionName", "RegionType", "StateName"]
real_estate_date = [
    column for column in real_estate_data.columns if column not in column_names
]

In [63]:
# Reshape the wide table into long format: one dict per (region, month) pair.
real_estate_df = []
for _, region_row in real_estate_data.iterrows():
    # Identifier fields repeated on every monthly record of this region.
    region_fields = {
        "Region ID": region_row["RegionID"],
        "Size Rank": region_row["SizeRank"],
        "Region Name": region_row["RegionName"],
        "RegionType": region_row["RegionType"],
        "State Name": region_row["StateName"],
    }
    for date_label in real_estate_date:
        # Column labels are "YYYY-MM-DD"; the "Date" field is re-ordered to MM-DD-YYYY.
        date_parts = date_label.split("-")
        year, month, day = date_parts[0], date_parts[1], date_parts[2]
        record = dict(region_fields)
        record["Date"] = f"{month.rjust(2,'0')}-{day.rjust(2,'0')}-{year}"
        record["Year"] = year
        record["Month"] = month
        record["Housing Price"] = region_row[date_label]
        real_estate_df.append(record)
        


In [64]:
# Build the long-format DataFrame from the list of per-month records
# (one row per region and month) and display it.
new_RE_df = pd.DataFrame(real_estate_df)
new_RE_df

Unnamed: 0,Region ID,Size Rank,Region Name,RegionType,State Name,Date,Year,Month,Housing Price
0,9,0,California,State,CA,01-31-1996,1996,01,161546.0
1,9,0,California,State,CA,02-29-1996,1996,02,161317.0
2,9,0,California,State,CA,03-31-1996,1996,03,161169.0
3,9,0,California,State,CA,04-30-1996,1996,04,160880.0
4,9,0,California,State,CA,05-31-1996,1996,05,160685.0
...,...,...,...,...,...,...,...,...,...
15295,62,50,Wyoming,State,WY,08-31-2020,2020,08,257141.0
15296,62,50,Wyoming,State,WY,09-30-2020,2020,09,257853.0
15297,62,50,Wyoming,State,WY,10-31-2020,2020,10,258747.0
15298,62,50,Wyoming,State,WY,11-30-2020,2020,11,260035.0


In [69]:
# Drop every record that has a missing value in any column.
new_RE_df = new_RE_df.dropna(how="any")

# De-duplicate, keep only the text before the first comma of "Region Name",
# then expose that column as "City Name".
# Written as one chain of copy-returning calls so that no column is assigned on
# an intermediate filtered frame (which can trigger SettingWithCopyWarning).
no_duplicate_RE_df = (
    new_RE_df
    .drop_duplicates()
    .assign(**{"Region Name": lambda frame: frame["Region Name"].str.split(",").str[0]})
    .rename(columns={"Region Name": "City Name"})
)

Unnamed: 0,Region ID,Size Rank,City Name,RegionType,State Name,Date,Year,Month,Housing Price
0,9,0,California,State,CA,01-31-1996,1996,01,161546.0
1,9,0,California,State,CA,02-29-1996,1996,02,161317.0
2,9,0,California,State,CA,03-31-1996,1996,03,161169.0
3,9,0,California,State,CA,04-30-1996,1996,04,160880.0
4,9,0,California,State,CA,05-31-1996,1996,05,160685.0
...,...,...,...,...,...,...,...,...,...
15295,62,50,Wyoming,State,WY,08-31-2020,2020,08,257141.0
15296,62,50,Wyoming,State,WY,09-30-2020,2020,09,257853.0
15297,62,50,Wyoming,State,WY,10-31-2020,2020,10,258747.0
15298,62,50,Wyoming,State,WY,11-30-2020,2020,11,260035.0


In [73]:
# Take the last `n` columns of the long-format frame.
# NOTE(review): the output below still shows every column, i.e. the frame has
# fewer than 132 columns and this slice is a no-op. The same slice is applied to
# the wide frame in a later cell — confirm whether a row filter was intended here.
n = 132
just_date_df = no_duplicate_RE_df.iloc[:,-n:]

just_date_df

Unnamed: 0,Region ID,Size Rank,City Name,RegionType,State Name,Date,Year,Month,Housing Price
0,9,0,California,State,CA,01-31-1996,1996,01,161546.0
1,9,0,California,State,CA,02-29-1996,1996,02,161317.0
2,9,0,California,State,CA,03-31-1996,1996,03,161169.0
3,9,0,California,State,CA,04-30-1996,1996,04,160880.0
4,9,0,California,State,CA,05-31-1996,1996,05,160685.0
...,...,...,...,...,...,...,...,...,...
15295,62,50,Wyoming,State,WY,08-31-2020,2020,08,257141.0
15296,62,50,Wyoming,State,WY,09-30-2020,2020,09,257853.0
15297,62,50,Wyoming,State,WY,10-31-2020,2020,10,258747.0
15298,62,50,Wyoming,State,WY,11-30-2020,2020,11,260035.0


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) of the housing
# price for each "City Name" group.
# NOTE: despite the variable name, the grouping key here is "City Name", not the year.
RE_groupby_year = no_duplicate_RE_df.groupby("City Name")["Housing Price"].describe()
RE_groupby_year


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
City Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,300.0,117584.553333,16411.160276,84970.0,106239.75,118477.0,129155.75,157366.0
Alaska,300.0,229171.976667,58281.037018,123039.0,177351.25,256828.5,276518.5,308644.0
Arizona,300.0,185399.09,52895.016738,107541.0,141318.0,173061.0,228000.75,309122.0
Arkansas,300.0,102432.476667,17634.281844,70612.0,86513.75,107035.0,112755.25,141095.0
California,300.0,370375.126667,130463.013296,160421.0,267234.5,358818.5,488006.25,617633.0
Colorado,300.0,253700.92,72590.719284,139855.0,223126.25,237574.5,271083.25,436751.0
Connecticut,300.0,240860.396667,50015.892603,150063.0,209651.5,246096.5,269686.75,327203.0
Delaware,300.0,214871.046667,46211.515644,132731.0,171479.5,225505.0,251887.0,281923.0
District of Columbia,300.0,405954.616667,155530.036254,163166.0,246755.25,431759.5,533401.25,667854.0
Florida,300.0,175818.346667,52222.436689,100264.0,137171.75,167656.5,220411.5,274280.0


In [11]:
# Keep only the last `n` monthly columns of the wide frame.
# 132 columns = 11 years x 12 months; in the output below that is
# 2010-01-31 through 2020-12-31.
n = 132
just_date_df = real_estate_data.iloc[:,-n:]
just_date_df.head()

Unnamed: 0,2010-01-31,2010-02-28,2010-03-31,2010-04-30,2010-05-31,2010-06-30,2010-07-31,2010-08-31,2010-09-30,2010-10-31,...,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
0,332582.0,332386.0,333160.0,334332.0,336046.0,335580.0,334294.0,332638.0,331331.0,329607.0,...,572347.0,575413.0,577325.0,579140.0,582941.0,588980.0,596054.0,603158.0,610386.0,617633.0
1,136812.0,136787.0,136800.0,136892.0,136918.0,136799.0,136335.0,135809.0,135281.0,134857.0,...,209515.0,210311.0,211154.0,212057.0,213125.0,214406.0,215951.0,217863.0,219982.0,222237.0
2,264463.0,264417.0,264267.0,264445.0,264524.0,264905.0,265038.0,264818.0,264212.0,263300.0,...,326694.0,327749.0,328801.0,330170.0,331930.0,334164.0,336900.0,340092.0,343510.0,346973.0
3,159166.0,158410.0,157902.0,157424.0,156885.0,156082.0,154992.0,153705.0,152347.0,150864.0,...,251098.0,252395.0,253659.0,254911.0,256194.0,257962.0,260077.0,262410.0,264768.0,267296.0
4,188510.0,188242.0,186865.0,186737.0,185588.0,185612.0,183423.0,182278.0,180464.0,179479.0,...,207548.0,207938.0,208362.0,208898.0,209820.0,211074.0,212769.0,214659.0,216645.0,218622.0


In [12]:
# Drop the first two columns of the wide frame by position; the output below
# starts at "RegionName". Show the last rows.
new_df = real_estate_data.iloc[:,2:]
new_df.tail()

Unnamed: 0,RegionName,RegionType,StateName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,1996-05-31,1996-06-30,1996-07-31,...,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31
46,Alaska,State,AK,123039.0,123453.0,123723.0,124385.0,124894.0,125493.0,126034.0,...,293735.0,290309.0,289323.0,288561.0,288802.0,288676.0,289437.0,292762.0,296278.0,300283.0
47,North Dakota,State,ND,,,,,,,,...,233357.0,233808.0,234145.0,234499.0,234758.0,234945.0,235215.0,235865.0,236960.0,238112.0
48,Vermont,State,VT,109623.0,109611.0,109523.0,109387.0,109201.0,108922.0,108606.0,...,262924.0,263547.0,264612.0,265671.0,266863.0,267844.0,268657.0,270307.0,272687.0,275472.0
49,District of Columbia,State,DC,169574.0,168802.0,168120.0,166953.0,166039.0,165388.0,164828.0,...,649616.0,651147.0,650328.0,650605.0,651444.0,654382.0,657785.0,660679.0,664978.0,667854.0
50,Wyoming,State,WY,,,,,,,,...,254521.0,254999.0,255371.0,255885.0,256404.0,257141.0,257853.0,258747.0,260035.0,261262.0


In [None]:
# n = 132
# just_date_V2_df = new_df.iloc[:,[2,-n:]]
# just_date_V2_df.head()
    


In [None]:
# Trimmed wide frame: the column at position 2 (the region name) followed by
# the last `n` monthly price columns.
n = 132
region_name_column = real_estate_data.iloc[:, 2]
recent_month_columns = real_estate_data.iloc[:, -n:]
df1 = pd.concat([region_name_column, recent_month_columns], axis=1)
df1.head()

In [None]:
# Group the trimmed wide frame by region name. `by_region_df` is a GroupBy
# object, not a DataFrame; .first() displays the first row of each group.
by_region_df = df1.groupby("RegionName")
by_region_df.first()

In [None]:
# `by_region_df` is a GroupBy object and has no reset_index(); aggregate it
# first (first row per region, as displayed in the previous cell), then turn
# the "RegionName" index back into a regular column.
reset_by_region = by_region_df.first().reset_index()
reset_by_region

In [None]:
# Keep only the identifier columns (size rank, region name, state abbreviation).
cut_estate_df = real_estate_data[["SizeRank", "RegionName", "StateName"]]
cut_estate_df.head()

In [None]:
# pd.read_csv(real_estate_path)["SizeRank"].value_counts(ascending=True)

In [None]:
# Collect the labels of the monthly price columns: every column that is not a
# region-identifier column. "RegionType" is an identifier column of the loaded
# file too, so it is excluded; otherwise it would be treated as a date label.
column_names = ["RegionName", "StateName", "SizeRank", "RegionID", "RegionType"]
real_estate_date = [x for x in real_estate_data.columns if x not in column_names]

In [None]:
# Reshape the wide table into long format: one dict per (region, month) pair.
real_estate_df = []
for i, row in real_estate_data.iterrows():
    region_name = row["RegionName"]
    state_name = row["StateName"]
    size_rank = row["SizeRank"]
    region_id = row["RegionID"]
    for x in real_estate_date:
        # The file loaded above labels its monthly columns "YYYY-MM-DD"; the
        # "M/D/YYYY" layout this cell was first written for is still accepted.
        if "-" in x:
            dat = x.split("-")
            if len(dat) != 3:
                continue
            year, month, day = dat
        elif "/" in x:
            dat = x.split("/")
            if len(dat) != 3:
                continue
            month, day, year = dat
        else:
            # Not a date label (e.g. a leftover identifier column) — skip it
            # instead of failing with IndexError.
            continue
        a ={
            "Region Name": region_name,
            "State Name": state_name,
            "Size Rank": size_rank,
            "Region ID": region_id,
            "Date": f"{month.rjust(2, '0')}-{day.rjust(2, '0')}-{year}",
            "Year": year,
            "Month": month,
            "Housing Price":row[x]
        }
        real_estate_df.append(a)
        


In [None]:
# Build the long-format DataFrame from the list of per-month records
# and preview the first rows.
new_RE_df = pd.DataFrame(real_estate_df)
new_RE_df.head()

In [None]:
# Drop every record that has a missing value in any column.
new_RE_df = new_RE_df.dropna(how="any")

# De-duplicate, keep only the text before the first comma of "Region Name",
# then expose that column as "City Name".
# Written as one chain of copy-returning calls so that no column is assigned on
# an intermediate filtered frame (which can trigger SettingWithCopyWarning).
no_duplicate_RE_df = (
    new_RE_df
    .drop_duplicates()
    .assign(**{"Region Name": lambda frame: frame["Region Name"].str.split(",").str[0]})
    .rename(columns={"Region Name": "City Name"})
)


In [None]:
# Per-year summary statistics (count, mean, std, min, quartiles, max)
# of the housing price across all regions.
RE_groupby_year = no_duplicate_RE_df.groupby("Year")["Housing Price"].describe()
RE_groupby_year


In [None]:
# Line chart of the yearly mean housing price.
# x-axis: the index of RE_groupby_year (the years); y-axis: its "mean" column.
fig, ax = plt.subplots()
ax.plot(RE_groupby_year.index.values, RE_groupby_year["mean"])

# Axis labels and title
ax.set_xlabel("Year")
ax.set_ylabel("Average Housing Price")
ax.set_title("Change in Avg Housing Price Per Year")

ax.grid()
plt.show()

# Observation recorded by the author: positive relationship, housing price
# increasing over the years.
# TODO: add x-lim and y-lim


In [None]:
# Yearly maximum housing price, plotted in thousands of dollars.
# Assumes RE_groupby_year is the per-year summary (index = year, "max" column).
fig, ax = plt.subplots()
ax.plot(RE_groupby_year.index.values, RE_groupby_year["max"]/1000)

# Axis labels and title (the y-label used to read "Housing Price Price")
ax.set_xlabel("Year")
ax.set_ylabel("Max. Housing Price (x 1,000)")
ax.set_title("The changes in premium housing price per year")

ax.grid()
plt.show()


In [None]:
# RE_groupby_year["max"]/10000

In [None]:
# Yearly minimum housing price.
# Assumes RE_groupby_year is the per-year summary (index = year, "min" column).
fig, ax = plt.subplots()
ax.plot(RE_groupby_year.index.values, RE_groupby_year["min"])

# Axis labels and title (the y-label used to read "Housing Price Price")
ax.set_xlabel("Year")
ax.set_ylabel("Min. Housing Price")
ax.set_title("The changes in more affordable housing price per year")

ax.grid()
plt.show()


In [None]:
# Summary statistics of the housing price for every (state, year) pair.
RE_groupby_state = (
    no_duplicate_RE_df
    .groupby(["State Name", "Year"])["Housing Price"]
    .describe()
)

In [None]:
# Median housing price per (state, year): the "50%" column of the describe() summary.
RE_groupby_state_median = RE_groupby_state["50%"]
# RE_groupby_state_median.head()


In [None]:
# Lowest and highest housing price for every (state, year, city) combination;
# the result has a three-level index and two columns, "min" and "max".
state_year_min_max = no_duplicate_RE_df.groupby(["State Name", "Year", "City Name"])["Housing Price"].agg(["min", "max"])
# state_year_min_max
# state_year_median = no_duplicate_RE_df.groupby(["State Name", "Year", "City Name"])["Housing Price"].median()
# state_year_median




In [None]:
# Flatten the index, order the rows by state and then by ascending "max" price,
# and display the last row of each (state, year) group.
state_year_min_max_2 = (
    state_year_min_max
    .reset_index()
    .sort_values(["State Name", "max"])
)
state_year_min_max_2.groupby(["State Name", "Year"]).last()

In [None]:
# For every state, track the lowest and highest housing price ever seen,
# together with the city and the year in which each occurred.
state_year_min_max2 = state_year_min_max.reset_index()

# state -> [lowest price, highest price, city of lowest, city of highest,
#           year of lowest, year of highest]
ls = {}
for i, row in state_year_min_max2.iterrows():
    state = row["State Name"]
    if state not in ls:
        # Sentinels: a very large "lowest" and a zero "highest" so that the
        # first row of each state replaces both.
        ls[state] = [10000000000, 0, "", "", 0, 0]
    extremes = ls[state]
    if extremes[0] > row["min"]:
        extremes[0] = row["min"]
        extremes[2] = row["City Name"]
        extremes[4] = row["Year"]
    if extremes[1] < row["max"]:
        extremes[1] = row["max"]
        extremes[3] = row["City Name"]
        extremes[5] = row["Year"]

# One row per state, columns 0..5 in the order described above.
max_min_state_year = pd.DataFrame(ls).T
max_min_state_year.head()

In [None]:
# Replace the positional column labels 0..5 with descriptive names.
price_extreme_labels = {
    0: "Min. Housing Price",
    1: "Max. Housing Price",
    2: "City of Min. Housing Price",
    3: "City of Max. Housing Price",
    4: "Year of Min. Housing Price",
    5: "Year of Max. Housing Price",
}
max_min_state_year = max_min_state_year.rename(columns=price_extreme_labels)

max_min_state_year.head()

In [None]:
# Exclude the row whose index label is "Country", if present, so only the
# individual states remain.
max_min_state_year1 = max_min_state_year[max_min_state_year.index !="Country"]
# max_min_state_year1.head()

In [None]:
# Bar chart of the per-state price extremes (lowest vs. highest housing price).
max_housing_price_bar = max_min_state_year1.plot(kind="bar", figsize=(15,5))

# Label the Axes returned by pandas directly.
max_housing_price_bar.set_xlabel("State Name")
max_housing_price_bar.set_ylabel("Housing Price (Million $)")
max_housing_price_bar.set_title("The Min. & Max housing price of each State")


In [None]:
# Median housing price per state, without the "Country" entry (if present).
state_year_median = no_duplicate_RE_df.groupby("State Name")["Housing Price"].median()
state_year_median = state_year_median.drop("Country", errors="ignore")

In [None]:
# Bar chart of the median housing price of each state
# (state_year_median holds medians, not averages).
state_year_median_bar = state_year_median.plot(kind="bar", figsize=(15,5))
state_year_median_bar.set_xlabel("State Name")
state_year_median_bar.set_ylabel("Housing Price")


In [None]:
# Load the homeless-population CSV (path relative to the notebook's working
# directory) and preview the first rows.
homelessdata = "../Data/PIT_AllYearsData_Output.csv"
homeless_df = pd.read_csv(homelessdata)
homeless_df.head()

In [None]:
# Average every numeric column of the homeless data per state.
# numeric_only=True keeps text columns out of the mean: recent pandas raises
# TypeError on them, where older releases silently dropped them.
homeless_df_state = homeless_df.groupby("State").mean(numeric_only=True)
# homeless_df_state.head()

In [None]:
# Summary statistics of the housing price per state, over all years.
RE_groupby_state1 = no_duplicate_RE_df.groupby("State Name")["Housing Price"].describe()
RE_groupby_state1.head()

In [None]:
# The per-state summary may contain a "Country" row; keep only the rows whose
# index label is not "Country".
RE_no_country_df1 = RE_groupby_state1[RE_groupby_state1.index !="Country"]
# RE_no_country_df1


In [None]:
# Inner-join the per-state price summary (without "Country") with the per-state
# homeless averages, matching rows on the index of both frames.
joined_RE_HL = RE_no_country_df1.merge(
    homeless_df_state,
    how="inner",
    left_index=True,
    right_index=True,
)
joined_RE_HL.head()


In [None]:
# Scatter plot: average housing price (x) vs. overall homeless count (y),
# one labelled point per state, with a least-squares regression line.
x = joined_RE_HL["mean"]
y = joined_RE_HL["Overall Homeless"]

# Pearson correlation between the two series
correlation = sts.pearsonr(x, y)
print(correlation)

print(f"The correlation between average housing price and homeless population is {round(correlation[0],2)}")

labels = joined_RE_HL.index

fig, ax = plt.subplots(1, figsize=(10, 6))
fig.suptitle('Housing Price Vs. Homeless Population (All Rank Size)')

# Draw the points once (a second, redundant scatter used to be drawn on top).
ax.scatter(x, y,
           color="blue",    # Color of the dots
           s=100,           # Size of the dots
           alpha=0.5,       # Alpha/transparency of the dots (1 is opaque, 0 is transparent)
           linewidths=1)    # Size of edge around the dots

# Label every point with its index value (the state)
for x_pos, y_pos, label in zip(x, y, labels):
    ax.annotate(label,             # The label for this point
                xy=(x_pos, y_pos), # Position of the corresponding point
                xytext=(7, 0),     # Offset text by 7 points to the right
                textcoords='offset points', # tell it to use offset points
                ha='left',         # Horizontally aligned to the left
                va='center')       # Vertical alignment is centered

# Least-squares regression line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
regress_values = x * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
ax.set_xlabel('Average Housing Price')
ax.set_ylabel('Overall Homeless Population')
# Anchor the equation in axes-fraction coordinates so it stays inside the plot
# whatever the data range is (it used to sit at a fixed data coordinate).
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.plot(x,regress_values,"r-")

# Show the plot
plt.show()

In [None]:
# Remove New York and California from the per-state homeless averages
# (NY first, then CA).
homeless_no_NY = homeless_df_state.drop(index="NY", errors="ignore")
homeless_no_CA_NY = homeless_no_NY.drop(index="CA", errors="ignore")

# Remove the same two states from the per-state real-estate summary
# (CA first, then NY).
RE_no_CA = RE_no_country_df1.drop(index="CA", errors="ignore")
RE_no_CA_NY = RE_no_CA.drop(index="NY", errors="ignore")

In [None]:
# Inner-join the real-estate summary and the homeless averages (both without
# NY and CA), matching rows on the index of both frames.
joined_no_NY_CA = RE_no_CA_NY.merge(
    homeless_no_CA_NY,
    how="inner",
    left_index=True,
    right_index=True,
)


In [None]:
# Scatter plot without NY and CA: average housing price (x) vs. overall
# homeless count (y), one labelled point per state, with a regression line.
x = joined_no_NY_CA["mean"]
y = joined_no_NY_CA["Overall Homeless"]

# Pearson correlation between the two series
correlation = sts.pearsonr(x, y)
print(correlation)

print(f"The correlation between average housing price and homeless population is {round(correlation[0],2)}")

labels = joined_no_NY_CA.index

fig, ax = plt.subplots(1, figsize=(10, 6))
fig.suptitle('Avg Housing $ Vs. Homeless Population (Without NY & CA - all rank size)')

# Draw the points once (a second, redundant scatter used to be drawn on top).
ax.scatter(x, y,
           color="blue",    # Color of the dots
           s=100,           # Size of the dots
           alpha=0.5,       # Alpha/transparency of the dots (1 is opaque, 0 is transparent)
           linewidths=1)    # Size of edge around the dots

# Label every point with its index value (the state)
for x_pos, y_pos, label in zip(x, y, labels):
    ax.annotate(label,             # The label for this point
                xy=(x_pos, y_pos), # Position of the corresponding point
                xytext=(7, 0),     # Offset text by 7 points to the right
                textcoords='offset points', # tell it to use offset points
                ha='left',         # Horizontally aligned to the left
                va='center')       # Vertical alignment is centered

# Least-squares regression line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
regress_values = x * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
ax.set_xlabel('Average Housing Price')
ax.set_ylabel('Overall Homeless Population')
# Anchor the equation in axes-fraction coordinates so it stays inside the plot
# whatever the data range is (it used to sit at a fixed data coordinate).
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.plot(x,regress_values,"r-")

# Show the plot
plt.show()

In [None]:
# Relative path to the CSV that is read into `state_population1`
# (resolved against the notebook's working directory).
state_population = "../Data/State_Names_Sheet3.csv"

In [None]:
# Read the state population CSV and index it by its "State" column so it can
# be joined with the other per-state frames on the index.
state_population1 = pd.read_csv(state_population).set_index('State')
state_population1.head()

In [None]:
# homeless_df_state

In [None]:
# Inner-join state population with the per-state homeless averages,
# matching rows on the index of both frames.
joined_population_homeless = state_population1.merge(
    homeless_df_state,
    how="inner",
    left_index=True,
    right_index=True,
)
joined_population_homeless.head()

In [None]:
# Homelessness rate in percent: overall homeless count divided by the state
# population, times 100.
# Note the trailing space in the "Population " column label.
overall_homeless = joined_population_homeless["Overall Homeless"]
state_population_count = joined_population_homeless["Population "]
joined_population_homeless["Homelessness_rate"] = overall_homeless / state_population_count * 100



In [None]:
# Inner-join the per-state price summary with the population / homelessness
# frame, matching rows on the index of both frames.
joined_2 = RE_no_country_df1.merge(
    joined_population_homeless,
    how="inner",
    left_index=True,
    right_index=True,
)
joined_2.head()


In [None]:
# Scatter plot: average housing price (x) vs. homelessness rate in percent (y),
# one labelled point per state, with a least-squares regression line.
x = joined_2["mean"]
y = joined_2["Homelessness_rate"]

# Pearson correlation between the two series
correlation = sts.pearsonr(x, y)
print(correlation)

print(f"The correlation between average housing price and homeless rate is {round(correlation[0],2)}")

labels = joined_2.index

fig, ax = plt.subplots(1, figsize=(10, 6))
fig.suptitle('Avg Housing $ Vs. Homeless rate % (all rank size)')

# Draw the points once (a second, redundant scatter used to be drawn on top).
ax.scatter(x, y,
           color="blue",    # Color of the dots
           s=100,           # Size of the dots
           alpha=0.5,       # Alpha/transparency of the dots (1 is opaque, 0 is transparent)
           linewidths=1)    # Size of edge around the dots

# Label every point with its index value (the state)
for x_pos, y_pos, label in zip(x, y, labels):
    ax.annotate(label,             # The label for this point
                xy=(x_pos, y_pos), # Position of the corresponding point
                xytext=(7, 0),     # Offset text by 7 points to the right
                textcoords='offset points', # tell it to use offset points
                ha='left',         # Horizontally aligned to the left
                va='center')       # Vertical alignment is centered

# Least-squares regression line y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
regress_values = x * slope + intercept
# The slope is a percentage per dollar, i.e. a very small number: format with
# significant digits, because round(slope, 2) would print it as 0.0.
line_eq = f"y = {slope:.3g}x + {intercept:.3g}"
ax.set_xlabel('Average Housing Price')
ax.set_ylabel('Homelessness rate')
# Anchor the equation in axes-fraction coordinates so it stays inside the plot
# whatever the data range is (it used to sit at a fixed data coordinate).
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.plot(x,regress_values,"r-")

# Show the plot
plt.show()