Step 1: Load in different data tools

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick


Step 2: Load in the housing data and clean it

In [68]:
#Read the dataframe and name it housing_data
housing_data = pd.read_csv('housing_data_final.csv')

Step 3: Change 'Median Sale Price' to numeric data types and clean the data

In [69]:
#Change 'Median Sale Price' to numeric
housing_data['Median Sale Price'] = housing_data['Median Sale Price'].replace(r'[\$,K]', '', regex=True).astype(float)
housing_data['Median Sale Price'] = housing_data['Median Sale Price']

Step 4: Keep only necessary columns

In [70]:
#Drop all columns after 'Median Sale Price'
necessary_housing_data = housing_data[['Region', 'Month of Period End', 'Median Sale Price']]

Step 5: Create a new column for 'Year' and drop the 'Month of Period End' column

In [71]:
#Take the last four characters from 'Month of Period End' column and create a new column 'Year'
necessary_housing_data['Year'] = housing_data['Month of Period End'].str[-4:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  necessary_housing_data['Year'] = housing_data['Month of Period End'].str[-4:]


In [72]:
#Drop the 'Month of Period End' column
necessary_housing_data = necessary_housing_data.drop(columns=['Month of Period End'])

Step 6: Group by 'Region' and 'Year' and take the mean of 'Median Sale Price'

In [73]:
#Group by 'Region' and 'Year' and take the mean of 'Median Sale Price'
grouped_housing_data = necessary_housing_data.groupby(['Region', 'Year']).mean().reset_index()

#Multiply the 'Median Sale Price' column by 1000 to get the actual sale price
grouped_housing_data['Median Sale Price'] = grouped_housing_data['Median Sale Price'] * 1000

Step 7: Clean the 'Region' column to remove everything after the comma

In [74]:
#Take the Region column and remove everything after the comma
grouped_housing_data['Region'] = grouped_housing_data['Region'].str.split(',').str[0]

Step 8: Change Dalls to DFW in the 'Region' column

In [75]:
#Change Dallas to DFW in the 'Region' column
grouped_housing_data['Region'] = grouped_housing_data['Region'].replace('Dallas', 'DFW')

Step 9: Change Washington in Region column to WashingtonDC

In [76]:
#Change Washington to WashingtonDC in the 'Region' column
grouped_housing_data['Region'] = grouped_housing_data['Region'].replace('Washington', 'WashingtonDC')

Step 10: Change 'Median Sale Price' to integer data type

In [77]:
#Convert 'Median Sale Price' to integer data type.
grouped_housing_data['Median Sale Price'] = grouped_housing_data['Median Sale Price'].astype(int)

Step 11: Change add underscores to 'Median Sale Price' column name

In [78]:
#Add underscores to 'Median Sale Price' column name
necessary_housing_data = necessary_housing_data.rename(columns={'Median Sale Price': 'Median_Sale_Price'})

In [80]:
#Give me the info of the grouped_housing_data dataframe
grouped_housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Region             292 non-null    object
 1   Year               292 non-null    object
 2   Median Sale Price  292 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 7.0+ KB


In [81]:
#export the new grouped_housing_data dataframe to a csv file
grouped_housing_data.to_csv('grouped_housing_data.csv', index=False)