### Importing libs

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px


## First df
#### This dataset records real estate sales from 2001 to 2020 in 170 towns in the US

In [None]:

# Load the real-estate sales CSV with 'Date Recorded' as the index and ask
# pandas to parse that index as dates, then order the rows by the index.
# The path gets its own name so `df` only ever refers to the DataFrame, and
# the deprecated `infer_datetime_format` argument is omitted (pandas >= 2.0
# infers the format by default).
re_sale_path = Path('csvfiles/re_sale.csv')
df = pd.read_csv(re_sale_path, index_col='Date Recorded', parse_dates=True).sort_index()

display(df)
df.columns

In [None]:
# Build the trimmed frame straight from `df`: drop the columns that are not
# used further on, then label every remaining missing value as 'Unknown'.
# (The earlier `df.fillna('Unkown')` result was overwritten before it was ever
# used, so it is omitted here.)
california_df = (
    df
    .drop(columns=['Address', 'Serial Number', 'Assessed Value', 'Sales Ratio', 'Property Type'])
    .replace(np.nan, 'Unknown')
)

display(california_df.head(10))
display(california_df.tail(10))

## Second df
#### This dataset contains City of Hartford real estate sales for the last two years, with comprehensive records including property ID, parcel ID, sale date, sale price and more. It is updated nightly and sourced from an official, reliable source.

In [None]:
# Load the Hartford sales CSV indexed by 'SaleDate' (parsed as dates), order
# the rows by that index and label missing values as 'Unknown'.
# `infer_datetime_format` is deprecated in pandas 2.0 and is omitted.
df2_path = Path('csvfiles/real-estate-sales-730-days-1.csv')
df2 = (
    pd.read_csv(df2_path, index_col='SaleDate', parse_dates=True)
    .sort_index()
    .replace(np.nan, 'Unknown')
)
display(df2.head(10))

# Subset of columns describing each sale.
df2_cleaned = df2[[
    'OwnerFirstName',
    'OwnerLastName',
    'StreetNameAndWay',
    'PrimaryGrantor',
    'SalePrice',
    'LegalReference',
    'ParcelID',
]]

# Only 'ParcelID' (plus the SaleDate index) is carried forward; selecting it
# directly is equivalent to dropping the other six columns one by one.
df2_final = df2_cleaned[['ParcelID']]
display(df2_final.head(10))
display(df2_final.tail(10))
df2_final.columns

### Third df
#### This table contains property sales information including sale date, price, and amounts for properties within Fairfax County

In [None]:
# Load the Fairfax County sales CSV indexed by 'SALEDT' (parsed as dates) and
# order the rows by that index. `infer_datetime_format` is deprecated in
# pandas 2.0 and is omitted.
df3_path = Path('csvfiles/Tax_Administration_s_Real_Estate_-_Sales_Data_cutdown.csv')
df3 = (
    pd.read_csv(df3_path, index_col='SALEDT', parse_dates=True)
    .sort_index()
    # Use the same 'Unknown' placeholder as the other frames; the previous
    # spelling 'Uknown' was a typo and made the placeholder inconsistent.
    .replace(np.nan, 'Unknown')
)
display(df3)

df3.columns

In [None]:
# Remove the columns that are not needed further on and rename the parcel
# identifier column from 'PARID' to 'ParcelID'.
df3_cleaned = (
    df3
    .drop(columns=['TAXYR', 'OBJECTID', 'PRICE', 'Unnamed: 0'])
    .rename(columns={'PARID': 'ParcelID'})
)


In [None]:
# Label the index 'SaleDate', then keep only the rows whose index value is
# later than 2014-12-31.
df3_cleaned.index.name = 'SaleDate'
df3_combine = df3_cleaned.query('index > "2014-12-31"')

# Preview both ends of the filtered frame, then show the column dtypes.
display(df3_combine.head(10), df3_combine.tail(10))
df3_combine.dtypes


### Combined DF

In [None]:
# Outer-join the two frames on their indexes so that index values present in
# either frame are kept in the result.
joined_df_friction = df2_final.merge(
    df3_combine,
    how="outer",
    left_index=True,
    right_index=True,
)
display(joined_df_friction)

### Sale dates only (ParcelID columns dropped)

In [None]:
# Discard both suffixed ParcelID columns produced by the merge.
parcel_id_columns = ['ParcelID_x', 'ParcelID_y']
sale_date_df = joined_df_friction.drop(parcel_id_columns, axis='columns')
display(sale_date_df)


### Sale date and Estimated move in date

In [None]:
# Simulate a move-in date for every row: the index date plus a random whole
# number of days drawn from [MIN_MOVE_IN_DAYS, MAX_MOVE_IN_DAYS).
# A seeded generator is used so that Restart & Run All reproduces exactly the
# same simulated dates (the previous unseeded draw changed on every run).
MOVE_IN_SEED = 42
MIN_MOVE_IN_DAYS = 30
MAX_MOVE_IN_DAYS = 70  # exclusive upper bound, as with np.random.randint

move_in_rng = np.random.default_rng(MOVE_IN_SEED)
move_in_delay = pd.to_timedelta(
    move_in_rng.integers(MIN_MOVE_IN_DAYS, MAX_MOVE_IN_DAYS, size=len(sale_date_df)),
    unit='D',
)
sale_date_df['Estimated Move in Date'] = sale_date_df.index + move_in_delay
display(sale_date_df)

### Data + Days-difference and indexed

In [None]:
# Gap between the estimated move-in date and the index date, stored as a
# whole number of days.
move_in_gap = sale_date_df['Estimated Move in Date'] - sale_date_df.index
sale_date_df['days_difference'] = move_in_gap.dt.days
display(sale_date_df)

## Grouped by year

In [None]:
# Tag each row with the year of its index, then average days_difference
# within each year and return the result as a flat two-column frame.
sale_date_df['year'] = sale_date_df.index.year
yearly_average = (
    sale_date_df
    .groupby('year')['days_difference']
    .mean()
    .reset_index()
)
display(yearly_average)

## Dataframe with Sale and Move in dates

In [None]:
# Move the index back into a regular column and discard the helper 'year'
# column.
housing_data = sale_date_df.reset_index().drop(columns=['year'])
display(housing_data.tail(10))

### Monthly average

In [None]:
# Derive calendar month and year from SaleDate, then average days_difference
# for every (year, month) pair.
sale_date_parts = housing_data['SaleDate'].dt
housing_data['month'] = sale_date_parts.month
housing_data['year'] = sale_date_parts.year

result = (
    housing_data
    .groupby(['year', 'month'])['days_difference']
    .mean()
    .reset_index()
)

display(result)

In [None]:
# Select the sales recorded in January 2020.
start_date = '2020-01-01'
end_date = '2020-01-31'

# `<= end_date` would compare against 2020-01-31 00:00:00 and silently drop
# any sale stamped later that day, so filter with an exclusive bound at the
# start of the following day instead.
end_exclusive = pd.Timestamp(end_date) + pd.Timedelta(days=1)
mask = (housing_data['SaleDate'] >= start_date) & (housing_data['SaleDate'] < end_exclusive)

# Keep the matching rows without the helper month/year columns.
janurary_df = housing_data.loc[mask].drop(columns=['month', 'year'])
display(janurary_df)
    

## Jan 2020 transactions

In [None]:
# Select the sales recorded in January 2020 and take a thinned sample of them
# for plotting.
start_date = '2020-01-01'
end_date = '2020-01-31'

# `<= end_date` would compare against 2020-01-31 00:00:00 and silently drop
# any sale stamped later that day, so filter with an exclusive bound at the
# start of the following day instead.
end_exclusive = pd.Timestamp(end_date) + pd.Timedelta(days=1)
mask = (housing_data['SaleDate'] >= start_date) & (housing_data['SaleDate'] < end_exclusive)
janurary_df = housing_data.loc[mask].drop(columns=['month', 'year'])

# Keep every 50th row so the plot stays readable.
PLOT_SAMPLE_STEP = 50
jan_plot = janurary_df.iloc[::PLOT_SAMPLE_STEP]

display(jan_plot)
display(janurary_df)


## Jan 2020 House sales

In [None]:
# Histogram of the days_difference values in the January sample.
# In a histogram the x-axis carries the binned variable and the y-axis the
# count per bin, so the axis labels describe those (the previous labels named
# the x-axis "Purchase Date" and put the waiting-days label on the count axis).
jan_plot.hvplot.hist(
    y='days_difference',
    xlabel="Days Waiting to Move In",
    ylabel="Number of Sales",
    title="Histogram of Days Till Move In",
    rot=75,
    width=900,
    height=500,
)