# 0.1 imports

In [28]:
import pandas as pd
import seaborn as sns

from datetime    import datetime
import matplotlib.pyplot         as plt
from IPython.core.display        import HTML


# 0.2 Helper Functions

In [17]:
def jupyter_settings(): 
    %matplotlib inline 
    #%matplotlib notebook
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
     
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option ('display.expand_frame_repr', False)
    
    sns.set()
    

In [18]:
%matplotlib inline 
%matplotlib notebook

plt.style.use('bmh')
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 24


display( HTML( '<style>.container { width:100% !important; }</style>') )
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option ('display.expand_frame_repr', False)

sns.set()

jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


# 1.0 Loading Data 

In [21]:
df1 = pd.read_csv ('/Users/adriele/Documents/repos/house_rocket/dataset/kc_house_data.csv')

## 1.2 Data Dimersions

In [25]:
print( 'Number of Cols {}'.format( df1.shape[0] ) ) 
print( 'Number of Rows {}'.format( df1.shape[1] ) )

Number of Cols 21613
Number of Rows 21


## 1.3 Check NA

In [26]:
df1.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## 1.4 Check Data Type

In [27]:
df1.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

# 1.5 Changing Data Types

In [39]:
df1['date'] = pd.to_datetime(df1['date'])

In [40]:
df1.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2013-10-14,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2012-09-14,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2025-02-15,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,2012-09-14,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2018-02-15,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [41]:
df1.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
sqft_above                int64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
dtype: object

In [60]:
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

In [61]:
from platform import python_version
import streamlit as st
import pandas as pd
import numpy as np
import folium
import geopandas
import plotly.express as px


from streamlit_folium import folium_static
from folium.plugins import MarkerCluster
from datetime import datetime
import folium.features

In [62]:

st.title ('House Rocket Company')
st.markdown ( 'Welcome to Rouse Rocket Data Analysis')
# st.set_page_config ( layuot = 'wide')

@st.cache(allow_output_mutation=True)
def get_data(path):
    data = pd.read_csv(path)
    return data

def get_geofile (url) :
    geofile= geopandas.read_file( url )
    return geofile

def set_feature (data):
    # add new features
    data['price_m2'] = data['price'] / data['sqft_lot']
    return data

def overview_data (data):
    # Data Overview
    f_attributes = st.sidebar.multiselect('Enter columns', data.columns)
    f_zipcode = st.sidebar.multiselect('Enter zipcode', data['zipcode'].unique())

    st.title('Data Overview')

    if (f_zipcode != []) & (f_attributes != []):
        data = data.loc[data['zipcode'].isin(f_zipcode), f_attributes]

    elif (f_zipcode != []) & (f_attributes == []):
        data = data.loc[data['zipcode'].isin(f_zipcode):]

    elif (f_zipcode == []) & (f_attributes != []):
        data = data.loc[:, f_attributes]

    else:
        data = data.copy()
    st.dataframe(data)

    c1, c2 = st.columns((1, 1))

    # Averange metrics
    df1 = data[['id', 'zipcode']].groupby('zipcode').count().reset_index()
    df2 = data[['price', 'zipcode']].groupby('zipcode').mean().reset_index()
    df3 = data[['sqft_living', 'zipcode']].groupby('zipcode').mean().reset_index()
    df4 = data[['price_m2', 'zipcode']].groupby('zipcode').mean().reset_index()

    # Merge C1
    m1 = pd.merge(df1, df2, on='zipcode', how='inner')
    m2 = pd.merge(m1, df3, on='zipcode', how='inner')
    df = pd.merge(m2, df4, on='zipcode', how='inner')


    df.columns = ['ZIPCODE', 'TOTAL HOUSES', 'PRICE', 'SQFT LIVING', 'PRICE/2']
    #st.write(df.head())
    c1.header('Avarage Values')
    c1.dataframe(df, height=400)

    # startstic Descriptive C2
    num_attributes = data.select_dtypes(include=['int64', 'float64'])
    media = pd.DataFrame(num_attributes.apply(np.mean))
    mediana = pd.DataFrame(num_attributes.apply(np.median))
    std = pd.DataFrame(num_attributes.apply(np.std))

    max_ = pd.DataFrame(num_attributes.apply(np.max))
    min_ = pd.DataFrame(num_attributes.apply(np.min))

    df1 = pd.concat([max_, min_, media, mediana, std], axis=1).reset_index()
    df1.columns = ['attributes', 'max', 'min', 'mean', 'median', 'std']

    c2.header('Descriptive Analysis')
    c2.dataframe(df1, height=400)

    return None

def portfolio_density  (data, geofile):
    # Densidade de portifolio
    st.title('Region Overview')
    c1, c2 = st.beta_columns((1, 1))
    c1.header('Portfolio Density')

    df = data.sample(10)
    # Base map
    density_map = folium.Map(location=[data['lat'].mean(),
                             data['long'].mean()],
                             default_zoom_start=15)

    marker_cluster = MarkerCluster().add_to(density_map)
    for name, row in df.iterrows():
        folium.Marker([row['lat'], row['long']],
                      popup='Sold R${0} on: {1}. Features: {2} sqft, {3} bedrooms, {4} bathrooms, year built {5}'.format(row['price'],
                                                               row['date'],
                                                               row['sqft_living'],
                                                               row['bedrooms'],
                                                               row['bathrooms'],
                                                               row['yr_built'])).add_to(marker_cluster)

    with c1:
        folium_static(density_map)

    # Region price
    c2.header('Price Density')

    df = data[['price', 'zipcode']].groupby('zipcode').mean().reset_index()
    df.columns = ['ZIP', 'PRICE']

    #df = df.sample(10)
    geofile = geofile[geofile['ZIP'].isin(df['ZIP'].tolist())]

    region_price_map = folium.Map(location=[data['lat'].mean(),
                                            data['long'].mean()],
                                  default_zoom_start=15)

    region_price_map.choropleth(data=df,
                                geo_data=geofile,
                                columns=['ZIP', 'PRICE'],
                                key_on='feature.properties.ZIP',
                                fill_color='YlOrBr',
                                fill_opacity=0.9,
                                line_opacity=0.5,
                                legend_name='AVG PRICE')


    with c2:
        folium_static(region_price_map)

def commercial_distribution (data):

    # Distribicao do imoveis por categoria

    st.sidebar.title('Comercial Option')
    st.title('Comercial Attribute')

    # ---------- Avarange Price per Year

    data['date'] = pd.to_datetime(data['date']).dt.strftime('%y-%m-%d')

    # Filters
    min_year_built = int(data['yr_built'].min())
    max_year_built = int(data['yr_built'].max())

    st.sidebar.subheader('Select Max Year Built')
    f_year_built = st.sidebar.slider('Year Built',
                                     min_year_built,
                                     max_year_built,
                                     min_year_built)
    st.header('Avarange Price per Year Built')

    # data select
    df = data.loc[data['yr_built'] < f_year_built]
    df = df[['yr_built', 'price']].groupby('yr_built').mean().reset_index()

    # plot
    fig = px.line(df, x='yr_built', y='price')
    st.plotly_chart(fig, use_container_width=True)

    # ---------- Avarange Price per day
    st.header('Avarage Price per Day')
    st.sidebar.subheader('Select Max Date')

    # fielters
    min_date = datetime.strptime(data['date'].min(), '%y-%m-%d')
    max_date = datetime.strptime(data['date'].max(), '%y-%m-%d')

    f_date = st.sidebar.slider('Date', min_date, max_date, min_date)

    # data filtering
    data['date'] = pd.to_datetime(data['date'])
    df = data.loc[data['date'] < f_date]
    df = df[['date', 'price']].groupby('date').mean().reset_index()

    # plot
    fig = px.line(df, x='date', y='price')
    st.plotly_chart(fig, use_container_width=True)

    # ----------------- Histograma
    st.header('Price Distribution')
    st.subheader('Select Max Price')

    # filter
    min_price = int(data['price'].min())
    max_price = int(data['price'].max())
    avg_price = int(data['price'].mean())

    # data filterin
    f_price = st.sidebar.slider('Price', min_price, max_price, avg_price)
    df = data.loc[data['price'] < f_price]

    # data plot
    fig = px.histogram( df, x='price', nbins=50 )
    st.plotly_chart( fig, use_container_width=True )

    return None

def attributes_distribution (data):
    # ================================================
    # Distribuicao dos imoveis por categoria fisica
    # ================================================
    st.sidebar.title('Attributes Options')
    st.title('House Attributes')

    # Filters
    f_bedrooms = st.sidebar.selectbox('Max number of bedrooms',
                                      sorted(set(data['bedrooms'].unique())))
    f_bathrooms = st.sidebar.selectbox('Max number of bedrooms',
                                       sorted(set(data['bathrooms'].unique())))
    c1, c2 = st.beta_columns(2)

    # House per bedrooms
    c1.header('Houses Bedroooms')
    df = data[data['bedrooms'] < f_bedrooms]
    fig = px.histogram(data, x='bedrooms', nbins=19)
    c1.plotly_chart(fig, use_container_width=True)

    # House per bathrooms
    c2.header('Houses Bathrooms')
    df = data[data['bathrooms'] < f_bathrooms]
    px.histogram(df, x='bathrooms', nbins=50)
    c2.plotly_chart(fig, use_container_width=True)

    # Filters
    f_floors = st.sidebar.selectbox('Max number of floor',
                                    sorted(set (data['floors'].unique())))
    f_waterview = st.sidebar.checkbox('Only Houses with Water View')
    c1, c2 = st.beta_columns(2)

    # House per Floors
    c1.header('Houses per floor')
    df = data [data['floors']> f_floors]
    # Plot
    px.histogram(df, x='floors', nbins=50)
    st.plotly_chart(fig, use_container_width=True)

    # House per water view
    if f_waterview:
        df = data[data['waterfront'] == 1]

    else:
        df = data.copy()

    fig = px.histogram(df, x='waterfront', nbins=10)
    c2.plotly_chart(fig, use_container_width=True)
    return None

if __name__== '__main__':
    # ETL
    # Data Extraction
    path = '/Users/adriele/Documents/repos/python_zero_ao_ds/house_rocket/dataset/kc_house_data.csv'
    url = 'http://data-seattlecitygis.opendata.arcgis.com/datasets/83fc2e72903343aabff6de8cb445b81c_2.geojson'
    #url = '/Users/adriele/Documents/repos/python_zero_ao_ds/house_rocket/dataset/Zip_Codes.geojson'
    data = get_data(path)
    geofile = get_geofile(url)

    # Transformation
    data = set_feature (data)

    overview_data (data)

    portfolio_density (data, geofile)

    commercial_distribution (data)

    attributes_distribution (data)
    # Loading


# st.write (f_attributes)
# st.write (data.head ())



InternalHashError: module '__main__' has no attribute '__file__'

While caching the body of `get_data()`, Streamlit encountered an
object of type `builtins.function`, which it does not know how to hash.

**In this specific case, it's very likely you found a Streamlit bug so please
[file a bug report here.]
(https://github.com/streamlit/streamlit/issues/new/choose)**

In the meantime, you can try bypassing this error by registering a custom
hash function via the `hash_funcs` keyword in @st.cache(). For example:

```
@st.cache(hash_funcs={builtins.function: my_hash_func})
def my_func(...):
    ...
```

If you don't know where the object of type `builtins.function` is coming
from, try looking at the hash chain below for an object that you do recognize,
then pass that to `hash_funcs` instead:

```
Object of type builtins.function: <function get_data at 0x7fa5ac1bf0d0>
```

Please see the `hash_funcs` [documentation]
(https://docs.streamlit.io/library/advanced-features/caching#the-hash_funcs-parameter)
for more details.
            