In [None]:
# ==================
# Libraries
# ==================
!pip install jupyter_contrib_nbextensions #instalando nb-extensions
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim  
import plotly.express as px

In [15]:
# ==================
# Functions
# ==================
# REQUIREMENTS OF A FUNCTION
# 1 NAME - A SINGLE RESPONSIBILITY
# 2 INPUT PARAMETERS
# 3 OUTPUT DATA

#data type verification
def show_dtypes(data):
  """Print the dtype of every column of ``data``.

  Only names declared in the function header are used, so the function
  does not depend on any notebook-level variable.

  Parameters
  ----------
  data : object exposing a ``dtypes`` attribute (e.g. a pandas DataFrame).

  Returns
  -------
  None
  """
  column_types = data.dtypes
  print(column_types)

#show dimensions
def show_dimensions(data):
  """Print the number of rows and columns of ``data``.

  Each count is printed on its own line followed by a blank line.

  Parameters
  ----------
  data : object exposing a two-element ``shape`` (e.g. a pandas DataFrame).

  Returns
  -------
  None
  """
  # (label, axis) pairs: axis 0 holds the row count, axis 1 the column count.
  report = (("Number of rows: ", 0), ("Number of columns:", 1))
  for label, axis in report:
    print(label, data.shape[axis], end='\n\n')

#collect geodata
def collect_geodata(data, cols, limit=20):
  """Reverse-geocode rows of ``data`` and add address fields as new columns.

  For each processed row the 'lat' and 'long' values are sent to the
  Nominatim reverse-geocoding service (via geopy). Every name in ``cols``
  is looked up in the 'address' part of the response and stored in a
  column of the same name; rows for which the field (or the whole
  response) is missing keep the placeholder string 'NA'.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain 'lat' and 'long' columns. It is not modified.
  cols : sequence of str
      Address keys to extract (e.g. ['house_number', 'road']). Each one
      becomes a column of the returned frame.
  limit : int or None, default 20
      Number of leading rows to geocode. ``None`` processes every row.

  Returns
  -------
  pandas.DataFrame
      A copy of the first ``limit`` rows of ``data`` with one extra column
      per entry of ``cols``.

  Notes
  -----
  One HTTP request is issued per row.
  NOTE(review): the public Nominatim service is rate-limited; consider
  geopy's RateLimiter before raising ``limit`` -- confirm usage policy.
  """
  geolocator = Nominatim(user_agent='geoapiExercises')

  # Work on an independent copy: the caller's frame stays untouched and
  # pandas never sees a write into a slice of another frame.
  subset = data if limit is None else data.head(limit)
  result = subset.copy()

  # Values are collected positionally, so the frame's index labels do not
  # matter (they need not be 0..n-1).
  collected = {col: [] for col in cols}
  total = len(result)

  for position, (lat, lon) in enumerate(zip(result['lat'], result['long'])):
    # Progress message so long runs show where they are.
    print('Loop: {}/{}'.format(position, total))

    coordinates = str(lat) + ',' + str(lon)
    response = geolocator.reverse(coordinates)  # API request

    # reverse() may yield no result; treat that like an empty address.
    address = {} if response is None else response.raw.get('address', {})

    for col in cols:
      collected[col].append(address.get(col, 'NA'))

  for col in cols:
    result[col] = collected[col]

  # Return only after every row has been processed.
  return result

def draw_map(data):
  """Build a scatter map of the houses, coloured by price level and sized by price.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'id', 'lat', 'long', 'price' and 'niveis'.

  Returns
  -------
  The plotly figure. It is returned, not displayed; the caller decides
  when to show it.
  """
  map_columns = ['id', 'lat', 'long', 'price', 'niveis']
  # Copy so the plotting frame is independent of the caller's data.
  points = data[map_columns].copy()

  figure = px.scatter_mapbox(
      points,
      lat="lat",
      lon="long",
      color="niveis",
      size="price",
      color_discrete_sequence=px.colors.qualitative.Plotly,
      size_max=15,
      zoom=10,
  )
  # Base map style plus a fixed height with all margins removed.
  figure.update_layout(
      mapbox_style="open-street-map",
      height=600,
      margin=dict(r=0, t=0, l=0, b=0),
  )
  return figure

In [None]:
# ==================
# Extraction
# ==================
# Load the dataset through Colab's upload helper. The result is indexed by
# file name below, so the uploaded file must be named 'kc_house_data.csv'.
from google.colab import files
uploaded = files.upload()
# Wrap the uploaded content in an in-memory binary buffer and parse it as
# CSV into `data`, the frame used by the cells that follow.
import io
data = pd.read_csv(io.BytesIO(uploaded['kc_house_data.csv']))

# # Extraction Analysis
# # Dimensions
# show_dimensions(data)
# # Types
# show_dtypes(data)

In [17]:
#==================
#Transformation
#==================
# Convert the 'date' column to datetime.
data['date'] = pd.to_datetime(data['date'])

# Display floats with two decimals instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Descriptive statistics: keep only the int64/float64 columns so the numeric
# summaries below can be computed on them.
sub_attributes = data.select_dtypes(include=['int64','float64'])

# Central tendency: per-column mean and median.
mean = pd.DataFrame(sub_attributes.apply(np.mean))
median = pd.DataFrame(sub_attributes.apply(np.median))

# Dispersion: per-column standard deviation, minimum and maximum.

std = pd.DataFrame(sub_attributes.apply(np.std))
min_ = pd.DataFrame(sub_attributes.apply(np.min))
max_ = pd.DataFrame(sub_attributes.apply(np.max))

# Assemble one summary table with one row per attribute. The labels assigned
# below follow the concat order: max, min, mean, median, std.
df1 = pd.concat([max_,min_,mean, median,std], axis=1).reset_index()
df1.columns = ['attributes', 'maximo', 'minimo', 'media', 'mediana', 'std']


# Column for dormitory type, derived from the bedroom count:
# 1 -> 'studio', 2 -> 'apartment', more than 2 -> 'house', anything else -> 'NA'.
def _dormitory_label(bedrooms):
    """Map a bedroom count to its dormitory-type label."""
    if bedrooms == 1:
        return 'studio'
    if bedrooms == 2:
        return 'apartment'
    if bedrooms > 2:
        return 'house'
    return 'NA'

data['dormitory_type'] = data['bedrooms'].apply(_dormitory_label)

# Column for price levels.
# The whole 'niveis' column is computed in one vectorised pass, so it does not
# depend on the frame's index labels and avoids one .loc write per row.
# NOTE(review): the cut-offs look like the price quartiles of this dataset -- confirm.
price_values = data['price']
price_conditions = [
    (price_values >= 0) & (price_values < 321950),
    (price_values >= 321950) & (price_values < 450000),
    (price_values >= 450000) & (price_values < 645000),
]
price_labels = ['nivel_0', 'nivel_1', 'nivel_2']
# Anything not matched above (645000 and up, but also negative or missing
# prices) is labelled 'nivel_3'.
data['niveis'] = np.select(price_conditions, price_labels, default='nivel_3')



In [None]:
# ==================
# Load
# ==================
# draw a map
fig = draw_map( data )
fig.show()
