In [1]:
# Import every library the notebook needs (stdlib first, then third-party)
import os
import tarfile
import urllib
import urllib.request

import matplotlib as plt
import matplotlib.pyplot as plt  # rebinds `plt` to pyplot, which the plotting cells call
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [2]:
# Base URL of the GitHub raw-content tree that the archive is downloaded from
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Full archive URL: the root above followed by the file's path
HOUSING_URL = "".join([DOWNLOAD_ROOT, "datasets/housing/housing.tgz"])
# Local directory (relative to the working directory) used for the dataset
HOUSING_PATH = os.path.join("datasets", "housing")
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into a local directory.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Local directory that receives the archive and its
            extracted contents; it is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, reject members that would be
            # written outside housing_path.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
    
    
    

In [3]:
# Download and unpack the dataset only when the CSV that load_housing_data()
# reads is not on disk yet, so re-running the notebook does not hit the
# network again.
if not os.path.exists(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [4]:
# Load the CSV from the dataset directory into the `housing` DataFrame
housing = load_housing_data(housing_path=HOUSING_PATH)

In [10]:
# To see every row/column of a large frame, raise the display limits first:
#   pd.set_option('display.max_rows', 500)
#   pd.set_option('display.max_columns', 500)
#   pd.set_option('display.width', 1000)

housing.head(10)  # first 10 rows of the DataFrame

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [12]:
# Dimensions of the DataFrame as (rows, columns)
housing.shape

(20640, 10)

In [14]:
# Summary of the DataFrame: column names, non-null counts and dtypes
housing.info() 
# 1 categorical attribute and 1 attribute with missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [16]:
# List the categories of the categorical variable ocean_proximity and how many rows fall in each
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [17]:
# Statistical summary of the DataFrame. Comparing the minimum and maximum of
# each attribute shows that some vary little while others vary a lot.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [None]:
# verificando histograma das variáveis

'''
Podemos afirmar que temos mais de 1200 distritos com media de idade com 52 anos.

obs: cuidado com o número de bins, pois pode confundir a análise.
'''
import matplotlib.pyplot as plt
%matplotlib inline
housing.hist(bins=75, figsize=(20, 15), legend=True);

In [None]:
# Split the DataFrame into train and test sets with scikit-learn; test_size=0.2 holds out 20% for TESTING
# random_state=42 fixes the shuffle, so the same rows land in the train and test sets on every run
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Sizes of the train and test sets, in that order
train_set.shape, test_set.shape

In [None]:
# The CSV ships with English column names, but every cell from here on uses
# Portuguese ones, so translate them once. rename() ignores keys that are not
# present, which keeps this cell safe to re-run.
# NOTE(review): mapping inferred from how the later cells use each column — confirm.
housing = housing.rename(columns={
    "housing_median_age": "idade_mediana",
    "total_rooms": "total_salas",
    "total_bedrooms": "total_quartos",
    "population": "populacao",
    "households": "familias",
    "median_income": "renda_mediana",
    "median_house_value": "valor_medio_casa",
    "ocean_proximity": "litoral",
})

# Bucket the median income into five categories labelled 1 to 5: category 1
# covers 0 to 1.5 (US$ 15,000 according to the original note), category 2
# covers 1.5 to 3, and so on. The categories are used to sample every income
# band in the same proportion and so avoid sampling bias.
housing["renda_mediana_cat"] = pd.cut(housing["renda_mediana"],
                                      bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                      labels=[1, 2, 3, 4, 5])

In [None]:
# Histogram of the income categories, i.e. how many rows fall in each one
housing["renda_mediana_cat"].hist()


In [None]:
# Build the train and test sets by stratified sampling, so that each income
# category keeps the proportion it has in the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(split.split(housing, housing["renda_mediana_cat"]))
# .loc is label-based: it matches these positions only while `housing` keeps
# the default RangeIndex it was loaded with.
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

In [None]:
# Share of each income category in the stratified test set
strat_test_set["renda_mediana_cat"].value_counts() / len(strat_test_set)

In [None]:
# Share of each income category in the stratified training set
strat_train_set["renda_mediana_cat"].value_counts() / len(strat_train_set)

In [None]:
# Helper used to compare the category proportions of different samples
def income_cat_proportions(data):
    """Return, per ``renda_mediana_cat`` value, its count divided by ``len(data)``."""
    category_counts = data["renda_mediana_cat"].value_counts()
    return category_counts / len(data)


# Purely random split, used as the baseline against the stratified one
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# One column of category proportions per sample, followed by the relative
# error (in %) of each test sample with respect to the full dataset
compare_props = (
    pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    })
    .sort_index()
    .assign(**{
        "Rand. %error": lambda props: 100 * props["Random"] / props["Overall"] - 100,
        "Strat. %error": lambda props: 100 * props["Stratified"] / props["Overall"] - 100,
    })
)

In [None]:
# Category proportions in the full dataset, the random test set and the
# stratified test set, per median-income category
compare_props

In [None]:
# The income category only existed to drive the stratified split, so remove it
# from both sets. Rebinding the names (instead of dropping in place) together
# with errors="ignore" keeps this cell safe to run more than once.
strat_train_set = strat_train_set.drop(columns="renda_mediana_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="renda_mediana_cat", errors="ignore")

In [None]:
# From here on `housing` is a copy of the stratified training set, so changes made to it do not touch strat_train_set
housing = strat_train_set.copy()

#  EDA

In [None]:
# Scatter plot of the rows by longitude/latitude; alpha=0.1 makes overlapping points appear darker
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(7,5), alpha=0.1);


In [None]:
# House value across the whole territory: marker size follows the population
# (divided by 100) and colour follows valor_medio_casa on the "jet" colormap.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["populacao"]/100, label="População", figsize=(10,7),
             c="valor_medio_casa", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend();  # trailing ';' hides the Legend repr

In [None]:
# Pairwise correlations between the numeric attributes. Non-numeric columns
# are excluded explicitly (the dataset has one categorical attribute), because
# DataFrame.corr() raises on them in pandas >= 2.0.
housing.select_dtypes(include="number").corr()

In [None]:
# Correlation of every numeric attribute with the house value, strongest
# positive first. Non-numeric columns are excluded explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["valor_medio_casa"].sort_values(ascending=False)

In [None]:
# A scatter_matrix call over the whole frame was left commented out here;
# only the attributes listed below are plotted against each other.
attributes = ["valor_medio_casa", "renda_mediana", "total_salas","idade_mediana"]
scatter_matrix(housing[attributes], figsize=(12, 8));


In [None]:
# Scatter of renda_mediana against valor_medio_casa; alpha=0.1 makes overlapping points appear darker
housing.plot(kind="scatter", x="renda_mediana", y="valor_medio_casa",alpha=0.1);

In [None]:
# Derived ratio features, computed row by row from the existing columns.
# NOTE(review): "quarto/familia" is computed from total_salas, not total_quartos,
# so the label may not match its contents — confirm the intended name.
housing["quarto/familia"] = housing["total_salas"]/housing["familias"]
housing["quarto/sala"] = housing["total_quartos"]/housing["total_salas"]
housing["populacao/familia"]=housing["populacao"]/housing["familias"]

In [None]:
# Recompute the correlations with the house value now that the ratio features
# exist. Non-numeric columns are excluded explicitly because DataFrame.corr()
# raises on them in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["valor_medio_casa"].sort_values(ascending=False)

In [None]:
# Separate the target from the predictors, both taken from the stratified
# training set: the labels are an independent copy of the target column and
# `housing` is the same set without that column.
housing_labels = strat_train_set["valor_medio_casa"].copy()
housing = strat_train_set.drop(columns=["valor_medio_casa"])

In [None]:
# Fit a median imputer on the predictors with the "litoral" column left out
# (presumably because it is the non-numeric attribute — confirm).
housing_num = housing.drop(columns=["litoral"])
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

In [None]:
# Display the frame the imputer was fitted on (`housing` without "litoral")
housing_num

In [None]:
# The statistics learned by the imputer next to the medians computed directly
# with pandas. Written as a tuple so both are displayed: a cell only echoes
# its last expression, so a bare first line would be silently discarded.
imputer.statistics_, housing_num.median().values