# Projeto de Machine Learning

In [232]:
import pandas as pd
# Load the housing dataset from a CSV file located relative to the working directory.
housing_df = pd.read_csv('housing.csv')

In [233]:
# Preview the first rows of the raw dataset.
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [234]:
# Separate the regression target from the predictor columns.
y = housing_df['median_house_value']
X = housing_df.drop(columns='median_house_value')

Train Test Split, para que não ocorra vazamento de dados de teste no treinamento

In [235]:
from sklearn.model_selection import train_test_split
# Fixed seed so that the split is reproducible across runs.
RANDOM_SEED = 42

# Hold out 20% of the rows as the test set.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [236]:
# Preview the training features; the index keeps the original (shuffled) row labels.
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


# Data Processing

In [237]:
# Keep only the categorical variables (this dataset has a single one).
# Selecting with a list keeps the result a DataFrame rather than a Series.
X_train_cat = X_train.loc[:, ["ocean_proximity"]]
print(type(X_train_cat))
print(X_train_cat.head())


<class 'pandas.core.frame.DataFrame'>
      ocean_proximity
14196      NEAR OCEAN
8267       NEAR OCEAN
17445      NEAR OCEAN
14265      NEAR OCEAN
2271           INLAND


In [238]:
# Count how many training rows fall into each ocean_proximity category.
print(X_train_cat["ocean_proximity"].value_counts())



ocean_proximity
<1H OCEAN     7341
INLAND        5227
NEAR OCEAN    2086
NEAR BAY      1854
ISLAND           4
Name: count, dtype: int64


Usando o transformador OneHotEncoder do scikit-learn na variável categórica e criando um pipeline específico para essa transformação

In [239]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical column and return a dense array.
# `sparse_output` replaces the former `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing `sparse=` now raises TypeError).
# drop='first' omits the column of the first category, so k categories
# produce k-1 indicator columns.
cat_pipeline = Pipeline([
        ('cat_encoder', OneHotEncoder(sparse_output=False, drop='first')),
    ])

housing_cat_tr = cat_pipeline.fit_transform(X_train_cat)
housing_cat_tr



array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [240]:
# Numeric features only: everything except the categorical column.
X_train_num = X_train.drop(columns=["ocean_proximity"])

# Feature Engineering

In [241]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features to a 2-D array.

    Always appended: rooms per household and population per household.
    Appended only when ``add_bedrooms_per_room`` is true: bedrooms per room.
    The source columns are located by fixed position (class attributes
    below), so the input must support ``X[:, i]`` indexing.
    """

    # Positional indices of the source columns in the input array.
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

    def __init__(self, add_bedrooms_per_room=True):  # No *args or **kwargs.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        cls = CombinedAttributesAdder
        rooms = X[:, cls.rooms_ix]
        households = X[:, cls.household_ix]

        derived = [rooms / households, X[:, cls.population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, cls.bedrooms_ix] / rooms)

        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[tuple([X] + derived)]


# Instance that appends only the two per-household ratios.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)


In [242]:
# Column labels: the original columns followed by the two appended ratios.
columns_housing_extra_attribs = [
    *X_train.columns,
    "rooms_per_household",
    "population_per_household",
]

# Wrap the transformed array in a DataFrame, which is friendlier to inspect.
housing_extra_attribs = pd.DataFrame(
    attr_adder.transform(X_train.values),
    columns=columns_housing_extra_attribs,
)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
0,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN,5.017657,3.691814
1,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN,4.473545,1.738095
2,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN,5.645833,2.723214
3,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN,4.002817,3.994366
4,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND,6.268421,2.3


In [243]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in order: fill missing values with the column
# median, append the ratio features, then standardize every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(X_train_num)
housing_num_tr

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [244]:
# Iterating a DataFrame yields its column labels, so this lists the numeric column names.
list(X_train_num)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [245]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each sub-pipeline.
num_attribs = X_train_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Apply the numeric and categorical pipelines to their own columns and
# concatenate the results side by side (numeric block first).
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

X_train_prepared = full_pipeline.fit_transform(X_train)



In [246]:
# Numeric columns keep their original names.
num_attribs = list(X_train_num)

# The fitted encoder exposes the categories it saw; the first one has no
# column of its own because the encoder uses drop='first'.
cat_encoder = full_pipeline.named_transformers_['cat'].named_steps['cat_encoder']
cat_one_hot_attribs = list(cat_encoder.categories_[0][1:])

# Output order: numeric columns, engineered ratios, one-hot columns.
column_names = [
    *num_attribs,
    "rooms_per_household", "population_per_household", "bedrooms_per_room",
    *cat_one_hot_attribs,
]

# Label the prepared array so it can be inspected as a DataFrame.
X_train_df = pd.DataFrame(X_train_prepared, columns=column_names)

X_train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,1.272587,-1.372811,0.34849,0.222569,0.211228,0.768276,0.322906,-0.326196,-0.174916,0.051376,-0.211785,0.0,0.0,0.0,1.0
1,0.709162,-0.876696,1.618118,0.340293,0.593094,-0.098901,0.672027,-0.035843,-0.402835,-0.117362,0.342185,0.0,0.0,0.0,1.0
2,-0.447603,-0.460146,-1.95271,-0.342597,-0.495226,-0.449818,-0.430461,0.144701,0.088216,-0.03228,-0.661658,0.0,0.0,0.0,1.0
3,1.232698,-1.382172,0.586545,-0.56149,-0.409306,-0.007434,-0.380587,-1.017864,-0.600015,0.077507,0.783032,0.0,0.0,0.0,1.0
4,-0.108551,0.532084,1.142008,-0.119565,-0.256559,-0.485877,-0.314962,-0.171488,0.349007,-0.068832,-0.550364,1.0,0.0,0.0,0.0


In [247]:
from sklearn.model_selection import train_test_split
# Carve a validation subset (20%) out of the already-transformed training data.
# NOTE(review): full_pipeline was fitted on all of X_train before this split, so
# the validation rows contributed to the imputer/scaler statistics - confirm
# that this is acceptable for the intended evaluation.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train_prepared,
    y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

Models

In [248]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Baseline: linear regression with default settings.
lin_reg = LinearRegression()

# Single decision tree, seeded for reproducibility.
tree_reg = DecisionTreeRegressor(random_state=RANDOM_SEED)

# Random forest of 100 trees, seeded for reproducibility.
forest_reg = RandomForestRegressor(random_state=RANDOM_SEED, n_estimators=100)



## Cross-Validation

Cross-validation is a technique used to assess how well a machine learning model will generalize to an independent dataset. It is primarily used in settings where the goal is to predict an outcome and one wants to estimate how accurately a predictive model will perform in practice.

### How Does It Work?

1. **Partition the Dataset**: The original training dataset is divided into `k` equally (or almost equally) sized folds or subsets.
   
2. **Training and Validation**: The model is trained on `k-1` of these folds and validated on the remaining one. This process is repeated `k` times, each time with a different fold serving as the validation set.
   
3. **Average Performance**: The performance measure (e.g., accuracy, F1 score, etc.) for each of the `k` iterations is averaged to obtain a single score.

### Advantages

- **Better Generalization**: It provides a more robust way to assess the performance of a model on an independent dataset.
  
- **Utilizes Data Efficiently**: It allows for efficient use of data as each observation is used for both training and validation.

### Example Code in Python

```python
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, cv=5)
```


In [252]:
from sklearn.model_selection import cross_val_score

def cross_val_rmse(model, X, y, cv=10):
    """Cross-validate ``model`` and return ``(neg_mse_scores, rmse_scores)``.

    The scorer returns the mean squared error negated (scikit-learn scorers
    follow a "greater is better" convention), so the sign is flipped before
    taking the square root.

    Parameters
    ----------
    model : estimator passed unchanged to ``cross_val_score``.
    X, y : features and target passed unchanged to ``cross_val_score``.
    cv : number of folds (default 10).
    """
    neg_mse_scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return neg_mse_scores, np.sqrt(-neg_mse_scores)


# NOTE(review): X_train_prepared comes from full_pipeline.fit_transform(X_train),
# i.e. the preprocessing was fitted on the whole training set before the folds
# are formed. For a strict estimate, chain the preprocessing and the model in a
# single Pipeline and cross-validate that on the raw X_train.
lin_scores, lin_rmse_scores = cross_val_rmse(lin_reg, X_train_prepared, y_train)
tree_scores, tree_rmse_scores = cross_val_rmse(tree_reg, X_train_prepared, y_train)
forest_scores, forest_rmse_scores = cross_val_rmse(forest_reg, X_train_prepared, y_train)

In [253]:
def display_scores(scores):
    """Print the scores in ascending order, then their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    formatted = " ".join(f"{value:.2f}" for value in sorted(scores))
    print(f"Scores (ordenados): [{formatted}]")
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


In [254]:
# Report order: linear regression, decision tree, random forest.
for rmse_scores in (lin_rmse_scores, tree_rmse_scores, forest_rmse_scores):
    display_scores(rmse_scores)

Scores (ordenados): [65000.67 65218.78 65266.35 66089.63 67122.64 68402.55 68525.47 68957.34 70960.56 72739.88]
Mean: 67828.38677377408
Standard deviation: 2468.0913950652302
Scores (ordenados): [65333.63 65495.54 66052.18 66694.27 66867.79 68390.41 70707.54 70914.83 71226.93 74419.88]
Mean: 68610.29952929978
Standard deviation: 2899.0580212663745
Scores (ordenados): [47010.08 47210.02 47344.71 49308.39 49486.80 50163.43 50730.27 51674.69 51734.31 52782.81]
Mean: 49744.55091778306
Standard deviation: 1948.8394003839453


## 1. Definição do Problema
- Entendimento do problema e dos objetivos
- Identificação das variáveis-alvo
- Escolha das métricas de avaliação


## 3. Análise Exploratória de Dados (EDA)
- Estatísticas descritivas
- Visualização dos dados
- Identificação de padrões e anomalias

## 4. Pré-processamento de Dados
- Limpeza de dados
- Transformação de variáveis
- Tratamento de dados ausentes
- Normalização e padronização

## 5. Divisão dos Dados
- Conjunto de treino
- Conjunto de teste
- Conjunto de validação (opcional)

## 6. Seleção de Modelos
- Escolha de algoritmos apropriados
- Implementação de modelos de base (baseline)

## 7. Treinamento de Modelos
- Otimização de hiperparâmetros
- Treinamento com dados de treino

## 8. Avaliação de Modelos
- Utilização de métricas de avaliação
- Comparação com modelos de base (baseline)

## 9. Afinação do Modelo (Tuning)
- Ajuste de hiperparâmetros
- Feature engineering

## 10. Implementação
- Integração do modelo em um sistema de produção
- Monitoramento e manutenção

## 11. Documentação e Comunicação
- Documentação do projeto
- Comunicação dos resultados



# Aula 02 03 Regressão California Real Estate Prices

Medida de desempenho
$$RMSE(\mathbf{X}, \mathbf{y}, h) = 
\sqrt{\frac{1}{m} \sum_{i=1}^{m}\left(h(\mathbf{x}_i) - \mathbf{y}_i\right)^{2}}$$

Cross Validation

Ajuste Hiperparâmetros

Hiperparâmetros são configurações externas para um modelo de aprendizado de máquina que não são aprendidas a partir dos dados. Eles são como "configurações" para um algoritmo e influenciam o desempenho do modelo. Diferentemente dos parâmetros do modelo, que são derivados durante o treinamento, os hiperparâmetros são configurados antes do treinamento e permanecem constantes durante o processo.

Para Algoritmos de Gradiente Descendente
 - Taxa de aprendizado
 - Momento
 - Taxa de decaimento

Otimização de Hiperparâmetros
- *Grid Search*: Experimentação exaustiva por meio de uma grade predefinida de valores de hiperparâmetros.

## Dividindo os dados em conjunto de treinamento e de testes

# Aula 05 Classificação MNIST

In [None]:
from sklearn.metrics import confusion_matrix



# Both lists are test (toy) data.

y_true = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
y_pred = [1, 1, 1, 0, 1, 0, 1, 0, 1, 0]

# Build the confusion matrix from the true and predicted labels and display it.
matc = confusion_matrix(y_true, y_pred)
matc

array([[4, 1],
       [0, 5]])

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision first, recall second, one value per line.
print(precision_score(y_true, y_pred), recall_score(y_true, y_pred), sep="\n")

0.8333333333333334
1.0


Thus, in binary classification:

- the count of true negatives is $C_{0,0}$,

- false negatives is $C_{1,0}$,

- true positives is $C_{1,1}$,

- and false positives is $C_{0,1}$.


### Métricas para avaliar desempenho em classificação

#### Matriz de confusão
$$
\begin{array}{|c|c|c|}
\hline
  & \text{Predito: Positivo} & \text{Predito: Negativo} \\
\hline
\text{Real: Positivo} & TP (\text{True Positive}) & FN (\text{False Negative}) \\
\hline
\text{Real: Negativo} & FP (\text{False Positive}) & TN (\text{True Negative}) \\
\hline
\end{array}
$$


A medida de acurácia não permite distinguir entre os tipos de erro. Duas medidas mais comuns que são empregadas em machine learning são a **precision** (precisão) e **recall** (revocação), definidas como:

- Precision: Dentre os elementos classificados como positivos, quantos realmente são positivos?

$$\text{precision} = \frac{TP}{TP + FP}$$

Alta precision: preza pela certeza de que, ao classificar uma amostra como positiva, ela realmente será positiva, e é mais permissiva com erros em que positivos não são classificados como positivos.

- Recall: Dentre os elementos verdadeiramente positivos, quantos foram detectados como positivos?

$$\text{recall} = \frac{TP}{TP + FN}$$

Alto recall: preza para que a maior parte possível das amostras positivas seja classificada como positiva, e é mais permissivo com erros em que negativos são classificados como positivos.


# Aula 9

Se as colunas de $X$ são linearmente dependentes, então $X^T X$ tem determinante igual a 0 e, portanto, não é inversível.


Theta ótimo (equação normal):

$$\hat{\theta} = (X^T X)^{-1} X^T y$$

## Sumário

1. [Introdução](#Introdução)
2. [Entendendo o Problema](#Entendendo-o-Problema)
3. [Coleta de Dados](#Coleta-de-Dados)
4. [Análise Exploratória de Dados](#Análise-Exploratória-de-Dados)
5. [Preparação dos Dados](#Preparação-dos-Dados)
6. [Seleção e Treinamento de Modelos](#Seleção-e-Treinamento-de-Modelos)
7. [Avaliação do Modelo](#Avaliação-do-Modelo)
8. [Afinação do Modelo](#Afinação-do-Modelo)
9. [Conclusão](#Conclusão)

## Grid Search

Grid Search is an optimization technique used for hyperparameter tuning in machine learning models. The idea is to define a "grid" of hyperparameters and then search exhaustively through all possible combinations in this grid.

### How Does It Work?

1. **Define Parameter Grid**: Create a dictionary containing all the hyperparameters you want to tune, and the possible values they can take.

2. **Exhaustive Search**: The algorithm trains a model for every combination of hyperparameters specified in the grid.

3. **Best Model**: After training, Grid Search selects the combination of hyperparameters that yields the best performance based on a specified metric (e.g., accuracy, F1 score, etc.).

### Advantages

- **Comprehensive**: Searches through all possible combinations in the defined grid.
  
- **Automated Tuning**: Saves time and effort in manual hyperparameter tuning.

### Limitations

- **Computationally Expensive**: The number of models trained is equal to the product of the number of values for each hyperparameter. 

- **No Guarantees**: The optimal hyperparameters may not be in the predefined grid.

### Example Code in Python

```python
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]
}

# Create a GridSearch object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
```


A Curva ROC é uma ferramenta gráfica utilizada para avaliar o desempenho de um modelo de classificação. Ela é traçada com o TPR (Taxa de Verdadeiros Positivos) no eixo Y e o FPR (Taxa de Falsos Positivos) no eixo X.

### Como Funciona?

1. **Calcular TPR e FPR**: Para cada ponto de corte possível do modelo, calculam-se as taxas de verdadeiros positivos (TPR) e falsos positivos (FPR).

2. **Traçar a Curva**: A curva é traçada no plano TPR vs FPR.

3. **Área Sob a Curva (AUC)**: A área sob a curva ROC (AUC-ROC) é uma métrica que quantifica o desempenho geral do modelo. Um valor de AUC igual a 1 indica um modelo perfeito, enquanto um valor de AUC igual a 0,5 indica um modelo que não tem capacidade de discriminação.

### Vantagens

- **Sensibilidade vs Especificidade**: A Curva ROC permite visualizar o trade-off entre sensibilidade (TPR) e especificidade (1 - FPR).

- **Comparação de Modelos**: É possível comparar o desempenho de diferentes modelos traçando suas curvas ROC no mesmo gráfico.
