# In [None]:
# ============================================================
# Célula 0 — Instalar/atualizar bibliotecas necessárias
# ============================================================
import sys, subprocess

def pip_install(pkg):
    """Install *pkg* quietly with pip, using the interpreter running this script.

    Going through ``sys.executable -m pip`` installs into the same environment
    the code is executing in. ``subprocess.check_call`` raises
    ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(command)

# A pip distribution name is not always the importable module name
# ("scikit-learn" is imported as "sklearn"). Without this mapping the import
# probe for scikit-learn can never succeed and pip is invoked on every run.
_IMPORT_NAMES = {"scikit-learn": "sklearn"}

for pkg in ["pandas", "numpy", "matplotlib", "scikit-learn", "statsmodels"]:
    # Drop any version specifier ("==x" / ">=x") to get the bare distribution name.
    dist_name = pkg.split("==")[0].split(">=")[0]
    try:
        __import__(_IMPORT_NAMES.get(dist_name, dist_name))
    except ImportError:
        # Install only when the module is genuinely missing.
        pip_install(pkg)

import pandas as pd
import numpy as np
import matplotlib
import sklearn
import statsmodels
print("Bibliotecas carregadas com sucesso!")

# ============================================================
# Imports principais
# ============================================================
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.seasonal import seasonal_decompose

# Default figure size (width, height in inches) for every plot below.
plt.rcParams["figure.figsize"] = (9, 4)

# ============================================================
# Função auxiliar para upload no Colab
# ============================================================
def colab_upload(expected_ext=None):
    """Prompt for a file upload in Google Colab and return its path.

    Args:
        expected_ext: Optional file extension (e.g. ``".txt"``). When given,
            the uploaded file name must end with it; the comparison is
            case-insensitive so ``DATA.TXT`` is accepted for ``".txt"``.

    Returns:
        ``Path`` of the first uploaded file, or ``None`` when the upload is
        unavailable (not running in Colab), fails, is empty, or the file has
        the wrong extension. A message is printed in every ``None`` case so
        the caller can fall back to a manual path.
    """
    # Best-effort boundary: outside Colab the import itself fails, and inside
    # Colab the widget may raise; both cases mean "no upload available".
    try:
        from google.colab import files
        uploaded = files.upload()
    except Exception as e:
        print("Upload falhou ou não está no Colab:", e)
        return None

    # The user may cancel the dialog, leaving nothing to pick from.
    if not uploaded:
        print("Upload falhou ou não está no Colab:", "nenhum arquivo foi enviado")
        return None

    fname = next(iter(uploaded))
    if expected_ext and not fname.lower().endswith(expected_ext.lower()):
        print(
            "Upload falhou ou não está no Colab:",
            f"Arquivo enviado não é {expected_ext}. Você enviou: {fname}",
        )
        return None
    return Path(fname)

# ============================================================
# Parte 1 + 2 — Dataset IHEPC
# ============================================================
print("\n=== Upload do dataset household_power_consumption.txt ===")
# colab_upload returns None when no upload is available; in that case fall
# back to a file with the default name in the working directory.
IHEPC_TXT = colab_upload(expected_ext=".txt") or Path("household_power_consumption.txt")
print("Arquivo TXT:", IHEPC_TXT)

def load_ihepc(parse_datetime=True, path=None):
    """Load the household power consumption file into a DataFrame.

    The file is read as ``;``-separated text, ``?`` is treated as a missing
    value, and the seven measurement columns are read as ``float64``.

    Args:
        parse_datetime: When true, add two columns: ``Datetime`` (``Date`` and
            ``Time`` combined, parsed as ``%d/%m/%Y %H:%M:%S``) and
            ``Date_dt`` (``Date`` alone, parsed as ``%d/%m/%Y``). Values that
            do not match the format become ``NaT`` instead of raising.
        path: File to read. Defaults to the module-level ``IHEPC_TXT`` so
            existing calls keep working; passing it explicitly lets the loader
            be reused for another file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    source = IHEPC_TXT if path is None else path

    measurement_cols = [
        'Global_active_power',
        'Global_reactive_power',
        'Voltage',
        'Global_intensity',
        'Sub_metering_1',
        'Sub_metering_2',
        'Sub_metering_3',
    ]
    df = pd.read_csv(
        source,
        sep=';',
        na_values=['?'],
        dtype=dict.fromkeys(measurement_cols, 'float64'),
        low_memory=False,
    )

    if parse_datetime:
        df['Datetime'] = pd.to_datetime(
            df['Date'] + ' ' + df['Time'],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        )
        df['Date_dt'] = pd.to_datetime(
            df['Date'],
            format='%d/%m/%Y',
            errors='coerce',
        )
    return df

# Load the dataset and preview it
df = load_ihepc()
print("\nPrimeiras 10 linhas:")
display(df.head(10))

# Exercises 3–25 (condensed here to fit)
print("\nValores ausentes por coluna:")
na_counts = df.isna().sum()
print(na_counts)

# Calendar features derived from the parsed date column
date_accessor = df['Date_dt'].dt
df['DayOfWeek'] = date_accessor.day_name()
df_2007 = df.loc[date_accessor.year == 2007]

# Mean Global_active_power per calendar day, and the day where it peaks
daily_mean_all = df.groupby('Date_dt')['Global_active_power'].mean()
peak_day = daily_mean_all.idxmax()
print("Dia com maior GAP médio:", peak_day, daily_mean_all.max())

# Voltage histogram (missing readings dropped first)
voltage = df['Voltage'].dropna()
voltage.plot(kind='hist', bins=50)
plt.title("Distribuição de Voltage")
plt.show()

# Regression GAP ~ GI: fit Global_active_power on Global_intensity using only
# rows where both values are present.
valid = df[['Global_active_power', 'Global_intensity']].dropna()
X = valid[['Global_intensity']]
y = valid['Global_active_power']
lr = LinearRegression().fit(X, y)
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root of the MSE works on every version.
# NOTE: this is an in-sample error (the model is scored on its training data).
rmse = np.sqrt(mean_squared_error(y, lr.predict(X)))
print("Regressão Linear GAP~GI -> Coef:", lr.coef_[0], "RMSE:", rmse)

# ============================================================
# Parte 3 — Dataset Appliances
# ============================================================
print("\n=== Upload do dataset appliances_energy_prediction.csv ===")
# colab_upload returns None when no upload is available; in that case fall
# back to a file with the default name in the working directory.
APPLIANCES_CSV = colab_upload(expected_ext=".csv") or Path("appliances_energy_prediction.csv")
print("Arquivo CSV:", APPLIANCES_CSV)

# Read the CSV and convert the 'date' column to datetime in one chain.
ap = pd.read_csv(APPLIANCES_CSV).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
print("\n.head() do Appliances:")
display(ap.head())

# Exercises 26–35
ap['Appliances'].plot(kind='hist', bins=50)
plt.title("Distribuição de Appliances")
plt.show()

# Ten numeric columns most positively correlated with Appliances
# (Appliances itself appears in the list with correlation 1).
print("\nCorrelação com Appliances (top 10):")
corr_with_target = ap.corr(numeric_only=True)['Appliances']
print(corr_with_target.sort_values(ascending=False).head(10))

# PCA on the min-max scaled numeric columns
num_cols_ap = ap.select_dtypes(include=[np.number]).columns

# Scale into a copy so the raw values in `ap` stay untouched.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(ap[num_cols_ap])
ap_scaled = ap.copy()
ap_scaled[num_cols_ap] = scaled_values

# Rows with any missing numeric value are excluded from the PCA fit.
X_ap = ap_scaled[num_cols_ap].dropna()
pca_ap = PCA(n_components=2)
pca_ap.fit(X_ap)
print("\nVariância explicada PCA (Appliances):", pca_ap.explained_variance_ratio_)

# Multiple regression: predict Appliances from every other numeric column,
# with missing feature values replaced by 0.
X = ap[num_cols_ap].drop(columns=['Appliances']).fillna(0)
y = ap['Appliances']
lin = LinearRegression().fit(X, y)
print("\nRegressão Linear Múltipla — R²:", lin.score(X, y))

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42).fit(X, y)
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root of the MSE works on every version.
# NOTE: both scores here are in-sample (models are evaluated on their training data).
rf_rmse = np.sqrt(mean_squared_error(y, rf.predict(X)))
print("Random Forest RMSE:", rf_rmse)

# Binary classification: is consumption above the median?
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Label is 1 when Appliances is strictly above its median, else 0.
appliances_median = ap['Appliances'].median()
ap['HighUse'] = ap['Appliances'].gt(appliances_median).astype(int)

# Median-impute any remaining gaps in the feature matrix.
X_cls = X.fillna(X.median())
y_cls = ap['HighUse']

# 70/30 split, stratified so both sets keep the same class balance.
Xtr, Xte, ytr, yte = train_test_split(
    X_cls,
    y_cls,
    test_size=0.3,
    random_state=42,
    stratify=y_cls,
)

rf_cls = RandomForestClassifier(random_state=42)
rf_cls.fit(Xtr, ytr)
y_pred = rf_cls.predict(Xte)

print("\nMatriz de confusão:")
print(confusion_matrix(yte, y_pred))

print("\nMétricas de avaliação:")
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-Score :", f1_score),
):
    print(label, metric(yte, y_pred))