# Projeto Final

Carolina Abdu e Mariana Meirelles

## Import de Bibliotecas

In [8]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import gamma, norm

## Pré-processamento dos Dados

In [9]:
df = pd.read_csv("./ndt_tests_corrigido.csv")
display(df)


Unnamed: 0,timestamp,download_throughput_bps,rtt_download_sec,upload_throughput_bps,rtt_upload_sec,packet_loss_percent,client,server
0,2025-08-09 15:28:02.000 +0000,8.223657e+07,0.231862,6.973217e+07,0.247727,0.000000,client12,server06
1,2025-08-09 15:30:11.000 +0000,9.027315e+08,0.012000,8.521780e+08,0.005423,0.008226,client01,server07
2,2025-08-10 04:27:43.000 +0000,5.910655e+08,0.014000,2.812188e+08,0.014544,5.954284,client13,server07
3,2025-08-09 22:45:07.000 +0000,6.721139e+08,0.011000,1.135400e+08,0.010482,0.261528,client12,server07
4,2025-08-10 04:49:21.000 +0000,8.122087e+08,0.009000,6.857905e+08,0.009368,1.381646,client03,server03
...,...,...,...,...,...,...,...,...
7082,2025-08-07 16:05:20.000 +0000,-1.000000e+00,-0.001000,6.486659e+08,0.006574,-1.000000,client04,server07
7083,2025-08-18 10:07:18.000 +0000,-1.000000e+00,-0.001000,6.790900e+08,0.005833,-1.000000,client04,server07
7084,2025-08-20 08:59:39.000 +0000,-1.000000e+00,-0.001000,8.799421e+08,0.013070,-1.000000,client04,server02
7085,2025-08-25 04:39:11.000 +0000,-1.000000e+00,-0.001000,6.481266e+08,0.009771,-1.000000,client03,server02


1. Remoção de valores ausentes

In [10]:
df = df.dropna()

2. Eliminação de valores negativos

In [11]:
df = df[(df.select_dtypes(include='number') >= 0).all(axis=1)].copy()
display(df)


Unnamed: 0,timestamp,download_throughput_bps,rtt_download_sec,upload_throughput_bps,rtt_upload_sec,packet_loss_percent,client,server
0,2025-08-09 15:28:02.000 +0000,8.223657e+07,0.231862,6.973217e+07,0.247727,0.000000,client12,server06
1,2025-08-09 15:30:11.000 +0000,9.027315e+08,0.012000,8.521780e+08,0.005423,0.008226,client01,server07
2,2025-08-10 04:27:43.000 +0000,5.910655e+08,0.014000,2.812188e+08,0.014544,5.954284,client13,server07
3,2025-08-09 22:45:07.000 +0000,6.721139e+08,0.011000,1.135400e+08,0.010482,0.261528,client12,server07
4,2025-08-10 04:49:21.000 +0000,8.122087e+08,0.009000,6.857905e+08,0.009368,1.381646,client03,server03
...,...,...,...,...,...,...,...,...
7077,2025-08-30 23:24:52.000 +0000,8.250996e+08,0.004822,8.540814e+08,0.004414,0.000000,client01,server07
7078,2025-08-30 23:35:01.000 +0000,9.067732e+08,0.003719,7.891003e+08,0.003797,0.902384,client06,server07
7079,2025-08-30 23:44:20.000 +0000,1.067567e+08,0.117000,3.867771e+08,0.007549,0.384653,client07,server07
7080,2025-08-30 23:44:28.000 +0000,9.034447e+08,0.004296,6.387240e+08,0.004337,0.041953,client03,server07


3. Conversão de unidades e padronização

In [12]:
df['packet_loss_fraction'] = df['packet_loss_percent'] / 100

## Seção 6 

### Seção 6.1 - (LRT) Modelo Gama–Gama para throughput

In [None]:
tp_a = df[df['client'] == 'client05']['download_throughput_bps']
tp_b = df[df['client'] == 'client12']['download_throughput_bps']

# Estatísticas
tp_combined = np.concatenate([tp_a, tp_b])

params_h0 = gamma.fit(tp_combined, floc=0) 
log_l0 = np.sum(gamma.logpdf(tp_combined, *params_h0))

print(f"Item 1 - Log-Verossimilhança H0 (Gama): {log_l0:.4f}")
print(f"Parâmetros H0 (shape, loc, scale): {params_h0}")

# Item 2
params_h1_a = gamma.fit(tp_a, floc=0)
params_h1_b = gamma.fit(tp_b, floc=0)

log_l1_a = np.sum(gamma.logpdf(tp_a, *params_h1_a))
log_l1_b = np.sum(gamma.logpdf(tp_b, *params_h1_b))
log_l1 = log_l1_a + log_l1_b

print(f"\nItem 2 - Log-Verossimilhança H1 (Gama): {log_l1:.4f}")
print(f"Parâmetros H1 Client A: {params_h1_a}")
print(f"Parâmetros H1 Client B: {params_h1_b}")

# Item 3
w_obs = -2 * (log_l0 - log_l1)

print(f"\nItem 3 - Validação de W:")
print(f"Estatística W_obs: {w_obs:.4f}")

# Item 4
alpha = 0.05
df_gl = 2 # Graus de liberdade

valor_critico = stats.chi2.ppf(1 - alpha, df=df_gl)
p_valor = 1 - stats.chi2.cdf(w_obs, df=df_gl)

print(f"\nItem 4 - Estatísticas de Teste:")
print(f"Graus de Liberdade: {df_gl}")
print(f"Valor Crítico (Chi2, alpha=5%): {valor_critico:.4f}")
print(f"P-valor: {p_valor:.4e}")

if w_obs > valor_critico:
    print("\nCONCLUSÃO: Rejeitamos H0. As distribuições (e médias) são estatisticamente diferentes sob o modelo Gama.")
else:
    print("\nCONCLUSÃO: Não rejeitamos H0. Não há evidência de diferença entre os grupos.")

Item 1 - Log-Verossimilhança H0 (Gama): -23324.2538
Parâmetros H0 (shape, loc, scale): (1.2449657706899, 0, np.float64(319750574.4401329))

Item 2 - Log-Verossimilhança H1 (Gama): -23289.0015
Parâmetros H1 Client A: (1.1131683448366672, 0, np.float64(278063035.6667998))
Parâmetros H1 Client B: (1.484816464696758, 0, np.float64(313012352.60810274))

Item 3 - Validação de W:
Estatística W_obs: 70.5047

Item 4 - Estatísticas de Teste:
Graus de Liberdade: 2
Valor Crítico (Chi2, alpha=5%): 5.9915
P-valor: 4.4409e-16

CONCLUSÃO: Rejeitamos H0. As distribuições (e médias) são estatisticamente diferentes sob o modelo Gama.


### Seção 6.2 - (LRT) Modelo Normal–Normal para RTT

In [None]:
rtt_a = df[df['client'] == 'client05']['rtt_download_sec']
rtt_b = df[df['client'] == 'client12']['rtt_download_sec']

# Estatísticas básicas
n_a = len(rtt_a)
n_b = len(rtt_b)
mean_a = np.mean(rtt_a)
mean_b = np.mean(rtt_b)

# Sob H0, as amostras vêm da mesma distribuição (pool de dados)
data_all = np.concatenate([rtt_a, rtt_b])

# Variância MLE (sigma^2) do conjunto combinado: sum((x - mu)^2) / n
sigma2_mle = np.var(data_all) 
sigma_mle = np.sqrt(sigma2_mle)


print(f"Cliente 05: n={n_a}, média={mean_a:.6f}")
print(f"Cliente 12: n={n_b}, média={mean_b:.6f}")
print(f"Variância (sigma^2) MLE Combinada: {sigma2_mle:.6e}")

# Item 1 
mu_h0 = np.mean(data_all)
log_l0 = np.sum(norm.logpdf(data_all, loc=mu_h0, scale=sigma_mle))

print(f"\nItem 1 - Log-Verossimilhança H0: {log_l0:.4f}")

# Item 2
log_l1_a = np.sum(norm.logpdf(rtt_a, loc=mean_a, scale=sigma_mle))
log_l1_b = np.sum(norm.logpdf(rtt_b, loc=mean_b, scale=sigma_mle))
log_l1 = log_l1_a + log_l1_b

print(f"Item 2 - Log-Verossimilhança H1: {log_l1:.4f}")

# Item 3 
w_definicao = -2 * (log_l0 - log_l1)

termo_n = (n_a * n_b) / (n_a + n_b)
w_obs_formula = (1 / sigma2_mle) * termo_n * (mean_a - mean_b)**2

print(f"\nItem 3 - Validação de W:")
print(f"W (pela definição Log-Razão):      {w_definicao:.4f}")
print(f"W (pela fórmula):    {w_obs_formula:.4f}")
print(f"W/Wobs:   {w_definicao/w_obs_formula:.4f}")

# Item 4 
alpha = 0.05
valor_critico = stats.chi2.ppf(1 - alpha, df=1)
p_valor = 1 - stats.chi2.cdf(w_obs_formula, df=1)


print(f"\nItem 4 - Resultado do Teste:")
print(f"Estatística W observada: {w_obs_formula:.4f}")
print(f"Valor Crítico (Chi2, df=1): {valor_critico:.4f}")
print(f"P-valor: {p_valor:.4e}")

if w_obs_formula > valor_critico:
    print("\nConclusão: REJEITAMOS H0. As médias de RTT são significativamente diferentes.")
else:
    print("\nConclusão: NÃO REJEITAMOS H0. Não há diferença estatística entre as médias de RTT.")

Cliente 05: n=482, média=0.045166
Cliente 12: n=640, média=0.086485
Variância (sigma^2) MLE Combinada: 7.849080e-03

Item 1 - Log-Verossimilhança H0: 1127.3194
Item 2 - Log-Verossimilhança H1: 1157.2203

Item 3 - Validação de W:
W (pela definição Log-Razão):      59.8018
W (pela fórmula):    59.8018
W/Wobs:   1.0000

Item 4 - Resultado do Teste:
Estatística W observada: 59.8018
Valor Crítico (Chi2, gl=1): 3.8415
P-valor: 1.0436e-14

Conclusão: REJEITAMOS H0. As médias de RTT são significativamente diferentes.
