In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.linear_model import ElasticNetCV

seed = 420
np.random.seed(seed)

In [2]:
def generate_data(dimention:int,data_len:int)->pd.DataFrame:
    variables = np.array(range(p),dtype=float)
    data = pd.DataFrame()
    for idx, variable in enumerate(variables):
        mu, sigma = variable, math.sqrt(variable + 1)
        generated = np.random.normal(mu,sigma,data_len)
        data[variable] = generated
    return data.copy()

def build_response(data:pd.DataFrame,colums_to_include:list,values:list,name = "y")->pd.Series:
    if np.shape(colums_to_include) != np.shape(values):
        raise ValueError('Columns and values have different shapes')
    response = np.random.normal(0,1,np.shape(data)[0])
    for i, column in enumerate(colums_to_include):
        response += data[column]*values[i]
    return pd.Series(name=name,data = response)

Il vero modello è definito come $y = 16\beta_5 + 8\beta_{10} + 4\beta_{15} + 2\beta_{20} + \varepsilon$.

> La $i$-esima variabile è campionata da $N ~ (i,\sqrt{i})$.

In [3]:
p = 21
data_len = 100
toUse = [5,10,15,20]
values = [16,8,4,2]

data = generate_data(dimention = p,
                     data_len = data_len)
y = build_response(data,toUse,values)

In [4]:
rng = np.random.default_rng(seed)

def sample_DataFrame(df:pd.DataFrame,sample_percentage_size:float)->pd.DataFrame:
    """"Funzione per campionare una percentuale dei dati dal DataSet."""
    if not (0.0 < sample_percentage_size < 1):
        raise AttributeError(f"sample_percentage_size must be in (0,1), got {sample_percentage_size} instead.")
    sample_size = int(df.shape[0] * sample_percentage_size)
    sample_idx = rng.integers(low=0,high=df.shape[0],size=sample_size)
    print(sample_idx)
    return df.iloc[sample_idx]

def split_DataFrame(df:pd.DataFrame, sample_percentage_size=0.5)->tuple[pd.DataFrame,pd.DataFrame]:
    """Funzione per dividere il Dataset in 2 parti. Il primo Dataset ha elementi pari alla percentuale fornita."""
    df1 = sample_DataFrame(df,sample_percentage_size)
    df2 = df.iloc[~df1.index]
    print(f"First DataFrame has {df1.shape[0]} items, Second DataFrame has {df2.shape[0]} items.")
    return df1,df2

# Passi della simulazione:

**Ripeti B volte da 1 a 4**

1. Dividere il DataSet in due parti uguali $I_1$ e $I_2$
2. Applicare uno stimatore regolarizzato (LASSO) a $I_1$ ed estrarre i coefficienti diversi da zero
3. Applicare a $I_2$ un OLS con soli i parametri ottenuti da $I_2$
4. Definito $s = |\{\beta \ne 0, \forall \beta \in I_1\}|$ e $P_{raw,j}$ il j-esimo p-value
- Correggere i p-value ottenuti con $P_{corr,j} = min(P_{raw,j}*s,1)$
5. Aggregare i $P_{corr}$ della simulazione utilizzando:
- media
- mediana
- min-max

> Gli intervalli di confidenza possono essere aggregati con gli stessi metodi del punto 5.

La ripetizione (solitamente con $B = 50$ o $B = 100$) serve per avere dei risultati riproducibili indipendentemente dal seme usato.

In [5]:
#TODO: Trovare un modo per separare la y
def applyLasso(X:pd.DataFrame,y:pd.Series)-> tuple[int,float]:
    lasso = ElasticNetCV(cv=5,random_state=seed,l1_ratio = 1)
    lasso.fit(X, y)
    print(f"Best alpha is: {lasso.alpha_}")
    return nonZeroCoeffiecients(X.columns,lasso.coef_)

def nonZeroCoeffiecients(dataColumns,regression_coefficients):
    coefficients = list(zip(dataColumns,regression_coefficients))
    coefficients_copy = coefficients.copy()
    for coefficient in coefficients_copy:
        if np.isclose(coefficient[1],0.0):
            coefficients.remove(coefficient)
    return coefficients

In [6]:
B = 100

I_1, I_2 = split_DataFrame(data)
coefficients = applyLasso(I_1,y)
print(f"There are {len(coefficients)} non zero coefficients:\n {coefficients}")

[61 77 12 94 87 98  1 41 38 64 63 75 62  1 61 49 17 61 22  9 52 35 89 89
  8 23 21 71 79 19 60 43 74 94 21 58 36 10 72 77 80 98 11 41 82 56 43  4
 41 87]
First DataFrame has 50 items, Second DataFrame has 50 items.


ValueError: Found input variables with inconsistent numbers of samples: [50, 100]

# I_2 per OLS


In [None]:
coefficients_name = [coefficient[0] for coefficient in coefficients]

X_I_2 = I_2.drop("y",axis=1)
y_I_2 = I_2["y"]

X_I_2 = X_I_2.loc[:,X_I_2.columns.isin(coefficients_name)]
X_I_2

In [None]:
import statsmodels.api as sm

X_I_2 = sm.add_constant(X_I_2)
least_square = sm.OLS(y_I_2, X_I_2)
results = least_square.fit()
print(results.summary())

In [None]:
results.conf_int(alpha=.05)

In [None]:
results.pvalues

In [None]:
# Aggiustare i pvalue in base al numero di elementi in S(I_1)
results.pvalues * len(coefficients)


In [None]:
print(results.summary())