In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [19]:
# Dataset prefix: it is concatenated with a suffix to build every CSV file
# name this notebook reads or writes (e.g. dataset + "_vars.csv").
dataset = "mgh"

In [20]:
# Load the variable table for the selected dataset and keep the subject
# identifier column at hand for labelling diagnostics.
df = pd.read_csv(f"{dataset}_vars.csv")
subjects = df.loc[:, "subject"]

In [4]:
# Predictor columns; each one is regressed against "nih" on its own in the
# cells that loop over this list.
var_names = [
    "lnm_tot",
    "lnm_pos",
    "lnm_neg",
    "vlsm_tot",
    "lnm_tot_avg",
    "lnm_pos_avg",
    "lnm_neg_avg",
    "vlsm_tot_avg",
    "vlsm_pos_avg",
    "sp_corr_lnm",
    "sp_corr_vlsm",
    "l_size",
]

### Leverage

In [38]:
# Leverage screen: for every predictor, fit nih ~ <predictor> and record which
# subjects have unusually large hat-matrix diagonals.
# Each output column holds four cells, in this row order:
#   0 = moderate-leverage subject ids, 1 = their leverage values,
#   2 = high-leverage subject ids,     3 = their leverage values.
# Every cell is a string of newline-terminated entries ("" when nothing is flagged).
csv = pd.DataFrame(columns=var_names)
for var in var_names:
    # The formula interface drops rows with missing values before fitting, so
    # fit on complete cases explicitly; this keeps each diagnostic value paired
    # with the subject it belongs to and makes n the number of fitted rows.
    fit_df = df.dropna(subset=["nih", var])
    model = smf.ols(formula=f"nih~{var}", data=fit_df).fit()
    leverage = model.get_influence().hat_matrix_diag

    n_obs = len(fit_df)
    assert len(leverage) == n_obs, "diagnostics are not aligned with the fitted rows"
    high_cut = 6 / n_obs
    moderate_cut = 4 / n_obs

    h_subj, h_val, m_subj, m_val = [], [], [], []
    for subj, h_ii in zip(fit_df["subject"], leverage):
        if h_ii > high_cut:
            h_subj.append(str(subj))
            h_val.append(str(round(h_ii, 3)))
        elif h_ii > moderate_cut:
            m_subj.append(str(subj))
            m_val.append(str(round(h_ii, 3)))

    # One entry per line, every entry followed by "\n".
    csv[var] = ["".join(item + "\n" for item in cell) for cell in (m_subj, m_val, h_subj, h_val)]
csv.to_csv(dataset+"_leverage.csv")

### Standardized/Studentized Residuals

In [39]:
# Standardized (internally studentized) residual screen for nih ~ <predictor>.
# Each output column holds four cells, in this row order:
#   0 = subject ids with 2 < |r| <= 3, 1 = their residuals,
#   2 = subject ids with |r| > 3,      3 = their residuals.
# Every cell is a string of newline-terminated entries ("" when nothing is flagged).
# Start from a fresh frame so this cell does not depend on one left behind by
# an earlier cell.
csv = pd.DataFrame(columns=var_names)
for var in var_names:
    # The formula interface drops rows with missing values before fitting, so
    # fit on complete cases explicitly to keep residuals paired with subjects.
    fit_df = df.dropna(subset=["nih", var])
    model = smf.ols(formula=f"nih~{var}", data=fit_df).fit()
    resid = model.get_influence().resid_studentized_internal
    assert len(resid) == len(fit_df), "diagnostics are not aligned with the fitted rows"

    h_subj, h_val, m_subj, m_val = [], [], [], []
    for subj, r_i in zip(fit_df["subject"], resid):
        if abs(r_i) > 3:
            h_subj.append(str(subj))
            h_val.append(str(round(r_i, 3)))
        elif abs(r_i) > 2:
            m_subj.append(str(subj))
            m_val.append(str(round(r_i, 3)))

    # One entry per line, every entry followed by "\n".
    csv[var] = ["".join(item + "\n" for item in cell) for cell in (m_subj, m_val, h_subj, h_val)]
csv.to_csv(dataset+"_stan_res.csv")

In [40]:
# Externally studentized residual screen for nih ~ <predictor>.
# Each output column holds four cells, in this row order:
#   0 = subject ids with 2 < |r| <= 3, 1 = their residuals,
#   2 = subject ids with |r| > 3,      3 = their residuals.
# Every cell is a string of newline-terminated entries ("" when nothing is flagged).
# Start from a fresh frame so this cell does not depend on one left behind by
# an earlier cell.
csv = pd.DataFrame(columns=var_names)
for var in var_names:
    # The formula interface drops rows with missing values before fitting, so
    # fit on complete cases explicitly to keep residuals paired with subjects.
    fit_df = df.dropna(subset=["nih", var])
    model = smf.ols(formula=f"nih~{var}", data=fit_df).fit()
    resid = model.get_influence().resid_studentized_external
    assert len(resid) == len(fit_df), "diagnostics are not aligned with the fitted rows"

    h_subj, h_val, m_subj, m_val = [], [], [], []
    for subj, r_i in zip(fit_df["subject"], resid):
        if abs(r_i) > 3:
            h_subj.append(str(subj))
            h_val.append(str(round(r_i, 3)))
        elif abs(r_i) > 2:
            m_subj.append(str(subj))
            m_val.append(str(round(r_i, 3)))

    # One entry per line, every entry followed by "\n".
    csv[var] = ["".join(item + "\n" for item in cell) for cell in (m_subj, m_val, h_subj, h_val)]
csv.to_csv(dataset+"_stud_res.csv")

### Cook's Distance/Influence

In [40]:
# Cook's distance screen for nih ~ <predictor>.
# Each output column holds four cells, in this row order:
#   0 = subject ids with 0.5 < D <= 1, 1 = their distances,
#   2 = subject ids with D > 1,        3 = their distances.
# Every cell is a string of newline-terminated entries ("" when nothing is flagged).
csv = pd.DataFrame(columns=var_names)
for var in var_names:
    # The formula interface drops rows with missing values before fitting, so
    # fit on complete cases explicitly to keep distances paired with subjects.
    fit_df = df.dropna(subset=["nih", var])
    model = smf.ols(formula=f"nih~{var}", data=fit_df).fit()
    # cooks_distance is indexable; element 0 holds the per-observation distances.
    distances = model.get_influence().cooks_distance[0]
    assert len(distances) == len(fit_df), "diagnostics are not aligned with the fitted rows"

    h_subj, h_val, m_subj, m_val = [], [], [], []
    for subj, d_i in zip(fit_df["subject"], distances):
        if d_i > 1:
            h_subj.append(str(subj))
            h_val.append(str(round(d_i, 3)))
        elif d_i > 0.5:
            m_subj.append(str(subj))
            m_val.append(str(round(d_i, 3)))

    # One entry per line, every entry followed by "\n".
    csv[var] = ["".join(item + "\n" for item in cell) for cell in (m_subj, m_val, h_subj, h_val)]
csv.to_csv(dataset+"_cooks_d.csv")

### Remove Unusual Leverage

In [21]:
# Read back the leverage flags written to "<dataset>_leverage.csv".
lev = pd.read_csv(f"{dataset}_leverage.csv")

In [23]:
# Refit nih ~ <predictor> with the high-leverage subjects (row 2 of the
# leverage table) removed.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    high_cell = str(lev.at[2, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in high_cell:
        high_lev = high_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        pred = df[~subject_ids.isin(high_lev)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

In [25]:
# Refit nih ~ <predictor> with every unusual-leverage subject removed:
# the moderate ones (row 0) plus, when present, the high ones (row 2).
# Predictors with no moderate flags are skipped by this cell.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    moderate_cell = str(lev.at[0, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in moderate_cell:
        u_lev = moderate_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        high_cell = str(lev.at[2, v])
        if "\n" in high_cell:
            u_lev += high_cell.split("\n")[:-1]
        pred = df[~subject_ids.isin(u_lev)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

### Remove Unusual Standardized Residuals

In [26]:
# Read back the standardized-residual flags written to "<dataset>_stan_res.csv".
stan = pd.read_csv(f"{dataset}_stan_res.csv")

In [28]:
# Refit nih ~ <predictor> with the subjects in row 2 of the
# standardized-residual table (|r| > 3) removed.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    high_cell = str(stan.at[2, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in high_cell:
        high_stan = high_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        pred = df[~subject_ids.isin(high_stan)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

In [30]:
# Refit nih ~ <predictor> with every subject flagged in the
# standardized-residual table removed: row 0 (moderate) plus, when present,
# row 2 (high). Predictors with no moderate flags are skipped by this cell.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    moderate_cell = str(stan.at[0, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in moderate_cell:
        u_stan = moderate_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        high_cell = str(stan.at[2, v])
        if "\n" in high_cell:
            u_stan += high_cell.split("\n")[:-1]
        pred = df[~subject_ids.isin(u_stan)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

### Remove Unusual Studentized Residuals

In [31]:
# Read back the studentized-residual flags written to "<dataset>_stud_res.csv".
stud = pd.read_csv(f"{dataset}_stud_res.csv")

In [33]:
# Refit nih ~ <predictor> with the subjects in row 2 of the
# studentized-residual table (|r| > 3) removed.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    high_cell = str(stud.at[2, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in high_cell:
        high_stud = high_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        pred = df[~subject_ids.isin(high_stud)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

In [35]:
# Refit nih ~ <predictor> with every subject flagged in the
# studentized-residual table removed: row 0 (moderate) plus, when present,
# row 2 (high). Predictors with no moderate flags are skipped by this cell.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    moderate_cell = str(stud.at[0, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in moderate_cell:
        u_stud = moderate_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        high_cell = str(stud.at[2, v])
        if "\n" in high_cell:
            u_stud += high_cell.split("\n")[:-1]
        pred = df[~subject_ids.isin(u_stud)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
#         print(v)
#         print(model.summary())
#         print()

### Remove Unusual Cook's Distance

In [36]:
# Read back the Cook's distance flags written to "<dataset>_cooks_d.csv".
cook = pd.read_csv(f"{dataset}_cooks_d.csv")

In [37]:
# Refit nih ~ <predictor> with the subjects in row 2 of the Cook's distance
# table (D > 1) removed, and print each refitted model's summary.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    high_cell = str(cook.at[2, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in high_cell:
        high_cook = high_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        pred = df[~subject_ids.isin(high_cook)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
        print(v)
        print(model.summary())
        print()

In [38]:
# Refit nih ~ <predictor> with every subject flagged in the Cook's distance
# table removed: row 0 (moderate) plus, when present, row 2 (high), and print
# each refitted model's summary.
# Predictors with no moderate flags are skipped by this cell.
# The flagged ids come back from the CSV as text, so compare them against the
# subject column rendered as text; a numeric subject column would otherwise
# never match and no subject would be removed.
subject_ids = df["subject"].astype(str)
for v in var_names:
    moderate_cell = str(cook.at[0, v])
    # An empty cell is read back as NaN, whose text form has no newline.
    if "\n" in moderate_cell:
        u_cook = moderate_cell.split("\n")[:-1]  # drop the empty piece after the final "\n"
        high_cell = str(cook.at[2, v])
        if "\n" in high_cell:
            u_cook += high_cell.split("\n")[:-1]
        pred = df[~subject_ids.isin(u_cook)]
        model = smf.ols(formula=f"nih~{v}", data=pred).fit()
        print(v)
        print(model.summary())
        print()