In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [2]:
# Remove every pandas display limit (rows, columns, total width, cell width)
# so frames shown in this notebook are never truncated.
for option_name in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(option_name, None)

In [3]:
# Full demographics/behaviour table; later cells filter it by
# redcap_event_name and study_id.
raw = pd.read_csv(filepath_or_buffer="FCS_Demographics_and_Behavior_MDF.csv")

In [49]:
# Prefix of the predictor file loaded below as "<dataset>_vars.csv".
dataset = "year"

In [50]:
# Per-subject predictor table for the selected dataset.
pred = pd.read_csv(f"{dataset}_vars.csv")

In [51]:
# Subject identifiers present in the predictor table, as a plain Python list.
subjects = pred.loc[:, "subject"].tolist()

In [52]:
# Keep only the "basic_subject_info_arm_1" rows of `raw` that belong to a
# subject in `subjects`, restricted to the demographic columns used below.
demo_columns = ["study_id", "age", "ethnicity", "race", "gender", "education", "handed", "smoke"]
is_info_event = raw["redcap_event_name"].eq("basic_subject_info_arm_1")
is_study_subject = raw["study_id"].isin(subjects)
all_demo = raw.loc[is_info_event & is_study_subject, demo_columns]

In [53]:
# Attach the demographic columns to `pred`, matching each row's "subject" to
# the "study_id" of `all_demo`.
#
# `all_demo` keeps the row order of `raw`, which is not guaranteed to be the
# row order of `pred`. Copying the columns over by position would silently
# give subjects each other's demographics whenever the two orders differ, so
# the values are looked up by ID instead.
demo_cols = ["age", "ethnicity", "race", "gender", "education", "handed", "smoke"]
demo_by_id = all_demo.set_index("study_id")[demo_cols]

# An ID-based lookup is only well defined with exactly one row per study_id.
if not demo_by_id.index.is_unique:
    duplicated_ids = demo_by_id.index[demo_by_id.index.duplicated()].unique().tolist()
    raise ValueError(f"all_demo has several rows for study_id(s): {duplicated_ids}")

# Fail loudly (as the positional copy did, via a length mismatch) when a
# subject has no demographics row, instead of filling in NaN silently.
unmatched = pred.loc[~pred["subject"].isin(demo_by_id.index), "subject"].tolist()
if unmatched:
    raise ValueError(f"no demographics row in all_demo for subject(s): {unmatched}")

for col in demo_cols:
    pred[col] = pred["subject"].map(demo_by_id[col])

In [9]:
# Columns of `pred` used, one at a time, as the predictor in the regressions
# and as the compared quantity in the group t-tests below.
# NOTE(review): "vlsm_pos" is a column of `pred` but is not listed here,
# while "vlsm_pos_avg" is; confirm the omission is intended.
var_names = [
    "lnm_tot",
    "lnm_pos",
    "lnm_neg",
    "vlsm_tot",
    "wm_total",
    "lnm_tot_avg",
    "lnm_pos_avg",
    "lnm_neg_avg",
    "vlsm_tot_avg",
    "vlsm_pos_avg",
    "wm_avg",
    "sp_corr_lnm",
    "sp_corr_vlsm",
    "sp_corr_wm",
    "l_size",
]

### Age

In [55]:
# Disabled exploratory plot: scatter of pred["nih"] against pred["age"].
# Uncomment the three lines below to render it.
# plt.scatter(np.array(pred["age"]), np.array(pred["nih"]))
# plt.xlabel("Age (years)")
# plt.ylabel("NIHSS")

In [57]:
# Ordinary least squares fit of nih on age over all subjects.
# Call model.summary() to inspect the fit.
model = smf.ols("nih~age", data=pred).fit()

In [59]:
# Age-adjusted fits: nih ~ age + <predictor>, one predictor at a time.
# `model` is rebound on every pass, so only the last fit remains afterwards;
# call model.summary() inside the loop to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~age+{name}", data=pred).fit()

### Gender

In [60]:
# Subjects coded gender == 1 (the group named "females" here); the last line
# displays how many there are.
females = pred.loc[pred["gender"].eq(1)]
len(females)

39

In [153]:
# Single-predictor fits nih ~ <predictor> within the `females` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=females).fit()

In [61]:
# Subjects coded gender == 0 (the group named "males" here); the last line
# displays how many there are.
males = pred.loc[pred["gender"].eq(0)]
len(males)

49

In [156]:
# Single-predictor fits nih ~ <predictor> within the `males` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=males).fit()

In [63]:
# Welch's t-test (equal_var=False) of females vs. males, first on nih and then
# on every predictor in var_names. `t` is rebound each time, so only the last
# result remains; print(name, t) inside the loop to see each comparison.
t = ss.ttest_ind(females["nih"].to_numpy(), males["nih"].to_numpy(), equal_var=False)
for name in var_names:
    female_values = females[name].to_numpy()
    male_values = males[name].to_numpy()
    t = ss.ttest_ind(female_values, male_values, equal_var=False)

### Smoking

In [64]:
# Subjects coded smoke == 1; the last line displays how many there are.
smokers = pred.loc[pred["smoke"].eq(1)]
len(smokers)

40

In [163]:
# Single-predictor fits nih ~ <predictor> within the `smokers` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~{name}", data=smokers).fit()

In [65]:
# Subjects coded smoke == 0; the last line displays how many there are.
nonsmokers = pred.loc[pred["smoke"].eq(0)]
len(nonsmokers)

48

In [166]:
# Single-predictor fits nih ~ <predictor> within the `nonsmokers` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~{name}", data=nonsmokers).fit()

In [67]:
# Welch's t-test (equal_var=False) of smokers vs. nonsmokers, first on nih and
# then on every predictor in var_names. Only the last result remains in `t`;
# print(name, t) inside the loop to see each comparison.
t = ss.ttest_ind(smokers["nih"].to_numpy(), nonsmokers["nih"].to_numpy(), equal_var=False)
for name in var_names:
    smoker_values = smokers[name].to_numpy()
    nonsmoker_values = nonsmokers[name].to_numpy()
    t = ss.ttest_ind(smoker_values, nonsmoker_values, equal_var=False)

### Handedness

In [13]:
# Subjects coded handed == 0 (the group named "left" here).
left = pred.loc[pred["handed"].eq(0)]

In [12]:
# Single-predictor fits nih ~ <predictor> within the `left` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=left).fit()

In [14]:
# Subjects coded handed == 1 (the group named "right" here).
right = pred.loc[pred["handed"].eq(1)]

In [15]:
# Single-predictor fits nih ~ <predictor> within the `right` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=right).fit()

### Ethnicity

In [170]:
# Subjects coded ethnicity == 0 (the group named "hl" here).
hl = pred.loc[pred["ethnicity"].eq(0)]

In [None]:
# Single-predictor fits nih ~ <predictor> within the `hl` subset.
#
# NOTE(review): this cell has no execution count, and several race subsets
# further down are empty, so `hl` may well be empty or tiny. Fitting on such a
# subset is expected to fail or be meaningless and would stop a
# Restart & Run All, so the fits are skipped when there are too few rows.
# Rows with missing values may shrink the usable sample further; confirm.
min_rows = 3  # intercept + one slope need more than 2 rows for a residual d.o.f.
if len(hl) < min_rows:
    print(f"Skipping regressions for ethnicity == 0: only {len(hl)} row(s) in this subset")
else:
    # Only the last fit is left in `model`; call model.summary() inside the
    # loop to inspect each one.
    for name in var_names:
        model = smf.ols(formula=f"nih~{name}", data=hl).fit()

In [None]:
# Subjects coded ethnicity == 1 (the group named "nhl" here).
nhl = pred.loc[pred["ethnicity"].eq(1)]

In [None]:
# Single-predictor fits nih ~ <predictor> within the `nhl` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~{name}", data=nhl).fit()

### Race

In [171]:
# Subjects coded race == 0; the frame itself is displayed.
is_race_0 = pred["race"].eq(0)
ai_an = pred.loc[is_race_0]
ai_an

Unnamed: 0,nih,lnm_tot,lnm_pos,lnm_neg,vlsm_tot,vlsm_pos,sp_corr_lnm,sp_corr_vlsm,lnm_tot_avg,lnm_pos_avg,lnm_neg_avg,vlsm_tot_avg,vlsm_pos_avg,l_size,subject,wm_total,wm_avg,sp_corr_wm,age,ethnicity,race,gender,education,handed,smoke


In [172]:
# Subjects coded race == 1; the frame itself is displayed.
is_race_1 = pred["race"].eq(1)
asi = pred.loc[is_race_1]
asi

Unnamed: 0,nih,lnm_tot,lnm_pos,lnm_neg,vlsm_tot,vlsm_pos,sp_corr_lnm,sp_corr_vlsm,lnm_tot_avg,lnm_pos_avg,lnm_neg_avg,vlsm_tot_avg,vlsm_pos_avg,l_size,subject,wm_total,wm_avg,sp_corr_wm,age,ethnicity,race,gender,education,handed,smoke


In [173]:
# Subjects coded race == 2; the frame itself is displayed.
is_race_2 = pred["race"].eq(2)
nh_pi = pred.loc[is_race_2]
nh_pi

Unnamed: 0,nih,lnm_tot,lnm_pos,lnm_neg,vlsm_tot,vlsm_pos,sp_corr_lnm,sp_corr_vlsm,lnm_tot_avg,lnm_pos_avg,lnm_neg_avg,vlsm_tot_avg,vlsm_pos_avg,l_size,subject,wm_total,wm_avg,sp_corr_wm,age,ethnicity,race,gender,education,handed,smoke


In [68]:
# Subjects coded race == 3; the last line displays how many there are.
b_aa = pred.loc[pred["race"].eq(3)]
len(b_aa)

60

In [126]:
# Single-predictor fits nih ~ <predictor> within the `b_aa` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=b_aa).fit()

In [69]:
# Subjects coded race == 4; the last line displays how many there are.
wh = pred.loc[pred["race"].eq(4)]
len(wh)

28

In [129]:
# Single-predictor fits nih ~ <predictor> within the `wh` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    formula = "nih~" + name
    model = smf.ols(formula=formula, data=wh).fit()

In [71]:
# Welch's t-test (equal_var=False) of the b_aa vs. wh subsets, first on nih
# and then on every predictor in var_names. Only the last result remains in
# `t`; print(name, t) inside the loop to see each comparison.
t = ss.ttest_ind(b_aa["nih"].to_numpy(), wh["nih"].to_numpy(), equal_var=False)
for name in var_names:
    b_aa_values = b_aa[name].to_numpy()
    wh_values = wh[name].to_numpy()
    t = ss.ttest_ind(b_aa_values, wh_values, equal_var=False)

### Education

In [72]:
# Subjects with education <= 12; the last line displays how many there are.
hs_less = pred.loc[pred["education"].le(12)]
len(hs_less)

45

In [183]:
# Single-predictor fits nih ~ <predictor> within the `hs_less` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~{name}", data=hs_less).fit()

In [73]:
# Subjects with education > 12; the last line displays how many there are.
more_hs = pred.loc[pred["education"].gt(12)]
len(more_hs)

43

In [186]:
# Single-predictor fits nih ~ <predictor> within the `more_hs` subset.
# Only the last fit is left in `model`; call model.summary() inside the loop
# to inspect each one.
for name in var_names:
    model = smf.ols(formula=f"nih~{name}", data=more_hs).fit()

In [76]:
# Welch's t-test (equal_var=False) of the hs_less vs. more_hs subsets, first
# on nih and then on every predictor in var_names. Only the last result
# remains in `t`; print(name, t) inside the loop to see each comparison.
t = ss.ttest_ind(hs_less["nih"].to_numpy(), more_hs["nih"].to_numpy(), equal_var=False)
for name in var_names:
    hs_less_values = hs_less[name].to_numpy()
    more_hs_values = more_hs[name].to_numpy()
    t = ss.ttest_ind(hs_less_values, more_hs_values, equal_var=False)