In [37]:
import pandas as pd
import scipy.stats
from statsmodels.multivariate.manova import MANOVA
import numpy as np


from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [38]:
df = pd.read_csv("Removed_outliers_byGroup_data.csv")

# Assigning the groups
df['group'] = df.apply(lambda row: f"{int(row['HINPOV'])},{int(row['PENINC'])},{int(row['HIGOV'])},{int(row['RETMON'])},{int(row['SLFEMP'])}", axis=1)
groups = df['group'].unique()
print(groups)
continuous_features = ['BMI', 'INHPFN',  'HHHRES', 'HCHILD','LIVSIB',  'HAIRA', 'HATOTB', 'IEARN','HITOT', 'PRPCNT']
cate_features = ['HINPOV','PENINC', 'HIGOV','RETMON', 'SLFEMP']
target = ["SHLT","MSTOT","COGTOT"]

# Dropping the categorical features
df = df.drop(cate_features, axis = 1)
df

['0,0,0,0,0' '0,0,0,0,1' '0,0,1,0,0' '0,0,1,0,1' '0,0,1,1,0' '0,0,1,1,1'
 '0,1,1,0,0' '0,1,1,1,0' '0,1,1,1,1' '1,0,0,0,0']


Unnamed: 0,SHLT,BMI,MSTOT,COGTOT,INHPFN,HHHRES,HCHILD,LIVSIB,HAIRA,HATOTB,IEARN,HITOT,PRPCNT,group
0,5.0,33.0,14.0,17.0,0.0,2.0,4.0,0.0,0.0,0.0,20000.0,22400.0,0.0,00000
1,4.0,23.8,8.0,14.0,0.0,2.0,6.0,2.0,0.0,15000.0,25000.0,107000.0,1.0,00000
2,3.0,26.0,15.0,27.0,0.0,2.0,2.0,1.0,40000.0,290000.0,103000.0,134384.0,1.0,00000
3,4.0,40.7,11.0,16.0,0.0,3.0,4.0,7.0,0.0,16477.0,62000.0,72157.0,0.0,00000
4,3.0,22.8,15.0,31.0,0.0,4.0,4.0,4.0,4000.0,138300.0,15000.0,95660.0,1.0,00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,4.0,39.6,14.0,23.0,0.0,2.0,5.0,6.0,0.0,27500.0,27000.0,29112.0,0.0,10000
33534,1.0,18.4,14.0,27.0,0.0,5.0,6.0,5.0,0.0,90000.0,0.0,0.0,1.0,10000
33535,1.0,26.9,9.0,25.0,0.0,2.0,7.0,7.0,0.0,2341.0,3295.0,3295.0,1.0,10000
33536,4.0,29.3,13.0,23.0,0.0,3.0,3.0,7.0,0.0,0.0,18000.0,18000.0,0.0,10000


In [48]:
# Print the correlation among the target variables for each group
for group in groups:
    print(group)
    print(df[df["group"]==group][target].corr())

    correlation, p_value = scipy.stats.pearsonr(df['SHLT'], df['MSTOT'])
    print(f'\nPearson correlation between SHLT and MSTOT: {correlation}, P-value: {p_value}')

    correlation, p_value = scipy.stats.pearsonr(df['SHLT'], df['COGTOT'])
    print(f'Pearson correlation between SHLT and COGTOT: {correlation}, P-value: {p_value}')

    correlation, p_value = scipy.stats.pearsonr(df['COGTOT'], df['MSTOT'])
    print(f'Pearson correlation between MSTOT and COGTOT: {correlation}, P-value: {p_value}\n')


0,0,0,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.167408 -0.189673
MSTOT  -0.167408  1.000000  0.668407
COGTOT -0.189673  0.668407  1.000000

Pearson correlation between SHLT and MSTOT: -0.14602879933827834, P-value: 3.1975319883446005e-159
Pearson correlation between SHLT and COGTOT: -0.18897019698202974, P-value: 3.618519293858185e-267
Pearson correlation between MSTOT and COGTOT: 0.6382783707627973, P-value: 0.0

0,0,0,0,1
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.168911 -0.189017
MSTOT  -0.168911  1.000000  0.685551
COGTOT -0.189017  0.685551  1.000000

Pearson correlation between SHLT and MSTOT: -0.14602879933827834, P-value: 3.1975319883446005e-159
Pearson correlation between SHLT and COGTOT: -0.18897019698202974, P-value: 3.618519293858185e-267
Pearson correlation between MSTOT and COGTOT: 0.6382783707627973, P-value: 0.0

0,0,1,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.132131 -0.206216
MSTOT  -0.132131  1.000000  0.666644
COGTO

In [4]:
# Generating the unique combinations of groups
pairwise_combinations = [(groups[i], groups[j]) for i in range(len(groups)) for j in range(i+1, len(groups))]
significant_pairs = []

# Perform MANOVA for each pair of groups
for group1, group2 in pairwise_combinations:
    temp_df = df[df['group'].isin([group1, group2])]
    
    maov = MANOVA.from_formula('SHLT+MSTOT+COGTOT ~ group', data=temp_df)

    result = maov.mv_test()
    print(f"MANOVA between {group1} and {group2}:")
    
    group_p_value = result.results['group']['stat']['Pr > F'][0]
    print(f"P-value: {group_p_value}\n")
    
    if group_p_value > 0.1:
        significant_pairs.append((group1, group2))

print("Significant group pairs:", significant_pairs)


MANOVA between 0,0,0,0,0 and 0,0,0,0,1:
P-value: 1.0128320048164901e-12

MANOVA between 0,0,0,0,0 and 0,0,1,0,0:
P-value: 1.1509202714085733e-36

MANOVA between 0,0,0,0,0 and 0,0,1,0,1:
P-value: 7.519252017116213e-73

MANOVA between 0,0,0,0,0 and 0,0,1,1,0:
P-value: 2.0808869823462375e-81

MANOVA between 0,0,0,0,0 and 0,0,1,1,1:
P-value: 7.991715566316587e-71

MANOVA between 0,0,0,0,0 and 0,1,1,0,0:
P-value: 5.512406700693228e-20

MANOVA between 0,0,0,0,0 and 0,1,1,1,0:
P-value: 6.441421145250314e-61

MANOVA between 0,0,0,0,0 and 0,1,1,1,1:
P-value: 4.416899659084358e-52

MANOVA between 0,0,0,0,0 and 1,0,0,0,0:
P-value: 1.498656725832206e-109

MANOVA between 0,0,0,0,1 and 0,0,1,0,0:
P-value: 7.151692526131521e-39

MANOVA between 0,0,0,0,1 and 0,0,1,0,1:
P-value: 4.3698417959793267e-44

MANOVA between 0,0,0,0,1 and 0,0,1,1,0:
P-value: 2.0110937805851863e-57

MANOVA between 0,0,0,0,1 and 0,0,1,1,1:
P-value: 2.9064296902981532e-53

MANOVA between 0,0,0,0,1 and 0,1,1,0,0:
P-value: 1.712503

In [5]:
df2 = df.copy()
df2["group"] = df2["group"].str.replace('0,0,1,0,1' , "combin")
df2["group"] = df2["group"].str.replace('0,1,1,0,0' , "combin")
df2["group"] = df2["group"].str.replace('0,1,1,1,0' , "combin")
df2["group"] = df2["group"].str.replace('combin' , "0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0")
new_groups = df2["group"].unique()
print(groups)

# Print the correlation among the target variables for each group
for group in new_groups:
    print(group)
    print(df2[df2["group"]==group][target].corr(), "\n")

['0,0,0,0,0' '0,0,0,0,1' '0,0,1,0,0' '0,0,1,0,1' '0,0,1,1,0' '0,0,1,1,1'
 '0,1,1,0,0' '0,1,1,1,0' '0,1,1,1,1' '1,0,0,0,0']
0,0,0,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.167408 -0.189673
MSTOT  -0.167408  1.000000  0.668407
COGTOT -0.189673  0.668407  1.000000 

0,0,0,0,1
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.168911 -0.189017
MSTOT  -0.168911  1.000000  0.685551
COGTOT -0.189017  0.685551  1.000000 

0,0,1,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.132131 -0.206216
MSTOT  -0.132131  1.000000  0.666644
COGTOT -0.206216  0.666644  1.000000 

0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.101293 -0.171701
MSTOT  -0.101293  1.000000  0.589529
COGTOT -0.171701  0.589529  1.000000 

0,0,1,1,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.114583 -0.178451
MSTOT  -0.114583  1.000000  0.620651
COGTOT -0.178451  0.620651  1.000000 

0,0,1,1,1
            SHLT     MSTOT    COGTOT
SHLT  

In [9]:
print(df2.groupby("group").size())
print("")
print(df.groupby("group").size())

group
0,0,0,0,0                            11859
0,0,0,0,1                             1969
0,0,1,0,0                             4554
0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0     6138
0,0,1,1,0                             4067
0,0,1,1,1                             2967
0,1,1,1,1                             1418
1,0,0,0,0                              566
dtype: int64

group
0,0,0,0,0    11859
0,0,0,0,1     1969
0,0,1,0,0     4554
0,0,1,0,1     2532
0,0,1,1,0     4067
0,0,1,1,1     2967
0,1,1,0,0      883
0,1,1,1,0     2723
0,1,1,1,1     1418
1,0,0,0,0      566
dtype: int64


In [10]:
# df2.to_csv("ThreeForthSemester.csv", index=False)

In [49]:
trans = make_column_transformer((StandardScaler(), continuous_features + target),
                                remainder="passthrough")

df2 = pd.DataFrame( trans.fit_transform( df ) , columns=trans.get_feature_names_out( )).rename(columns = {f"standardscaler__{col}":col for col in [*continuous_features, *target]}).rename({"remainder__group":"group"}, axis=1)
# df2.to_csv("ThreeForthSemester_standard.csv", index=False)

# df2["MS+COG"] = (df2["MSTOT"] + df2["COGTOT"])/2
# df2.drop(["MSTOT", "COGTOT"], axis=1, inplace=True)

# df2.to_csv("ThreeForthSemester_standard_MS+COG.csv", index=False)

In [None]:
'''
分target变量进行grid search cv
MS+GOG, SHLT

不分target变量进行grid search cv
MSTOT, COGTOT, SHLT
'''