In [40]:
import pandas as pd
import scipy.stats
from statsmodels.multivariate.manova import MANOVA
import numpy as np


from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error,r2_score

In [2]:
df = pd.read_csv("Removed_outliers_byGroup_data.csv")

# Assigning the groups
df['group'] = df.apply(lambda row: f"{int(row['HINPOV'])},{int(row['PENINC'])},{int(row['HIGOV'])},{int(row['RETMON'])},{int(row['SLFEMP'])}", axis=1)
groups = df['group'].unique()
print(groups)
continuous_features = ['BMI', 'INHPFN',  'HHHRES', 'HCHILD','LIVSIB',  'HAIRA', 'HATOTB', 'IEARN','HITOT', 'PRPCNT']
cate_features = ['HINPOV','PENINC', 'HIGOV','RETMON', 'SLFEMP']
target = ["SHLT","MSTOT","COGTOT"]

# Dropping the categorical features
df = df.drop(cate_features, axis = 1)
df

['0,0,0,0,0' '0,0,0,0,1' '0,0,1,0,0' '0,0,1,0,1' '0,0,1,1,0' '0,0,1,1,1'
 '0,1,1,0,0' '0,1,1,1,0' '0,1,1,1,1' '1,0,0,0,0']


Unnamed: 0,SHLT,BMI,MSTOT,COGTOT,INHPFN,HHHRES,HCHILD,LIVSIB,HAIRA,HATOTB,IEARN,HITOT,PRPCNT,group
0,5.0,33.0,14.0,17.0,0.0,2.0,4.0,0.0,0.0,0.0,20000.0,22400.0,0.0,00000
1,4.0,23.8,8.0,14.0,0.0,2.0,6.0,2.0,0.0,15000.0,25000.0,107000.0,1.0,00000
2,3.0,26.0,15.0,27.0,0.0,2.0,2.0,1.0,40000.0,290000.0,103000.0,134384.0,1.0,00000
3,4.0,40.7,11.0,16.0,0.0,3.0,4.0,7.0,0.0,16477.0,62000.0,72157.0,0.0,00000
4,3.0,22.8,15.0,31.0,0.0,4.0,4.0,4.0,4000.0,138300.0,15000.0,95660.0,1.0,00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,4.0,39.6,14.0,23.0,0.0,2.0,5.0,6.0,0.0,27500.0,27000.0,29112.0,0.0,10000
33534,1.0,18.4,14.0,27.0,0.0,5.0,6.0,5.0,0.0,90000.0,0.0,0.0,1.0,10000
33535,1.0,26.9,9.0,25.0,0.0,2.0,7.0,7.0,0.0,2341.0,3295.0,3295.0,1.0,10000
33536,4.0,29.3,13.0,23.0,0.0,3.0,3.0,7.0,0.0,0.0,18000.0,18000.0,0.0,10000


In [9]:
# Print the correlation among the target variables for each group
for group in groups:
    print(group)
    print(df[df["group"]==group][target].corr())

    correlation, p_value = scipy.stats.pearsonr(df['SHLT'], df['MSTOT'])
    print(f'\nPearson correlation between SHLT and MSTOT: {correlation}, P-value: {p_value}')

    correlation, p_value = scipy.stats.pearsonr(df['SHLT'], df['COGTOT'])
    print(f'Pearson correlation between SHLT and COGTOT: {correlation}, P-value: {p_value}')

    correlation, p_value = scipy.stats.pearsonr(df['COGTOT'], df['MSTOT'])
    print(f'Pearson correlation between MSTOT and COGTOT: {correlation}, P-value: {p_value}\n')


0,0,0,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.167408 -0.189673
MSTOT  -0.167408  1.000000  0.668407
COGTOT -0.189673  0.668407  1.000000

Pearson correlation between SHLT and MSTOT: -0.14602879933827834, P-value: 3.1975319883446005e-159
Pearson correlation between SHLT and COGTOT: -0.18897019698202974, P-value: 3.618519293858185e-267
Pearson correlation between MSTOT and COGTOT: 0.6382783707627973, P-value: 0.0

0,0,0,0,1
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.168911 -0.189017
MSTOT  -0.168911  1.000000  0.685551
COGTOT -0.189017  0.685551  1.000000

Pearson correlation between SHLT and MSTOT: -0.14602879933827834, P-value: 3.1975319883446005e-159
Pearson correlation between SHLT and COGTOT: -0.18897019698202974, P-value: 3.618519293858185e-267
Pearson correlation between MSTOT and COGTOT: 0.6382783707627973, P-value: 0.0

0,0,1,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.132131 -0.206216
MSTOT  -0.132131  1.000000  0.666644
COGTO

In [8]:
# Generating the unique combinations of groups
pairwise_combinations = [(groups[i], groups[j]) for i in range(len(groups)) for j in range(i+1, len(groups))]
significant_pairs = []

manova = pd.DataFrame(columns=['Group Pair', "P-Value"])

# Perform MANOVA for each pair of groups
for group1, group2 in pairwise_combinations:
    temp_df = df[df['group'].isin([group1, group2])]
    
    maov = MANOVA.from_formula('SHLT+MSTOT+COGTOT ~ group', data=temp_df)

    result = maov.mv_test()
    
    group_p_value = result.results['group']['stat']['Pr > F'][0]

    current_pair = pd.DataFrame([[f"{group1} and {group2}", group_p_value]], columns=['Group Pair', "P-Value"])
    manova = pd.concat([manova, current_pair])
    
    if group_p_value > 0.1:
        significant_pairs.append((group1, group2))

print("Significant group pairs:", significant_pairs)
manova.to_csv("MANOVA_results.csv", index=False)
manova


Significant group pairs: [('0,0,1,0,1', '0,1,1,0,0'), ('0,0,1,0,1', '0,1,1,1,0'), ('0,1,1,0,0', '0,1,1,1,0')]


Unnamed: 0,Group Pair,P-Value
0,"0,0,0,0,0 and 0,0,0,0,1",1.012832e-12
0,"0,0,0,0,0 and 0,0,1,0,0",1.15092e-36
0,"0,0,0,0,0 and 0,0,1,0,1",7.519252e-73
0,"0,0,0,0,0 and 0,0,1,1,0",2.080887e-81
0,"0,0,0,0,0 and 0,0,1,1,1",7.991716e-71
0,"0,0,0,0,0 and 0,1,1,0,0",5.512407e-20
0,"0,0,0,0,0 and 0,1,1,1,0",6.441421e-61
0,"0,0,0,0,0 and 0,1,1,1,1",4.4169e-52
0,"0,0,0,0,0 and 1,0,0,0,0",1.498657e-109
0,"0,0,0,0,1 and 0,0,1,0,0",7.151693e-39


In [5]:
df2 = df.copy()
df2["group"] = df2["group"].str.replace('0,0,1,0,1' , "combin")
df2["group"] = df2["group"].str.replace('0,1,1,0,0' , "combin")
df2["group"] = df2["group"].str.replace('0,1,1,1,0' , "combin")
df2["group"] = df2["group"].str.replace('combin' , "0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0")
new_groups = df2["group"].unique()
print(groups)

# Print the correlation among the target variables for each group
for group in new_groups:
    print(group)
    print(df2[df2["group"]==group][target].corr(), "\n")

['0,0,0,0,0' '0,0,0,0,1' '0,0,1,0,0' '0,0,1,0,1' '0,0,1,1,0' '0,0,1,1,1'
 '0,1,1,0,0' '0,1,1,1,0' '0,1,1,1,1' '1,0,0,0,0']
0,0,0,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.167408 -0.189673
MSTOT  -0.167408  1.000000  0.668407
COGTOT -0.189673  0.668407  1.000000 

0,0,0,0,1
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.168911 -0.189017
MSTOT  -0.168911  1.000000  0.685551
COGTOT -0.189017  0.685551  1.000000 

0,0,1,0,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.132131 -0.206216
MSTOT  -0.132131  1.000000  0.666644
COGTOT -0.206216  0.666644  1.000000 

0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.101293 -0.171701
MSTOT  -0.101293  1.000000  0.589529
COGTOT -0.171701  0.589529  1.000000 

0,0,1,1,0
            SHLT     MSTOT    COGTOT
SHLT    1.000000 -0.114583 -0.178451
MSTOT  -0.114583  1.000000  0.620651
COGTOT -0.178451  0.620651  1.000000 

0,0,1,1,1
            SHLT     MSTOT    COGTOT
SHLT  

In [9]:
print(df2.groupby("group").size())
print("")
print(df.groupby("group").size())

group
0,0,0,0,0                            11859
0,0,0,0,1                             1969
0,0,1,0,0                             4554
0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0     6138
0,0,1,1,0                             4067
0,0,1,1,1                             2967
0,1,1,1,1                             1418
1,0,0,0,0                              566
dtype: int64

group
0,0,0,0,0    11859
0,0,0,0,1     1969
0,0,1,0,0     4554
0,0,1,0,1     2532
0,0,1,1,0     4067
0,0,1,1,1     2967
0,1,1,0,0      883
0,1,1,1,0     2723
0,1,1,1,1     1418
1,0,0,0,0      566
dtype: int64


In [10]:
# df2.to_csv("ThreeForthSemester.csv", index=False)

In [37]:
df = pd.read_csv("ThreeForthSemester.csv")

trans = make_column_transformer((StandardScaler(), continuous_features),
                                remainder="passthrough")

df2 = pd.DataFrame( trans.fit_transform( df ) ,
                   columns=trans.get_feature_names_out( ))\
                    .rename(columns = {f"standardscaler__{col}":col for col in [*continuous_features]})\
                    .rename({"remainder__SHLT" : "SHLT", "remainder__MSTOT" : "MSTOT", "remainder__COGTOT" : "COGTOT", "remainder__group" : "group"}, axis=1)

df2

# df2.to_csv("ThreeForthSemester_StandardExceptTarget.csv", index=False)

Unnamed: 0,BMI,INHPFN,HHHRES,HCHILD,LIVSIB,HAIRA,HATOTB,IEARN,HITOT,PRPCNT,SHLT,MSTOT,COGTOT,group
0,0.920647,-0.126169,-0.600292,0.426414,-1.217448,-0.418911,-0.59898,-0.227751,-0.860736,-1.34082,5.0,14.0,17.0,00000
1,-0.851875,-0.126169,-0.600292,1.504356,-0.375403,-0.418911,-0.579889,-0.090159,0.189366,0.358474,4.0,8.0,14.0,00000
2,-0.428011,-0.126169,-0.600292,-0.651528,-0.796425,-0.152079,-0.229887,2.056276,0.529271,0.358474,3.0,15.0,27.0,00000
3,2.404172,-0.126169,0.366366,0.426414,1.72971,-0.418911,-0.578009,0.928022,-0.243124,-1.34082,4.0,11.0,16.0,00000
4,-1.044541,-0.126169,1.333024,0.426414,0.466642,-0.392228,-0.422961,-0.365343,0.048608,0.358474,3.0,15.0,31.0,00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,2.19224,-0.126169,-0.600292,0.965385,1.308688,-0.418911,-0.563979,-0.035122,-0.777423,-1.34082,4.0,14.0,23.0,10000
33534,-1.892269,-0.126169,2.299682,1.504356,0.887665,-0.418911,-0.484434,-0.778119,-1.138777,0.358474,1.0,14.0,27.0,10000
33535,-0.254612,-0.126169,-0.600292,2.043327,1.72971,-0.418911,-0.596,-0.687446,-1.097877,0.358474,1.0,9.0,25.0,10000
33536,0.207785,-0.126169,0.366366,-0.112557,1.72971,-0.418911,-0.59898,-0.282788,-0.915351,-1.34082,4.0,13.0,23.0,10000


In [43]:
groups = df2['group'].unique()

for group in groups:
    rf = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=300)
    X = df2[df2["group"] == group].drop(["group"]+target, axis=1)
    y = df2[df2["group"] == group][target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(f"Group: {group}")
    for target_var in target:
        print(f"Target: {target_var}")
        print("==> R2:", r2_score(y_test[target_var], y_pred[:, target.index(target_var)]))
        print("==> MAPE:", mean_absolute_percentage_error(y_test[target_var], y_pred[:, target.index(target_var)]))
    
    print("\n=======================================================")


Group: 0,0,0,0,0
Target: SHLT
==> R2: 0.7854814431722511
==> MAPE: 0.17370592092936105
Target: MSTOT
==> R2: 0.7550975275226096
==> MAPE: 0.04911323708456929
Target: COGTOT
==> R2: 0.7828317955694768
==> MAPE: 0.05756167484980132

Group: 0,0,0,0,1
Target: SHLT
==> R2: 0.7997707490746868
==> MAPE: 0.1751133671742809
Target: MSTOT
==> R2: 0.7779799673011455
==> MAPE: 0.044283229460894434
Target: COGTOT
==> R2: 0.7558656495808715
==> MAPE: 0.05201826531069649

Group: 0,0,1,0,0
Target: SHLT
==> R2: 0.7745538731144631
==> MAPE: 0.17442312378167643
Target: MSTOT
==> R2: 0.7832717572655462
==> MAPE: 0.05589212929289684
Target: COGTOT
==> R2: 0.7407057092568239
==> MAPE: 0.07020742657713767

Group: 0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0
Target: SHLT
==> R2: 0.7499965591688423
==> MAPE: 0.17788653637350704
Target: MSTOT
==> R2: 0.7774237475230213
==> MAPE: 0.04013036660227214
Target: COGTOT
==> R2: 0.7642800186399836
==> MAPE: 0.055775389104250936

Group: 0,0,1,1,0
Target: SHLT
==> R2: 0.76708374269