# ChatGPT Poll Composite Variable Analysis - Feature Seperation 
Modified By: Christian Sarmiento <br>
<br>
Changes:
- Reran ANOVA tests with students and instructors seperated (using EL-recoded dataset and analysis code)

In [31]:
# Imports 
import pandas as pd 
import numpy as np 
import statsmodels.api as sm 
from statsmodels.stats.multicomp import MultiComparison
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in data 
path = '/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Center for Social Media Research/ChatGPT Research/pollcodeddata.csv'
data = pd.read_csv(path)

In [3]:
# Double checking 
data.head(20)

Unnamed: 0,Q2_AWARENESS,Q4_SOURCE,Q5_REASON_STUDENT,Q6_REASON_PROF,Q71_BENEFITS,Q72_LIMITATIONS,Q73_IMPLICATIONS,Q81_LIKELY_PRODUCTIVE,Q82_PRODUCTIVE_STUDENT,Q83_PRODUCTIVE_PROF,...,LIMITATIONS,IMPLICATIONS,WORK_PRODUCTIVITY,PLAGIARISM,SOCIAL_IMPACT,HIGHERED_BENEFIT,USAGE,RESP_USE_TRUST_BENEF,ACCURACY,TRUST
0,1,From the news,1,2,4,4,4,3,3,3,...,2.17173,2.373974,0.705744,-1.463215,0.969679,1.707759,0.895819,1.543423,0.254953,1.750332
1,2,From the news,3,2,1,1,1,1,2,1,...,-1.022405,-0.821896,-0.721571,-0.073755,-0.907521,-1.149544,-0.751619,-0.5493,-0.296804,-0.685842
2,2,From a friend,3,3,1,2,2,1,2,2,...,0.042306,0.243394,-0.587772,-0.628901,0.969679,-1.149544,-0.676335,-0.765636,-1.02581,-0.685842
3,1,From a family member,2,2,1,1,1,2,2,2,...,-1.022405,-0.821896,-0.29775,-0.407348,0.969679,0.279107,-1.315724,-0.708271,-0.10955,-0.685842
4,4,From a friend,4,1,1,2,2,1,3,1,...,0.042306,0.243394,-0.141898,0.798437,-0.907521,-1.149544,0.895819,-0.631084,0.0677,-1.12316
5,2,From the news,1,2,1,2,1,1,2,1,...,0.042306,-0.821896,-0.896927,0.445107,-0.907521,0.279107,-1.018573,-0.952564,0.432203,-0.311102
6,1,From the news,1,3,2,2,2,2,4,2,...,0.042306,0.243394,0.686241,-0.630496,-0.907521,-1.149544,0.895819,0.764724,-0.296804,-0.248524
7,1,From a family member,2,3,1,4,2,2,2,2,...,2.17173,0.243394,-0.122395,0.742417,0.969679,0.279107,-0.394283,-0.06134,-1.02581,-0.685842
8,4,,4,1,2,1,3,1,2,1,...,-1.022405,1.308684,-0.896927,-0.091897,-0.907521,-1.149544,0.895819,-0.212884,-1.02581,-1.060582
9,4,From this questionnaire,4,4,1,3,3,1,1,1,...,1.107018,1.308684,-1.301245,0.205413,-0.907521,-1.149544,0.895819,-0.777972,0.80671,-1.4979


In [34]:
#data.columns

In [25]:
# (EL Code) Recode Age Ranges (Age ranges were in original format for some reason, recoding for consistency)
def recode_age_range(age_range):
    if age_range in ['18-21']:
        return '18-21'
    elif age_range in ['22-25', '26-29']:
        return '22-29'
    elif age_range in ['30-33', '34-37', '38-41']:
        return '30-41'
    elif age_range in ['42-45']:
        return '42-45'
    elif age_range in ['46+']:
        return '46+'
    else:
        return age_range

# Apply the function to the AGE_RANGE column
data['AGE_RANGE'] = data['AGE_RANGE'].apply(recode_age_range)

In [26]:
# Make seperate dfs for students and instructors 
students = data[data['OCCUPATION'] == 'Student']
instructors = data[data['OCCUPATION'] == 'Instructor']

# Analysis of Variance - One-Way ANOVA (Students & Instructors)

In [28]:
# (EL Code) One-Way Anova for Students 
variables = ['AWARENESS', 'BENEFITS', 'LIMITATIONS', 'IMPLICATIONS',
             'WORK_PRODUCTIVITY', 'PLAGIARISM', 'SOCIAL_IMPACT',
             'HIGHERED_BENEFIT', 'USAGE', 'RESP_USE_TRUST_BENEF', 'TRUST']
factors = ['AGE_RANGE' , 'GENDER']

for factor_name in factors:
    for target_name in variables:
        
        # One-way ANOVA
        formula = f"{target_name} ~ C({factor_name})"
        print(formula)
        
        # Perform the ANOVA
        model = sm.formula.ols(formula, data=students).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)

        # Perform Tukey's HSD test
        mc = MultiComparison(students[target_name], students[factor_name])
        tukey_result = mc.tukeyhsd()

        print("ANOVA Results:")
        print(anova_table)

        print("Tukey's HSD Test Results:")
        print(tukey_result)

AWARENESS ~ C(AGE_RANGE)
ANOVA Results:
                  sum_sq     df         F    PR(>F)
C(AGE_RANGE)    8.065338    4.0  2.229021  0.067211
Residual      180.916603  200.0       NaN       NaN
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.2381 0.6464 -0.2397  0.716  False
 18-21  30-41  -0.0989 0.9996 -1.4244 1.2266  False
 18-21  42-45  -0.6411 0.9622 -3.2675 1.9853  False
 18-21    46+   1.2566 0.0724 -0.0689 2.5821  False
 22-29  30-41   -0.337  0.962 -1.7151  1.041  False
 22-29  42-45  -0.8792  0.892 -3.5326 1.7741  False
 22-29    46+   1.0184 0.2534 -0.3596 2.3965  False
 30-41  42-45  -0.5422 0.9863 -3.4694  2.385  False
 30-41    46+   1.3555 0.2623 -0.4959 3.2068  False
 42-45    46+   1.8977 0.3854 -1.0296 4.8249  False
---------------------------------------------------
BENEFITS ~ C(AGE_RANGE)
ANOVA Results:
           

ANOVA Results:
                  sum_sq     df         F    PR(>F)
C(AGE_RANGE)    3.057636    4.0  1.206057  0.309456
Residual      126.761705  200.0       NaN       NaN
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29    0.071 0.9884  -0.329  0.471  False
 18-21  30-41   0.2682 0.9635 -0.8412 1.3777  False
 18-21  42-45  -0.2647 0.9974 -2.4632 1.9337  False
 18-21    46+   0.8269 0.2455 -0.2826 1.9364  False
 22-29  30-41   0.1973 0.9899 -0.9562 1.3508  False
 22-29  42-45  -0.3357 0.9937 -2.5567 1.8853  False
 22-29    46+   0.7559 0.3741 -0.3976 1.9094  False
 30-41  42-45   -0.533 0.9751 -2.9832 1.9173  False
 30-41    46+   0.5587 0.8585  -0.991 2.1083  False
 42-45    46+   1.0916  0.736 -1.3586 3.5419  False
---------------------------------------------------
TRUST ~ C(AGE_RANGE)
ANOVA Results:
                  sum_sq     df        

ANOVA Results:
               sum_sq     df         F    PR(>F)
C(GENDER)    8.481630    3.0  2.741969  0.044347
Residual   207.248565  201.0       NaN       NaN
Tukey's HSD Test Results:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                   
          group1                    group2          meandiff p-adj   lower  upper  reject
-----------------------------------------------------------------------------------------
                   Female                      Male  -0.0407 0.9924 -0.4185 0.3371  False
                   Female Non-binary / third gender  -0.1056 0.9934 -1.1345 0.9232  False
                   Female         Prefer not to say   1.4363 0.0306  0.0947  2.778   True
                     Male Non-binary / third gender  -0.0649 0.9984 -1.0952 0.9653  False
                     Male         Prefer not to say   1.4771 0.0247  0.1343 2.8198   True
Non-binary / third gender         Prefer not to say    1.542 0.0761 -0.1068 3.1908  False
--

In [29]:
# (EL Code) One-Way Anova for Instructors
variables = ['AWARENESS', 'BENEFITS', 'LIMITATIONS', 'IMPLICATIONS',
             'WORK_PRODUCTIVITY', 'PLAGIARISM', 'SOCIAL_IMPACT',
             'HIGHERED_BENEFIT', 'USAGE', 'RESP_USE_TRUST_BENEF', 'TRUST']
factors = ['AGE_RANGE' , 'GENDER']

for factor_name in factors:
    for target_name in variables:
        
        # One-way ANOVA
        formula = f"{target_name} ~ C({factor_name})"
        print(formula)
        
        # Perform the ANOVA
        model = sm.formula.ols(formula, data=instructors).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)

        # Perform Tukey's HSD test
        mc = MultiComparison(instructors[target_name], instructors[factor_name])
        tukey_result = mc.tukeyhsd()

        print("ANOVA Results:")
        print(anova_table)

        print("Tukey's HSD Test Results:")
        print(tukey_result)

AWARENESS ~ C(AGE_RANGE)
ANOVA Results:
                  sum_sq    df        F    PR(>F)
C(AGE_RANGE)    3.377481   4.0  0.72592  0.576463
Residual      107.011922  92.0      NaN       NaN
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.1668 0.9999 -2.9475 3.2812  False
 18-21  30-41  -0.2357 0.9995 -3.3014 2.8299  False
 18-21  42-45  -0.6024 0.9841 -3.7658  2.561  False
 18-21    46+  -0.2552 0.9993 -3.2855 2.7752  False
 22-29  30-41  -0.4026 0.8185 -1.4439 0.6388  False
 22-29  42-45  -0.7693  0.473 -2.0706 0.5321  False
 22-29    46+   -0.422 0.7165 -1.3544 0.5104  False
 30-41  42-45  -0.3667  0.909 -1.5467 0.8133  False
 30-41    46+  -0.0194    1.0 -0.7732 0.7344  False
 42-45    46+   0.3473 0.8997 -0.7377 1.4323  False
---------------------------------------------------
BENEFITS ~ C(AGE_RANGE)
ANOVA Results:
                 

ANOVA Results:
                 sum_sq    df        F    PR(>F)
C(AGE_RANGE)   3.727335   4.0  2.02614  0.097206
Residual      42.311338  92.0      NaN       NaN
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.1137 0.9998 -1.8446  2.072  False
 18-21  30-41   0.1099 0.9999 -1.8177 2.0376  False
 18-21  42-45   0.2872 0.9944  -1.702 2.2763  False
 18-21    46+   0.5228 0.9404 -1.3827 2.4283  False
 22-29  30-41  -0.0037    1.0 -0.6585 0.6511  False
 22-29  42-45   0.1735 0.9763 -0.6448 0.9918  False
 22-29    46+   0.4091  0.303 -0.1772 0.9954  False
 30-41  42-45   0.1772 0.9634 -0.5647 0.9192  False
 30-41    46+   0.4129 0.1182 -0.0611 0.8868  False
 42-45    46+   0.2356 0.8718 -0.4467 0.9179  False
---------------------------------------------------
TRUST ~ C(AGE_RANGE)
ANOVA Results:
                 sum_sq    df         F    PR(>F

ANOVA Results:
              sum_sq    df         F    PR(>F)
C(GENDER)   1.071561   3.0  0.394726  0.757086
Residual   84.155467  93.0       NaN       NaN
Tukey's HSD Test Results:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                   
          group1                    group2          meandiff p-adj   lower  upper  reject
-----------------------------------------------------------------------------------------
                   Female                      Male  -0.0591 0.9908 -0.5782 0.4601  False
                   Female Non-binary / third gender   0.0348    1.0 -1.7572 1.8268  False
                   Female         Prefer not to say   0.9734 0.7417 -1.5382 3.4849  False
                     Male Non-binary / third gender   0.0939 0.9991 -1.7093  1.897  False
                     Male         Prefer not to say   1.0325 0.7073  -1.487  3.552  False
Non-binary / third gender         Prefer not to say   0.9386 0.8516 -2.1093 3.9865  False
--------

# Analysis of Variance - Multi-Way ANOVA (Students & Instructors)

In [32]:
# (EL Code) 2-Way ANOVA for Students - Main Effects & Interactions 

# Define the list of variables
variables = ['AWARENESS', 'BENEFITS', 'LIMITATIONS', 'IMPLICATIONS',
             'WORK_PRODUCTIVITY', 'PLAGIARISM', 'SOCIAL_IMPACT',
             'HIGHERED_BENEFIT', 'USAGE', 'RESP_USE_TRUST_BENEF', 'TRUST']

# Define the factors
factors = ['AGE_RANGE', 'GENDER']

for target_name in variables:
    # Multi-way ANOVA
    formula = f"{target_name} ~ C(AGE_RANGE) * C(GENDER)"
    print(formula)

    # Perform the multi-way ANOVA
    model = sm.formula.ols(formula, data=students).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"Multi-way ANOVA Results:")
    print(anova_table)

    # Perform Tukey's HSD for pairwise comparisons for each factor
    for factor in factors:
        mc = MultiComparison(students[target_name], students[factor])
        result = mc.tukeyhsd()

        print(f"Tukey's HSD Results for {factor}:")
        print(result)

AWARENESS ~ C(AGE_RANGE) * C(GENDER)
Multi-way ANOVA Results:
                            sum_sq     df         F    PR(>F)
C(AGE_RANGE)                   NaN    4.0       NaN       NaN
C(GENDER)                      NaN    3.0       NaN       NaN
C(AGE_RANGE):C(GENDER)   12.779228   12.0  1.299751  0.265629
Residual                158.132230  193.0       NaN       NaN
Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.2381 0.6464 -0.2397  0.716  False
 18-21  30-41  -0.0989 0.9996 -1.4244 1.2266  False
 18-21  42-45  -0.6411 0.9622 -3.2675 1.9853  False
 18-21    46+   1.2566 0.0724 -0.0689 2.5821  False
 22-29  30-41   -0.337  0.962 -1.7151  1.041  False
 22-29  42-45  -0.8792  0.892 -3.5326 1.7741  False
 22-29    46+   1.0184 0.2534 -0.3596 2.3965  False
 30-41  42-45  -0.5422 0.9863 -3.4694  2.385  False
 30-41    46+   1.355

Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.0033    1.0 -0.3778 0.3843  False
 18-21  30-41  -0.0175    1.0 -1.0744 1.0395  False
 18-21  42-45   -0.233 0.9981 -2.3273 1.8613  False
 18-21    46+   0.3831 0.8561 -0.6738   1.44  False
 22-29  30-41  -0.0207    1.0 -1.1196 1.0781  False
 22-29  42-45  -0.2363  0.998 -2.3521 1.8795  False
 22-29    46+   0.3798 0.8762  -0.719 1.4787  False
 30-41  42-45  -0.2156 0.9991 -2.5497 2.1186  False
 30-41    46+   0.4006 0.9451 -1.0757 1.8768  False
 42-45    46+   0.6161 0.9501  -1.718 2.9503  False
---------------------------------------------------
Tukey's HSD Results for GENDER:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                   
          group1                    group2          meandiff p-adj   lower  upper  reject
-------------------------

Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.0409 0.9993 -0.4301 0.5119  False
 18-21  30-41   0.6254 0.6804 -0.6809 1.9318  False
 18-21  42-45  -0.8659 0.8886 -3.4544 1.7226  False
 18-21    46+   0.0876 0.9997 -1.2187 1.3939  False
 22-29  30-41   0.5846 0.7601 -0.7736 1.9427  False
 22-29  42-45  -0.9068 0.8749 -3.5218 1.7082  False
 22-29    46+   0.0467    1.0 -1.3114 1.4049  False
 30-41  42-45  -1.4914 0.6135 -4.3763 1.3936  False
 30-41    46+  -0.5378 0.9269 -2.3625 1.2868  False
 42-45    46+   0.9535 0.8929 -1.9314 3.8385  False
---------------------------------------------------
Tukey's HSD Results for GENDER:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                    
          group1                    group2          meandiff p-adj   lower   upper  reject
-----------------------

In [33]:
# (EL Code) 2-Way ANOVA for Instructors - Main Effects & Interactions 

# Define the list of variables
variables = ['AWARENESS', 'BENEFITS', 'LIMITATIONS', 'IMPLICATIONS',
             'WORK_PRODUCTIVITY', 'PLAGIARISM', 'SOCIAL_IMPACT',
             'HIGHERED_BENEFIT', 'USAGE', 'RESP_USE_TRUST_BENEF', 'TRUST']

# Define the factors
factors = ['AGE_RANGE', 'GENDER']

for target_name in variables:
    # Multi-way ANOVA
    formula = f"{target_name} ~ C(AGE_RANGE) * C(GENDER)"
    print(formula)

    # Perform the multi-way ANOVA
    model = sm.formula.ols(formula, data=instructors).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"Multi-way ANOVA Results:")
    print(anova_table)

    # Perform Tukey's HSD for pairwise comparisons for each factor
    for factor in factors:
        mc = MultiComparison(instructors[target_name], instructors[factor])
        result = mc.tukeyhsd()

        print(f"Tukey's HSD Results for {factor}:")
        print(result)

AWARENESS ~ C(AGE_RANGE) * C(GENDER)
Multi-way ANOVA Results:
                            sum_sq    df         F    PR(>F)
C(AGE_RANGE)              0.538870   4.0  0.111944  0.738756
C(GENDER)                      NaN   3.0       NaN       NaN
C(AGE_RANGE):C(GENDER)    7.033432  12.0  0.487040  0.816361
Residual                103.495206  86.0       NaN       NaN
Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.1668 0.9999 -2.9475 3.2812  False
 18-21  30-41  -0.2357 0.9995 -3.3014 2.8299  False
 18-21  42-45  -0.6024 0.9841 -3.7658  2.561  False
 18-21    46+  -0.2552 0.9993 -3.2855 2.7752  False
 22-29  30-41  -0.4026 0.8185 -1.4439 0.6388  False
 22-29  42-45  -0.7693  0.473 -2.0706 0.5321  False
 22-29    46+   -0.422 0.7165 -1.3544 0.5104  False
 30-41  42-45  -0.3667  0.909 -1.5467 0.8133  False
 30-41    46+  -0.0194    

Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29   0.0528    1.0  -2.093 2.1985  False
 18-21  30-41   0.3792 0.9872  -1.733 2.4913  False
 18-21  42-45    0.453 0.9779 -1.7265 2.6326  False
 18-21    46+   0.5356 0.9528 -1.5523 2.6234  False
 22-29  30-41   0.3264 0.7126 -0.3911 1.0439  False
 22-29  42-45   0.4003 0.7268 -0.4964 1.2969  False
 22-29    46+   0.4828 0.2327 -0.1596 1.1252  False
 30-41  42-45   0.0738 0.9991 -0.7391 0.8868  False
 30-41    46+   0.1564  0.918 -0.3629 0.6757  False
 42-45    46+   0.0826  0.998  -0.665 0.8301  False
---------------------------------------------------
Tukey's HSD Results for GENDER:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                   
          group1                    group2          meandiff p-adj   lower  upper  reject
-------------------------

Tukey's HSD Results for AGE_RANGE:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 18-21  22-29  -0.6562 0.9359 -2.9987 1.6864  False
 18-21  30-41  -0.9625 0.7731 -3.2684 1.3433  False
 18-21  42-45  -0.7435 0.9073 -3.1229  1.636  False
 18-21    46+  -0.5469 0.9628 -2.8262 1.7325  False
 22-29  30-41  -0.3064 0.8121 -1.0896 0.4769  False
 22-29  42-45  -0.0873 0.9991 -1.0661 0.8916  False
 22-29    46+   0.1093 0.9925  -0.592 0.8107  False
 30-41  42-45   0.2191 0.9589 -0.6685 1.1066  False
 30-41    46+   0.4157 0.2554 -0.1513 0.9827  False
 42-45    46+   0.1966 0.9623 -0.6195 1.0128  False
---------------------------------------------------
Tukey's HSD Results for GENDER:
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                    
          group1                    group2          meandiff p-adj   lower   upper  reject
-----------------------