In [1]:
import pandas as pd
import chardet
from pandasgui import show
from sklearn.preprocessing import LabelEncoder

### __Survey Data__

In [2]:
# Location of the survey export, relative to the notebook's working directory.
RESPONSES_CSV = './data/responses.csv'

# Load the survey responses using pandas' default CSV parsing options.
responses_df = pd.read_csv(RESPONSES_CSV)

In [3]:
def encode_column_names(columns):
    """Map short question codes to the original column names.

    The column at position 0 is skipped and never receives a code. The
    remaining positions are grouped as follows:

    * positions 1-18   -> ``SQ1`` .. ``SQ18``
    * positions 19-23  -> ``BQ1`` .. ``BQ5``
    * positions 24-28  -> ``DQ1`` .. ``DQ5``
    * positions 29-end -> ``MQ1`` .. ``MQn`` (every remaining column)

    Args:
        columns: Indexable, sliceable sequence of column names.

    Returns:
        dict: ``{code: original_name}`` in column order.

    Raises:
        IndexError: If ``columns`` has fewer than 19 entries.
    """
    # SQ columns are fetched by position (not by slice), so a sequence that is
    # too short fails loudly instead of yielding a partial mapping.
    code_to_name = {f"SQ{pos}": columns[pos] for pos in range(1, 19)}

    # The remaining groups are numbered from 1 within their own slice.
    trailing_groups = (
        ("BQ", slice(19, 24)),
        ("DQ", slice(24, 29)),
        ("MQ", slice(29, None)),
    )
    for prefix, span in trailing_groups:
        for number, name in enumerate(columns[span], start=1):
            code_to_name[f"{prefix}{number}"] = name

    return code_to_name

In [4]:
# Guard against re-execution: after the first run the columns already carry the
# short codes, so regenerating the mappings here would overwrite the original
# question texts and the raw student IDs with their encoded stand-ins.
if 'SQ1_encoded' in responses_df.columns:
    raise RuntimeError(
        "responses_df has already been encoded; reload ./data/responses.csv "
        "before running this cell again."
    )

# Step 1: Replace the long question texts with short codes
# encoded_mapping is {code: original name}; rename() needs the inverse.
original_columns = responses_df.columns.tolist()
encoded_mapping = encode_column_names(original_columns)
responses_df = responses_df.rename(columns={v: k for k, v in encoded_mapping.items()})

# Step 2: Fill Missing Values
# NOTE(review): this fills every column, including 'SQ1'. Blank student IDs
# therefore all become the single label 'Not Taken' and share one encoded value;
# if 'SQ1' is otherwise numeric the encoder may reject the mixed types -- confirm
# against the raw data.
responses_df = responses_df.fillna('Not Taken')

# Step 3: Encode Student IDs in 'SQ1'
label_encoder = LabelEncoder()
responses_df['SQ1'] = label_encoder.fit_transform(responses_df['SQ1'])
responses_df = responses_df.rename(columns={'SQ1': 'SQ1_encoded'})

# Save mapping for later use
# Keys are the raw student IDs, values their integer codes; treat this dict as
# sensitive and keep it out of shared outputs.
id_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [5]:
# Canonical labels for the free-text major answers. Keys are matched against the
# answers after they have been stripped and upper-cased below.
replacement_map = {
    'ARTIFICIAL INTELLIGENCE IN PRODUCT INNOVATION': 'AIPI',
    'ARTIFICIAL INTELLIGENCE': 'AIPI',
    'AI': 'AIPI',
    'M.ENG AIPI': 'AIPI',
    'MENG IN AIPI': 'AIPI',
    'COMPUTER SCIENCE, PHYSICS': 'CS & PHYSICS',
    'PHYSICS, COMPUTER SCIENCE': 'CS & PHYSICS',
    'ECONOMIC S': 'ECON',
    'UNDECLARED - STATISTICAL SCIENCE': 'UNDECLARED/STATSCI',
    'LINGUISTICS AND SPANISH': 'LINGUISTICS & SPANISH',
    'MECHANICAL ENGINEERING': 'MECHE',
    'FGG': 'UNDECLARED'
}

# Normalize 'SQ2' and apply replacements
# Series.replace with a dict swaps whole values only, so a short key such as
# 'AI' cannot rewrite part of a longer major name.
responses_df['SQ2'] = (
    responses_df['SQ2']
    .str.strip()
    .str.upper()
    .replace(replacement_map)
)

print("\nMapping of Encoded Column Names:")
print(encoded_mapping)

# id_mapping's keys are the raw student IDs. Echoing the dict would store them in
# the saved notebook output and undo the encoding, so only report how many IDs
# were mapped.
print("\nMapping of Original IDs to Encoded Values:")
print(f"{len(id_mapping)} student IDs encoded (raw IDs withheld from notebook output)")


Mapping of Encoded Column Names:
{'SQ1': "What's your Duke Student ID?\n(Leave blank if not a Duke student)", 'SQ2': "What's your major?", 'SQ3': 'Are you an AIPI Student at Duke?', 'SQ4': 'Are you an Undergraduate or Graduate Student?', 'SQ5': 'Study Time Spent per day in Hours?', 'SQ6': 'How much do you feel you have participated in class?', 'SQ7': 'How much do you feel you use what you have learned outside of class?', 'SQ8': 'How comfortable are you with using generative AI tools like ChatGPT to assist with your learning?  ', 'SQ9': 'In which aspects of your learning have you used generative AI tools?', 'SQ10': 'How frequently do you use generative AI tools for learning-related tasks?', 'SQ11': 'To what extent do you feel generative AI has improved your academic performance?  ', 'SQ12': 'What concerns, if any, do you have about using generative AI in your learning process?  ', 'SQ13': 'How would you rate your overall academic performance in the past few weeks of the semester?', 'SQ

In [6]:
# Display the survey frame for a visual check of the short column codes,
# the encoded ID column and the normalized majors.
responses_df

Unnamed: 0,Timestamp,SQ1_encoded,SQ2,SQ3,SQ4,SQ5,SQ6,SQ7,SQ8,SQ9,...,DQ1,DQ2,DQ3,DQ4,DQ5,MQ1,MQ2,MQ3,MQ4,MQ5
0,10/15/2024 18:09:15,1,AIPI,Yes,Graduate,2,2,3,5,"Writing Assignments, General Problem-Solving, ...",...,To collect data from various sources for analy...,Verifying the quality and accuracy of the data,To extract real-time data from various online ...,It can extract data from websites without need...,Data lakes,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
1,10/19/2024 9:28:01,28,AIPI,Yes,Graduate,6,2,3,5,"Writing Assignments, Research, General Problem...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,No single algorithm performs best across all t...,There are significant multicollinearity issues...,The model is violating the assumption of linea...,The true population parameter lies within the ...,The model is too simple to capture the data's ...
2,10/19/2024 9:34:52,15,BUSINESS,No,Graduate,3,5,5,5,"Research, General Problem-Solving, Studying fo...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
3,10/19/2024 9:49:56,4,CS & PHYSICS,Yes,Graduate,4,4,5,5,"Writing Assignments, Research, General Problem...",...,To collect data from various sources for analy...,Verifying the quality and accuracy of the data,To extract real-time data from various online ...,It can extract data from websites without need...,Streaming data sources,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
4,10/19/2024 9:54:08,3,UNDECLARED/STATSCI,No,Undergraduate,2,5,4,5,"Writing Assignments, Research, General Problem...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
5,10/19/2024 10:16:20,21,AIPI,Yes,Graduate,4,5,4,5,"Writing Assignments, General Problem-Solving",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,No single algorithm performs best across all t...,There are significant multicollinearity issues...,The model is violating the assumption of linea...,The interval contains the true population para...,The model is too simple to capture the data's ...
6,10/19/2024 11:40:50,11,ECON,Yes,Undergraduate,6,5,4,4,"Research, General Problem-Solving",...,To store data securely in a data warehouse,Verifying the quality and accuracy of the data,To store data in relational databases,It can extract data from websites without need...,Data lakes,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
7,10/19/2024 23:56:37,25,AIPI,Yes,Graduate,3,2,2,4,"Writing Assignments, Research, General Problem...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken
8,10/20/2024 13:47:36,16,AIPI,Yes,Graduate,5,3,4,3,"Research, General Problem-Solving, Studying fo...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,No single algorithm performs best across all t...,There are significant multicollinearity issues...,The model is violating the assumption of linea...,The true population parameter lies within the ...,The model is too simple to capture the data's ...
9,10/22/2024 11:36:39,7,AIPI,Yes,Graduate,15,5,4,5,"Writing Assignments, General Problem-Solving, ...",...,Not Taken,Not Taken,Not Taken,Not Taken,Not Taken,No single algorithm performs best across all t...,There are significant multicollinearity issues...,The model is violating the assumption of linea...,The true population parameter lies within the ...,The model is too simple to capture the data's ...


### __Public Domain Data__

In [7]:
# Read the file as raw bytes so chardet can guess its text encoding.
with open('./supplemental_material/coded_en.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()

# The file handle is already closed here; detection only needs the bytes.
result = chardet.detect(raw_bytes)
print(result)


{'encoding': 'windows-1251', 'confidence': 0.936604085794506, 'language': 'Bulgarian'}


In [8]:
# Parse the semicolon-separated file as windows-1251, then label every missing
# cell 'No Response'.
coded_en_df = (
    pd.read_csv(
        './supplemental_material/coded_en.csv',
        sep=';',
        encoding='windows-1251',
    )
    .fillna('No Response')
)

In [9]:
# Display the supplemental survey frame after missing cells were filled with
# 'No Response'.
coded_en_df

Unnamed: 0,Timestamp,Q1,Q2,Q3,Q4,Q5.1,Q5.2,Q5.3,Q5.4,Q5.5,...,Q8.2,Q8.3,Q8.4,Q8.5,Q9.1,Q9.2,Q9.3,Q9.4,Q9.5,Q10
0,5.14.2023 21:22:19,2,International Economic Relations,1,4,4,4,5,5,4,...,5,5,4,5,4,5,5,4,4,No Response
1,5.15.2023 8:45:52,1,International Economic Relations,1,3,2,4,2,1,3,...,4,4,2,3,3,4,3,2,2,No Response
2,5.15.2023 9:54:15,1,International Economic Relations,1,2,3,3,3,2,3,...,4,4,3,3,3,3,2,2,3,No Response
3,5.15.2023 15:17:41,1,International Economic Relations,1,2,4,4,4,2,4,...,4,4,4,4,4,4,3,4,4,No Response
4,5.15.2023 15:19:02,1,International Economic Relations,1,3,4,4,3,2,4,...,4,4,3,3,3,3,3,3,3,No Response
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,5.26.2023 15:44:55,1,International Economic Relations,1,4,4,3,4,4,4,...,4,4,2,3,3,4,3,4,4,ChtGPT helps me a lot in preparing for assignm...
127,5.26.2023 21:43:23,1,Finance,1,1,3,3,2,2,3,...,4,4,4,4,4,4,4,4,4,No Response
128,5.27.2023 10:28:36,1,Marketing,1,5,4,4,3,3,3,...,5,5,4,4,2,4,3,4,4,No Response
129,5.28.2023 0:51:54,1,Public Administration,2,4,4,4,4,4,3,...,4,4,4,4,2,3,3,3,3,No Response


### __PandasGUI__

In [10]:
# Open the survey responses in the PandasGUI viewer.
# The trailing semicolon keeps the returned PandasGui object's repr, which embeds
# a per-run memory address, out of the saved cell output.
show(responses_df);

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x2c4d947a170>