In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Load the CSV file
file_path = '../../data/survey/Twitter_May_29,_2024_01.07.csv'
df = pd.read_csv(file_path)

# Display the DataFrame
df.head()

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,Immigrants_1,Helping needy_1,Country,State,Education,Gender,Gender_4_TEXT,Ethnicity,Income,Age
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,Immigrants,Helping needy,List of Countries,"50 States, D.C. and Puerto Rico",What is the highest level of school you have c...,How do you describe yourself? - Selected Choice,How do you describe yourself? - Prefer to self...,Choose one or more races that you consider you...,What was your total household income before ta...,How old are you?
1,"{""ImportId"":""startDate"",""timeZone"":""America/De...","{""ImportId"":""endDate"",""timeZone"":""America/Denv...","{""ImportId"":""status""}","{""ImportId"":""progress""}","{""ImportId"":""duration""}","{""ImportId"":""finished""}","{""ImportId"":""recordedDate"",""timeZone"":""America...","{""ImportId"":""_recordId""}","{""ImportId"":""distributionChannel""}","{""ImportId"":""userLanguage""}",...,"{""ImportId"":""QID61_1""}","{""ImportId"":""QID78_1""}","{""ImportId"":""QID16""}","{""ImportId"":""QID1215139908""}","{""ImportId"":""QID18""}","{""ImportId"":""QID88""}","{""ImportId"":""QID88_4_TEXT""}","{""ImportId"":""QID57""}","{""ImportId"":""QID62""}","{""ImportId"":""QID65_TEXT""}"
2,2024-03-18 08:46:12,2024-03-18 08:47:02,IP Address,100,50,True,2024-03-18 08:47:03,R_3sX6aUYcY3Gqqs7,anonymous,EN,...,,,,,,,,,,
3,2024-03-18 08:48:31,2024-03-18 08:56:43,IP Address,100,492,True,2024-03-18 08:56:44,R_4gjckliObAD0deV,anonymous,EN,...,-4,-2,Netherlands,,High school graduate (high school diploma or e...,Prefer to self-describe,Sexy,American Indian/Native American or Alaska Nati...,Prefer not to say,46
4,2024-03-18 09:04:24,2024-03-18 09:04:50,IP Address,100,25,True,2024-03-18 09:04:50,R_8hzk6eehMQyDmA1,anonymous,EN,...,,,,,,,,,,


In [15]:
# Display basic information about the DataFrame
print("Basic Information:")
df.info()

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   StartDate                   97 non-null     object
 1   EndDate                     97 non-null     object
 2   Status                      97 non-null     object
 3   Progress                    97 non-null     object
 4   Duration (in seconds)       97 non-null     object
 5   Finished                    97 non-null     object
 6   RecordedDate                97 non-null     object
 7   ResponseId                  97 non-null     object
 8   DistributionChannel         97 non-null     object
 9   UserLanguage                97 non-null     object
 10  Q_RecaptchaScore            97 non-null     object
 11  Q_RelevantIDDuplicate       5 non-null      object
 12  Q_RelevantIDDuplicateScore  97 non-null     object
 13  Q_RelevantIDFraudScore      97 no

In [16]:
# Show summary statistics for numeric columns
print("Summary Statistics for Numeric Columns:")
df.describe()

Summary Statistics for Numeric Columns:


Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,Immigrants_1,Helping needy_1,Country,State,Education,Gender,Gender_4_TEXT,Ethnicity,Income,Age
count,97,97,97,97,97,97,97,97,97,97,...,61,58,57,7,57,57,4,57,57,56
unique,97,97,3,20,88,4,97,97,3,3,...,12,10,11,6,7,7,4,12,9,25
top,Start Date,End Date,IP Address,100,23,True,Recorded Date,Response ID,anonymous,EN,...,-3,-5,Netherlands,California,Bachelor's degree (3 or 4-year),Male,How do you describe yourself? - Prefer to self...,White or Caucasian,"Less than $25,000",21
freq,1,1,95,67,4,67,1,1,95,95,...,12,12,22,2,22,26,1,41,13,8


In [17]:
# Check for missing values
print("Missing Values:")
missing_values = df.isnull().sum()
print(missing_values)

Missing Values:
StartDate                      0
EndDate                        0
Status                         0
Progress                       0
Duration (in seconds)          0
Finished                       0
RecordedDate                   0
ResponseId                     0
DistributionChannel            0
UserLanguage                   0
Q_RecaptchaScore               0
Q_RelevantIDDuplicate         92
Q_RelevantIDDuplicateScore     0
Q_RelevantIDFraudScore         0
Q_RelevantIDLastStartDate     92
Consent form                   0
Twitter use                    5
AI understanding              33
Application                   33
Experience                    32
Moderation                    32
Benefit to user               38
Fairness                      37
Trust algorithm mod.          37
Human moderation              35
Race discr.                   38
Sexual orient discr.          41
Gender discr.                 40
Religion discr.               39
Effective moderation       

In [18]:
# Convert the "Finished" column to string type
df['Finished'] = df['Finished'].astype(str)

# Drop rows where the "Finished" column has the string "False"
df = df[df['Finished'] != 'False']

# List of columns to check for unique values
columns_to_check = [
    'Q_RecaptchaScore',
    'Q_RelevantIDDuplicate',
    'Q_RelevantIDDuplicateScore',
    'Q_RelevantIDFraudScore',
    'Q_RelevantIDLastStartDate'
]

# Function to print unique values and their counts for specified columns
def print_unique_values_and_counts(df, columns):
    for column in columns:
        if column in df.columns:
            unique_values_counts = df[column].value_counts()
            print(f"Unique values and counts in '{column}' column:")
            print(unique_values_counts)
            print("\n")
        else:
            print(f"Column '{column}' does not exist in the DataFrame.")

# Execute the function to print unique values and counts
print_unique_values_and_counts(df, columns_to_check)

# Convert Q_RelevantIDDuplicateScore and Q_RelevantIDFraudScore to string to handle non-numeric values
df['Q_RelevantIDDuplicateScore'] = df['Q_RelevantIDDuplicateScore'].astype(str)

# Drop rows where 'Q_RelevantIDDuplicateScore' is not zero
df = df[df['Q_RelevantIDDuplicateScore'] == '0']

# Display unique values in the "Twitter use" column
unique_values = df['Twitter use'].unique()
print("Unique values in 'Twitter use' column:")
print(unique_values)

# Convert all values in the "Twitter use" column to string
df['Twitter use'] = df['Twitter use'].astype(str)

# Drop all rows that have "Never" as a value in the "Twitter use" column
df = df[df['Twitter use'] != 'Never']

Unique values and counts in 'Q_RecaptchaScore' column:
Q_RecaptchaScore
0.8999999761581421                 37
1                                  12
0.699999988079071                   8
0.800000011920929                   6
0.4000000059604645                  2
Q_RecaptchaScore                    1
{"ImportId":"Q_RecaptchaScore"}     1
0.6000000238418579                  1
0.20000000298023224                 1
Name: count, dtype: int64


Unique values and counts in 'Q_RelevantIDDuplicate' column:
Q_RelevantIDDuplicate
true                                    3
Q_RelevantIDDuplicate                   1
{"ImportId":"Q_RelevantIDDuplicate"}    1
Name: count, dtype: int64


Unique values and counts in 'Q_RelevantIDDuplicateScore' column:
Q_RelevantIDDuplicateScore
0                                            64
Q_RelevantIDDuplicateScore                    1
{"ImportId":"Q_RelevantIDDuplicateScore"}     1
98                                            1
100                                   

In [19]:
# List of columns to drop
columns_to_drop = [
    "Status", "DistributionChannel", "Q_RecaptchaScore",
    "Q_RelevantIDDuplicate", "Q_RelevantIDDuplicateScore",
    "Q_RelevantIDFraudScore", "Q_RelevantIDLastStartDate",
    "Consent form"
]

# Drop the columns
df = df.drop(columns=columns_to_drop, errors='ignore')

# Display the columns in the cleaned DataFrame
cleaned_columns = df.columns.tolist()

# Display the first few rows to understand the data
first_cleaned_rows = df.head()

cleaned_columns, first_cleaned_rows


(['StartDate',
  'EndDate',
  'Progress',
  'Duration (in seconds)',
  'Finished',
  'RecordedDate',
  'ResponseId',
  'UserLanguage',
  'Twitter use',
  'AI understanding',
  'Application',
  'Experience',
  'Moderation',
  'Benefit to user',
  'Fairness',
  'Trust algorithm mod.',
  'Human moderation',
  'Race discr.',
  'Sexual orient discr.',
  'Gender discr.',
  'Religion discr.',
  'Effective moderation',
  'AI acceleration',
  'AI + human',
  'Hate speech',
  'Misinformation',
  'Disinformation',
  'Political spectrum_1',
  'Minimum wage_1',
  'Schooling_1',
  'Immigrants_1',
  'Helping needy_1',
  'Country',
  'State',
  'Education',
  'Gender',
  'Gender_4_TEXT',
  'Ethnicity',
  'Income',
  'Age'],
               StartDate              EndDate Progress Duration (in seconds)  \
 6   2024-03-18 09:03:45  2024-03-18 09:10:58      100                   433   
 8   2024-03-18 12:46:01  2024-03-18 12:49:40      100                   219   
 9   2024-03-18 10:23:40  2024-03-18 13:16

In [20]:
# List of columns to process
columns_to_process = [
    "AI understanding", "Application", "Experience", "Moderation", 
    "Benefit to user", "Fairness", "Trust algorithm mod.", "Human moderation", 
    "Race discr.", "Sexual orient discr.", "Gender discr.", "Religion discr.", 
    "Effective moderation", "AI acceleration", "AI + human", "Hate speech", 
    "Misinformation", "Disinformation"
]

# Function to print value counts for each column in the list
def print_value_counts(df, columns):
    for column in columns:
        if column in df.columns:
            print(f"Value counts for '{column}':")
            print(df[column].value_counts(dropna=False))
            print("\n")
        else:
            print(f"Column '{column}' does not exist in the DataFrame.\n")

# Print value counts for the specified columns
print_value_counts(df, columns_to_process)


Value counts for 'AI understanding':
AI understanding
Agree                         16
Strongly Agree                12
Somewhat Agree                12
Neither agree nor disagree     3
Disagree                       2
Somewhat Disagree              2
Name: count, dtype: int64


Value counts for 'Application':
Application
Agree                         24
Somewhat Agree                10
Strongly Agree                 8
Somewhat Disagree              3
Strongly Disagree              1
Neither agree nor disagree     1
Name: count, dtype: int64


Value counts for 'Experience':
Experience
Agree                         11
Somewhat Agree                10
Strongly Agree                 7
Disagree                       7
Strongly Disagree              5
Somewhat Disagree              4
Neither agree nor disagree     3
Name: count, dtype: int64


Value counts for 'Moderation':
Moderation
Agree                         15
Somewhat Agree                13
Strongly Agree                 8
Somewhat

In [21]:
# List of columns to process
columns_to_process = [
    "AI understanding", "Application", "Experience", "Moderation", 
    "Benefit to user", "Fairness", "Trust algorithm mod.", "Human moderation", 
    "Race discr.", "Sexual orient discr.", "Gender discr.", "Religion discr.", 
    "Effective moderation", "AI acceleration", "AI + human", "Hate speech", 
    "Misinformation", "Disinformation"
]

# Valid responses
valid_responses = [
    "strongly disagree", "disagree", "somewhat disagree", 
    "neither agree nor disagree", "somewhat agree", "agree", "strongly agree"
]

# Likert-scale mapping
likert_scale_mapping = {
    "strongly disagree": 1,
    "disagree": 2,
    "somewhat disagree": 3,
    "neither agree nor disagree": 4,
    "somewhat agree": 5,
    "agree": 6,
    "strongly agree": 7  
}

# Convert all values to strings and then to lowercase
for column in columns_to_process:
    df[column] = df[column].astype(str).str.lower()

# Replace "i don't know" with NaN explicitly if needed
df[columns_to_process] = df[columns_to_process].replace("don't know", np.nan)

# Convert phrases to Likert-scale numbers, leaving NaNs as is
for column in columns_to_process:
    df[column] = df[column].map(likert_scale_mapping)

# Function to check if all values are valid, including NaN as valid
def check_valid_responses(df, columns, valid_responses):
    for column in columns:
        unique_values = df[column].unique()
        for value in unique_values:
            if pd.isna(value):
                continue  # Skip NaN values
            if value not in valid_responses:
                print(f"Invalid value '{value}' found in column '{column}'")

# Convert valid_responses to their numeric counterparts for validation
numeric_valid_responses = [likert_scale_mapping[response] for response in valid_responses]

# Check for invalid values
check_valid_responses(df, columns_to_process, numeric_valid_responses + [np.nan])


In [22]:
# List of columns to process
columns_to_process = [
    "AI understanding", "Application", "Experience", "Moderation", 
    "Benefit to user", "Fairness", "Trust algorithm mod.", "Human moderation", 
    "Race discr.", "Sexual orient discr.", "Gender discr.", "Religion discr.", 
    "Effective moderation", "AI acceleration", "AI + human", "Hate speech", 
    "Misinformation", "Disinformation"
]

# Function to print value counts for each column in the list
def print_value_counts(df, columns):
    for column in columns:
        if column in df.columns:
            print(f"Value counts for '{column}':")
            print(df[column].value_counts(dropna=False))
            print("\n")
        else:
            print(f"Column '{column}' does not exist in the DataFrame.\n")

# Print value counts for the specified columns
print_value_counts(df, columns_to_process)


Value counts for 'AI understanding':
AI understanding
6    16
7    12
5    12
4     3
2     2
3     2
Name: count, dtype: int64


Value counts for 'Application':
Application
6    24
5    10
7     8
3     3
1     1
4     1
Name: count, dtype: int64


Value counts for 'Experience':
Experience
6    11
5    10
7     7
2     7
1     5
3     4
4     3
Name: count, dtype: int64


Value counts for 'Moderation':
Moderation
6    15
5    13
7     8
3     6
4     3
1     1
2     1
Name: count, dtype: int64


Value counts for 'Benefit to user':
Benefit to user
5    15
3     8
6     7
4     6
2     6
7     3
1     2
Name: count, dtype: int64


Value counts for 'Fairness':
Fairness
2    11
1    10
4    10
5     7
3     7
6     2
Name: count, dtype: int64


Value counts for 'Trust algorithm mod.':
Trust algorithm mod.
2    13
5    11
1     9
3     7
6     5
4     2
Name: count, dtype: int64


Value counts for 'Human moderation':
Human moderation
5    17
6     9
3     7
4     7
2     5
7     2
Name: co

In [23]:
df['Disinformation'].value_counts()

Disinformation
1    13
2    11
5     8
3     6
6     5
4     4
Name: count, dtype: int64

In [24]:
# List of columns to process
politics_columns = [
    "Political spectrum_1", "Minimum wage_1", "Schooling_1", 
    "Immigrants_1", "Helping needy_1"
]

# Convert all values to strings
for column in politics_columns:
    df[column] = df[column].astype(str)

# Function to get unique values for each column
def get_unique_values(df, columns):
    for column in columns:
        unique_values = df[column].unique()
        print(f"Unique values in '{column}' column:")
        print(unique_values)
        print("\n")

# Get unique values for the specified columns
get_unique_values(df, politics_columns)

Unique values in 'Political spectrum_1' column:
['2' '1' '8' '3' '9' '0' '6' 'nan' '5' '7' '4']


Unique values in 'Minimum wage_1' column:
['-3' '-4' '-5' '-1' '3' '-2' '2' '0' '1' '5']


Unique values in 'Schooling_1' column:
['-5' '3' '-3' '-4' '0' '-2' '1']


Unique values in 'Immigrants_1' column:
['-3' '-5' '0' '-2' '-1' '-4' '5' '1' '2' '3']


Unique values in 'Helping needy_1' column:
['-4' '-5' '1' 'nan' '2' '0' '-2' '-1' '-3']




In [25]:
# Columns to measure familiarity with AI and its application in content moderation
familiarity_columns = ["AI understanding", "Application", "Experience", "Moderation"]

# Function to get unique values for each column
def get_unique_values(df, columns):
    for column in columns:
        unique_values = df[column].unique()
        print(f"Unique values in '{column}' column:")
        print(unique_values)
        print("\n")

# Get unique values for the specified columns
get_unique_values(df, familiarity_columns)



Unique values in 'AI understanding' column:
[4 7 5 6 2 3]


Unique values in 'Application' column:
[5 7 6 1 4 3]


Unique values in 'Experience' column:
[6 7 5 4 1 2 3]


Unique values in 'Moderation' column:
[3 6 5 1 2 4 7]




In [26]:
# List of columns to process
demographics_columns = [
    "Country", "State", "Education", "Gender", "Gender_4_TEXT", "Ethnicity", "Income", "Age"
]

# Convert all values to strings
for column in demographics_columns:
    df[column] = df[column].astype(str)

# Function to get unique values for each column
def get_unique_values(df, columns):
    for column in columns:
        unique_values = df[column].unique()
        print(f"Unique values in '{column}' column:")
        print(unique_values)
        print("\n")

# Get unique values for the specified columns
get_unique_values(df, demographics_columns)

Unique values in 'Country' column:
['Netherlands' 'United Kingdom of Great Britain and Northern Ireland'
 'Poland' 'Germany' 'Canada' 'United States of America' 'Sweden'
 'Pakistan']


Unique values in 'State' column:
['nan' 'California' 'New York' 'Nevada']


Unique values in 'Education' column:
['High school graduate (high school diploma or equivalent)'
 'Some tertiary education but no degree' "Bachelor's degree (3 or 4-year)"
 "Master's degree" 'Less than high school degree']


Unique values in 'Gender' column:
['Female' 'Male' 'Non-binary / third gender' 'Prefer not to say'
 'Prefer to self-describe']


Unique values in 'Gender_4_TEXT' column:
['nan' 'Hielke']


Unique values in 'Ethnicity' column:
['White or Caucasian' 'White or Caucasian,Other' 'Other'
 'White or Caucasian,Asian' 'White or Caucasian,Black or African American'
 'Prefer not to say' 'Asian,Other' 'Asian']


Unique values in 'Income' column:
['Prefer not to say' 'Less than $25,000' '$25,000-$49,999'
 '$150,000 or mor

In [27]:
df['AI understanding'].value_counts()

AI understanding
6    16
7    12
5    12
4     3
2     2
3     2
Name: count, dtype: int64

In [28]:
# Save the DataFrame to a CSV file
output_file_path = 'processed_survey_data.csv'
df.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

DataFrame saved to processed_survey_data.csv
