## Data Preprocessing

In [19]:
## load packages 
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# First step: set the working directory / root_path.
# Every input file in this notebook is read from this directory and every
# output file is written to it (paths are built with os.path.join(root_path, ...)).
# The location can be overridden without editing the notebook by setting the
# DATA_ROOT environment variable; when it is unset the original default 'D:/'
# is used, so existing setups keep working unchanged.
root_path = os.environ.get('DATA_ROOT', 'D:/')

# Load Question Dataset

In [3]:
# Load the question catalogue. The descriptive questions are the ones used later
# as the proxy for believing in superstition, so they get their own identifiers.
questions = (
    pd.read_csv(os.path.join(root_path, 'question_data.csv'), sep=';')  # columns are semicolon-separated
    .set_index('Unnamed: 0')  # first column holds the question number
)
questions.index.name = 'q_id'

# Map every original ID of a question tagged 'descriptive' to a new running ID: dq0, dq1, ...
is_descriptive = questions['Keywords'] == 'descriptive'
question_to_id = {
    original_id: f'dq{position}'
    for position, original_id in enumerate(questions.loc[is_descriptive].index)
}

# From here on `descriptive_questions` holds the NEW identifiers (dq...), not the original ones.
descriptive_questions = list(question_to_id.values())
questions = questions.rename(index=question_to_id)
display(questions.loc[descriptive_questions].head())

Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
dq0,Which word describes you better?,Carefree,Intense,,,49827,N,,descriptive
dq1,Have you ever seen a therapist?,Yes,No,,,9507,O,,descriptive
dq2,Which describes you better?,Warm-hearted,Cool-headed,,,21205,O,,descriptive
dq3,Are you a better conversationalist or listener?,Conversationalist,Listener,,,3174,O,,descriptive
dq4,Which word describes you better?,Private,Social,,,23765,O,,descriptive


Fill in the "Order" column for the descriptive questions: loop over them and build each entry as the list of that question's answer options.

In [4]:
# Fill the "Order" column for the descriptive questions with the list of their answer options.

# Individual cells are assigned Python lists below, which only works on an object-dtype
# column; a column parsed as all-NaN would be float64 and reject the assignment.
questions['Order'] = questions['Order'].astype(object)

option_columns = ['option_1', 'option_2', 'option_3', 'option_4']
for index, row in questions.loc[descriptive_questions].iterrows():
    # Skip rows that already hold a list (e.g. when this cell is re-run)
    # and rows that have no first answer option at all.
    if isinstance(row['Order'], list) or pd.isna(row['option_1']):
        continue
    # NOTE(review): any existing non-list "Order" value (e.g. a string read from the CSV)
    # is replaced here, not only missing ones - confirm this is intended.
    # Keep only the options that are actually present so that NaN never ends up in the list.
    questions.at[index, 'Order'] = [
        row[column] for column in option_columns if pd.notna(row[column])
    ]
display(questions.loc[descriptive_questions].head())

Unnamed: 0_level_0,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
dq0,Which word describes you better?,Carefree,Intense,,,49827,N,"[Carefree, Intense]",descriptive
dq1,Have you ever seen a therapist?,Yes,No,,,9507,O,"[Yes, No]",descriptive
dq2,Which describes you better?,Warm-hearted,Cool-headed,,,21205,O,"[Warm-hearted, Cool-headed]",descriptive
dq3,Are you a better conversationalist or listener?,Conversationalist,Listener,,,3174,O,"[Conversationalist, Listener]",descriptive
dq4,Which word describes you better?,Private,Social,,,23765,O,"[Private, Social]",descriptive


Save the modified questions dataset (with the updated descriptive questions) to a CSV file.

In [5]:
# Persist the prepared question table next to the raw data, using the same ';' separator as the input.
# NOTE(review): "Order" cells that hold Python lists are written as their text representation;
# confirm that whatever reads this file back parses them into lists again.
questions_prepared_path = os.path.join(root_path, 'questions_prepared.csv')
questions.to_csv(questions_prepared_path, sep=';')

# Answer Data

Load data with answers

In [5]:
# Read the answer table from the parquet file in root_path.
data = pd.read_parquet(os.path.join(root_path, 'parsed_data_public.parquet'))

# Original question IDs of the two target questions.
Reading_horoscope = 'q37764'
Astrology = 'q50483'

# Build the column mapping step by step; later entries win over earlier ones,
# so the two target questions always receive their readable names.
column_renames = {'Unnamed: 0': 'User_ID'}  # user id column
column_renames.update(question_to_id)  # descriptive questions -> dq0, dq1, ...
column_renames.update({Reading_horoscope: 'Reading_horoscope', Astrology: 'Astrology'})
data = data.rename(columns=column_renames)

print('shape', data.shape)
display(data.head())

shape (68371, 2626)


Unnamed: 0,User_ID,q2,q11,q12,q13,q14,q16,q17,q18,q20,...,q86615,q86699,dq822,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
0,1,,Horrified,,,,,No,,,...,,,,0.76308,Hetero_female,Woman,White,Woman,0.0,4
1,2,,,,,,,,,,...,,,,,Hetero_male,Man,,Man,1.0,0
2,3,,,,No,No,,No,,,...,,,,0.661309,Hetero_female,Woman,,Woman,0.0,7
3,4,,,,,,,,,,...,,,,,Hetero_female,Woman,White,Woman,0.0,0
4,5,,,,,,,,,,...,,,,0.875424,Bisexual_female,Woman,,Woman,0.0,3


In [6]:
## Columns still starting with 'q' are the questions that were not renamed above,
## i.e. everything except the descriptive (dq...) and the two horoscope/astrology questions
not_descriptive = {name for name in data.columns if name.startswith('q')}
## Remove those question columns from the data set
data = data.drop(columns=not_descriptive)
## Keep only users who answered both the horoscope and the astrology question
answered_both = data['Reading_horoscope'].notna() & data['Astrology'].notna()
data = data[answered_both]

In [7]:
# Shape of the descriptive-question columns only, then a preview of the full filtered frame
print(data.loc[:, list(question_to_id.values())].shape)
data.head()

(16659, 829)


Unnamed: 0,User_ID,dq0,dq1,dq2,dq3,dq4,dq5,dq6,dq7,dq828,...,dq809,dq810,dq822,CA,gender_orientation,gender,race,gender2,gender2_num,CA_items
25,26,Carefree,,,,,,,,No,...,,,,0.730555,Hetero_male,Man,Mixed,Man,1.0,5
33,34,Intense,,Warm-hearted,,Social,,,,Yes,...,,,,0.588296,,,,,,11
35,36,Carefree,,,,Social,,,,,...,,,,1.322086,Hetero_female,Woman,White,Woman,0.0,6
48,49,Intense,,Warm-hearted,,Private,,,Fast,No,...,,,,0.289208,Hetero_female,Woman,White,Woman,0.0,3
107,108,Intense,,,,,,,,No,...,,,,-0.43748,,,,,,2


# Descriptive Statistics

In [12]:
# General information about the individuals in the data set

# Gender: relative frequency of each category, expressed in percent
gender_distribution = data['gender'].value_counts(normalize=True).mul(100)
print("Gender Distribution (in %):", gender_distribution, sep="\n")

# Age: mean and standard deviation, reported with two decimals
age_mean, age_std = data['d_age'].mean(), data['d_age'].std()
print(
    "\nAge Statistics:",
    f"Mean: {age_mean:.2f}",
    f"Standard Deviation: {age_std:.2f}",
    sep="\n",
)

# Countries: number of distinct values in the country column
unique_countries = data['d_country'].nunique()
print("\nNumber of Unique Countries:", unique_countries, sep="\n")

Gender Distribution (in %):
gender
Man      71.516268
Woman    27.894414
Other     0.589319
Name: proportion, dtype: float64

Age Statistics:
Mean: 34.26
Standard Deviation: 7.75

Number of Unique Countries:
159


# Train and Test Set

In [8]:
# Shuffled 90/10 split of the filtered data; the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(data, test_size=0.1, shuffle=True, random_state=42)

# Write both parts as parquet files into root_path.
for split_file, split_frame in (('train.parquet', df_train), ('test.parquet', df_test)):
    split_frame.to_parquet(os.path.join(root_path, split_file))

In [10]:
# Sanity check: confirm that the 'Reading_horoscope' column exists in the DataFrame
if 'Reading_horoscope' not in data.columns:
    print("'Reading_horoscope' variable is not in the data")
else:
    # Reduce the frame to the user id and the horoscope answer
    data_filtered = data.loc[:, ['User_ID', 'Reading_horoscope']]

    # Show the dimensions and the first rows of the reduced frame
    print('shape', data_filtered.shape)
    display(data_filtered.head())


shape (16659, 2)


Unnamed: 0,User_ID,Reading_horoscope
25,26,Never.
33,34,Occasionally.
35,36,Never.
48,49,Never.
107,108,Occasionally.
