In [136]:
# Load the required libraries
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
import numpy as np
from spellchecker import SpellChecker

nltk.download('averaged_perceptron_tagger')


# Ensure that necessary NLTK resources are available
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [134]:
# Adjust display options
pd.set_option('display.max_columns', None)  # None means no limit

In [141]:
# Load the CSV files into Pandas DataFrames
hallucinations_df = pd.read_csv('hallucinations.csv')
lancaster_df = pd.read_csv('lancaster.csv')

# Rename the specified variable in the hallucinations dataset
hallucinations_df.rename(columns={'Please describe as much as you can remember about what you saw in the Ganzflicker:': 'hallucination_description'}, inplace=True)
hallucinations_df.rename(columns={'What is your gender?': 'gender'}, inplace=True)
hallucinations_df.rename(columns={'How old are you?': 'age'}, inplace=True)
hallucinations_df.rename(columns={'How would you describe your VISUAL imagery vividness on a scale from 0-10?': 'visual_vividness'}, inplace=True)
hallucinations_df.rename(columns={'How long did you view the Ganzflicker?': 'view_duration'}, inplace=True)
hallucinations_df.rename(columns={'Did you view the Ganzflicker with the lights in the room turned off?': 'lights'}, inplace=True)
hallucinations_df.rename(columns={'Did you view the Ganzflicker on a computer or mobile phone?': 'computer'}, inplace=True)
hallucinations_df.rename(columns={'How did you find the experience emotionally?': 'emotion'}, inplace=True)
hallucinations_df.rename(columns={'Did you see anything in the Ganzflicker? If not, confirm by answering "no" and then skip to the next section.': 'hallucination_categorical'}, inplace=True)
hallucinations_df.rename(columns={'About how long did it take before images started to emerge?': 'emerge_time'}, inplace=True)
hallucinations_df.rename(columns={'About how frequently did you experience images?': 'hallucination_frequency'}, inplace=True)
hallucinations_df.rename(columns={'If you saw images, how intense were they?': 'hallucination_intensity'}, inplace=True)
hallucinations_df.rename(columns={'If you saw images, how long did a single image last?': 'hallucination_duration'}, inplace=True)
hallucinations_df.rename(columns={'How did your normal state of consciousness change during the experience?': 'consciousness_change'}, inplace=True)
hallucinations_df.rename(columns={'If you felt a change in your state of consciousness, please describe how you felt in more detail:': 'consciousness_description'}, inplace=True)
hallucinations_df.rename(columns={'If you saw images, where did they tend to appear on the screen?': 'hallucination_location'}, inplace=True)


In [142]:
# Preprocess lancaster_df to lowercase the 'Word' column
lancaster_df['Word'] = lancaster_df['Word'].str.lower()

# Set 'Word' as the index for easier lookup
lancaster_df.set_index('Word', inplace=True)

# Initialize the spell checker
spell = SpellChecker()

def autocorrect_words(words):
    # Use the spell checker to correct the words
    corrected_words = [spell.correction(word) for word in words]
    return corrected_words

# Initialize the NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

# Modify the nltk_tag_to_wordnet_tag function to default to NOUN
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return wordnet.NOUN  # Default to noun if the tag is not recognized

# Modify the lemmatize_text function to ensure words are lemmatized to their singular form
def lemmatize_text(text):
    nltk_tagged = pos_tag(word_tokenize(text.lower()))
    wordnet_tagged = [(word, nltk_tag_to_wordnet_tag(tag)) for word, tag in nltk_tagged]
    lemmatized_words = [lemmatizer.lemmatize(word, tag) if tag is not None else word for word, tag in wordnet_tagged]
    return lemmatized_words

# Define the categories to calculate averages for
categories = ['Auditory.mean', 'Gustatory.mean', 'Haptic.mean', 'Interoceptive.mean',
              'Olfactory.mean', 'Visual.mean', 'Foot_leg.mean', 'Hand_arm.mean',
              'Head.mean', 'Mouth.mean', 'Torso.mean', 'Max_strength.perceptual',
              'Max_strength.action', 'Max_strength.sensorimotor']

def calculate_means_and_missing(description, lancaster_df, categories):
    # Tokenize the description
    words = word_tokenize(description.lower())
    
    # Auto-correct the words
    corrected_words = autocorrect_words(words)
    
    # Filter out None values returned by spell checker
    corrected_words = [word for word in corrected_words if word is not None]
    
    # Lemmatize the corrected words
    nltk_tagged = pos_tag(corrected_words)
    wordnet_tagged = [(word, nltk_tag_to_wordnet_tag(tag)) for word, tag in nltk_tagged]
    lemmatized_words = [lemmatizer.lemmatize(word, tag) for word, tag in wordnet_tagged if word is not None]
    
    # Initialize sums, counts, and missing word count
    category_sums = {category: 0 for category in categories}
    category_counts = {category: 0 for category in categories}
    missing_words_count = 0
    
    # Count the values and track missing words
    for word in lemmatized_words:
        if word in lancaster_df.index:
            for category in categories:
                value = lancaster_df.at[word, category]
                if pd.notnull(value):
                    category_sums[category] += value
                    category_counts[category] += 1
        else:
            missing_words_count += 1
    
    # Calculate the average for each category
    category_means = {category: (category_sums[category] / category_counts[category] if category_counts[category] > 0 else 0) for category in categories}
    
    # Calculate the percentage of missing words
    total_words_count = len(lemmatized_words)
    percent_missing = (missing_words_count / total_words_count * 100) if total_words_count > 0 else 0
    
    return category_means, percent_missing

# Note: Ensure that `autocorrect_words`, `lemmatize_text`, and other functions/variables are defined as per previous instructions.


# Apply the function to each row in hallucinations_df['hallucination_description']
results = hallucinations_df['hallucination_description'].apply(
    lambda desc: calculate_means_and_missing(desc, lancaster_df, categories)
)

# Split the results into separate columns
hallucinations_df['category_means'] = results.apply(lambda x: x[0])
hallucinations_df['percent_missing'] = results.apply(lambda x: x[1])

# Separate the category means into individual columns
for category in categories:
    hallucinations_df[category] = hallucinations_df['category_means'].apply(lambda means: means[category])

# Drop the temporary 'category_means' column if it's no longer needed
hallucinations_df.drop('category_means', axis=1, inplace=True)

# Check the result
print(hallucinations_df[['percent_missing'] + categories].head())


   percent_missing  Auditory.mean  Gustatory.mean  Haptic.mean  \
0        33.333333       0.751553        0.127010     1.528874   
1         0.000000       0.866461        0.067344     1.278444   
2         0.000000       1.140625        0.326287     1.590074   
3        20.000000       1.285728        0.237833     0.665874   
4        14.285714       1.615596        0.204281     1.041903   

   Interoceptive.mean  Olfactory.mean  Visual.mean  Foot_leg.mean  \
0            0.718110        0.149123     3.618695       0.487719   
1            0.331334        0.057423     3.793672       0.571560   
2            0.565257        0.458640     4.257353       1.053571   
3            0.624078        0.290684     2.900756       0.673752   
4            0.939920        0.233077     3.144447       0.938628   

   Hand_arm.mean  Head.mean  Mouth.mean  Torso.mean  Max_strength.perceptual  \
0       1.161477   2.528436    0.651023    0.690789                 3.618695   
1       1.631495   2.338191 

In [143]:
hallucinations_df

Unnamed: 0,id,gender,age,visual_vividness,view_duration,lights,computer,emotion,hallucination_categorical,hallucination_description,emerge_time,hallucination_frequency,hallucination_intensity,hallucination_duration,consciousness_change,consciousness_description,hallucination_location,percent_missing,Auditory.mean,Gustatory.mean,Haptic.mean,Interoceptive.mean,Olfactory.mean,Visual.mean,Foot_leg.mean,Hand_arm.mean,Head.mean,Mouth.mean,Torso.mean,Max_strength.perceptual,Max_strength.action,Max_strength.sensorimotor
0,1,Female,21,3,Less than 10min.,Yes,Computer,"Unpleasant, wanted it to stop","Simple shapes or patterns (e.g., ball of light...","alien bust , skull , butterfly",A few seconds,Constantly,"Weak, faint, or insubstantial, Clear, but not ...","1-2 seconds, Constant, morphing from one image...","As if in a dreamlike state, Lost a sense of space",feel be sucked towards screen center,In the center,33.333333,0.751553,0.127010,1.528874,0.718110,0.149123,3.618695,0.487719,1.161477,2.528436,0.651023,0.690789,3.618695,2.681213,3.810362
1,2,Male,,3,10min.,Yes,Computer,"Somewhat enjoyable, but easy to stop","Simple shapes or patterns (e.g., ball of light...",blach rectangular shapes flashing on the screen,1-2 minutes,Frequently,"Clear, but not vivid",Brief moment or flash,Became relaxed or sleepy,,All over the screen,0.000000,0.866461,0.067344,1.278444,0.331334,0.057423,3.793672,0.571560,1.631495,2.338191,0.638268,0.646497,3.809545,2.438441,3.809545
2,3,Male,26,8,10min.,No,Computer,"Somewhat enjoyable, but easy to stop","Complex objects (e.g., animals, faces, buildings)",corridor with bright windows,1-2 minutes,Frequently,"Clear, vivid, and/or bright","Constant, morphing from one image to the next","As if in a dreamlike state, Felt transported t...",,In the center,0.000000,1.140625,0.326287,1.590074,0.565257,0.458640,4.257353,1.053571,1.402381,2.848810,0.530952,0.648214,4.257353,3.003571,4.257353
3,4,Female,42,0,10min.,Yes,Computer,"Somewhat unpleasant, but didn't mind going on","Distortions (e.g., screen warping or rotating)...","different vague patterns, pulsing, the image ...",1-2 minutes,"I saw patterns consistently, but no images",,,Became relaxed or sleepy,I felt sleepy and vaguely nauseous,,20.000000,1.285728,0.237833,0.665874,0.624078,0.290684,2.900756,0.673752,1.059774,2.198787,0.958165,0.656462,2.993348,2.284981,3.085437
4,5,Male,,8,10min.,Yes,Computer,"Somewhat enjoyable, but easy to stop","Simple shapes or patterns (e.g., ball of light...","geometric patterns, as seen in 2D FFT output g...",1-2 minutes,Frequently,Clear and moderately vivid,1-2 seconds,"No change, fully alert in the here and now",,In the center,14.285714,1.615596,0.204281,1.041903,0.939920,0.233077,3.144447,0.938628,1.524497,2.565231,1.243182,0.889192,3.261650,2.749302,3.380846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4360,4361,Female,31,6,10min.,Yes,Computer,"Somewhat unpleasant, but didn't mind going on","Simple shapes or patterns (e.g., ball of light...",yellow orbs or streaks of light that were purp...,A few seconds,Constantly,"Clear, but not vivid","Constant, morphing from one image to the next",Became relaxed or sleepy,,In the center,3.921569,1.086061,0.281178,1.007474,0.700710,0.303501,2.934595,0.660595,1.098057,2.229769,0.789637,0.552142,3.052683,2.344019,3.223252
4361,4362,Male,30,8,Didn't time it,Yes,Computer,"Somewhat unpleasant, but didn't mind going on","Simple shapes or patterns (e.g., ball of light...",Yellows and Greens at the peripheral edges. Mo...,They began to emerge immediately,Constantly,"Very vivid, almost real, or popping out of the...","Constant, morphing from one image to the next",Felt transported to another place,A more meditative arena.,All over the screen,12.359551,1.546978,0.404679,1.059082,1.076898,0.444799,2.815290,0.840831,1.163593,2.411852,1.173889,0.825247,3.172508,2.562897,3.305669
4362,4363,Female,34,10,10min.,Yes,Mobile phone,"Somewhat unpleasant, but didn't mind going on","Simple shapes or patterns (e.g., ball of light...",You know how when they record music they have ...,A few seconds,Constantly,"Very vivid, almost real, or popping out of the...","Constant, morphing from one image to the next",I became tense and felt ill. Next time use a l...,I felt sick uncomfortable and agitated,All over the screen,9.278351,1.590610,0.407626,0.974032,1.121562,0.456395,2.525004,0.793810,1.141065,2.218346,1.116307,0.713135,2.906825,2.306814,3.038455
4363,4364,Female,63,8,Didn't time it,Yes,Mobile phone,"Somewhat enjoyable, but easy to stop","Simple shapes or patterns (e.g., ball of light...",Zebra oval ball spots,1-2 minutes,Frequently,Clear and moderately vivid,1-2 seconds,"No change, fully alert in the here and now",,"In the center, In the periphery",0.000000,0.789986,0.049627,1.840920,0.150047,0.375794,4.137232,0.845238,1.994048,2.546627,0.606151,0.412698,4.137232,3.046627,4.157540
