## Reading in and Processing Data

In [None]:
import pandas as pd

# Load the LIWC spreadsheet into a DataFrame.
df = pd.read_csv('LIWCDATA-4.10.24.csv')

# Split rows by the parent indicator column: 0 selects fathers, 1 selects mothers.
father_df = df.loc[df['Father (0) or Mother(1)'] == 0]
mother_df = df.loc[df['Father (0) or Mother(1)'] == 1]

# Every column from position 8 onward gets a mean and a standard deviation.
columns = list(father_df.columns[8:])

father_means, mother_means = [], []
father_sd, mother_sd = [], []
for col in columns:
    father_means.append(father_df[col].mean())
    mother_means.append(mother_df[col].mean())
    father_sd.append(father_df[col].std())
    mother_sd.append(mother_df[col].std())

# One row per summarised column; key order fixes the column order of the table.
mean_values = {
    'Columns': columns,
    'Father Mean': father_means,
    'Mother Mean': mother_means,
    'Father SD': father_sd,
    'Mother SD': mother_sd,
}

mean_sd_df = pd.DataFrame(mean_values)

print(mean_sd_df)

In [None]:
# Sample size of each group = number of rows in its filtered DataFrame.
print('Number of father entries:', father_df.shape[0])
print('Number of mother entries:', mother_df.shape[0])

In [None]:
# Keep the category names as a plain list so the t-test loop can print the
# name that belongs to each row.
columns_list = list(mean_sd_df['Columns'])

## T-Test

In [None]:
# Welch T-test
import numpy as np
from scipy.stats import t

# Names of the categories whose t-test comes out statistically significant;
# filled while iterating over the summary rows.
stat_diff_list = []

# Drop the 'Columns' label column as it's not needed for the test.
# errors='ignore' keeps this cell safe to re-run: after the first run the
# column is already gone and a plain drop would raise KeyError.
mean_sd_df.drop(columns='Columns', inplace=True, errors='ignore')

def welch_t_test(mean1, sd1, n1, mean2, sd2, n2):
    """Run Welch's unequal-variance t-test from summary statistics.

    Args:
        mean1: Sample mean of group 1.
        sd1: Sample standard deviation of group 1.
        n1: Number of observations in group 1 (needs to be at least 2).
        mean2: Sample mean of group 2.
        sd2: Sample standard deviation of group 2.
        n2: Number of observations in group 2 (needs to be at least 2).

    Returns:
        Tuple ``(t_statistic, p_value, cohens_d)`` where ``p_value`` is
        two-sided and ``cohens_d`` uses the pooled standard deviation.
    """
    # Squared standard error of each group's mean.
    sq_se1 = sd1 ** 2 / n1
    sq_se2 = sd2 ** 2 / n2

    # standard error of the difference between means
    se_diff = np.sqrt(sq_se1 + sq_se2)

    # t-statistic calc
    t_statistic = (mean1 - mean2) / se_diff

    # Welch-Satterthwaite degrees of freedom. The pooled value n1 + n2 - 2
    # belongs to Student's equal-variance test and overstates the degrees of
    # freedom when the group variances or sizes differ.
    dof = (sq_se1 + sq_se2) ** 2 / (
        sq_se1 ** 2 / (n1 - 1) + sq_se2 ** 2 / (n2 - 1)
    )

    # Two-sided p-value; the survival function avoids the precision loss of
    # computing 1 - cdf for large |t|.
    p_value = 2 * t.sf(np.abs(t_statistic), dof)

    # Cohen's d calculation (pooled standard deviation as the scale)
    pooled_sd = np.sqrt(((n1 - 1) * sd1 ** 2 + (n2 - 1) * sd2 ** 2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_sd

    return t_statistic, p_value, cohens_d

# Significance threshold; constant for every category, so set once up front.
alpha = 0.05

# iterate over each row (each category from LIWC)
for index, row in mean_sd_df.iterrows():
    col_name = columns_list[index]  # name of the row as the column name
    father_mean, father_sd = row['Father Mean'], row['Father SD']
    mother_mean, mother_sd = row['Mother Mean'], row['Mother SD']

    # Sample sizes come from the data instead of hard-coded constants, so the
    # test stays correct if the spreadsheet changes. count() tallies the
    # non-missing values, which are the values mean() and std() are based on.
    father_n = father_df[col_name].count()
    mother_n = mother_df[col_name].count()

    # perform t-test
    t_statistic, p_value, cohens_d = welch_t_test(
        father_mean, father_sd, father_n, mother_mean, mother_sd, mother_n
    )

    # print t_statistic and p_value for each LIWC category t test
    print('For', col_name)
    print('t-statistic:', t_statistic)
    print('p-value:', p_value)
    print("Cohen's d:", cohens_d)

    # t-test evaluation
    if p_value < alpha:
        print('Samples are different (reject null hypothesis)')
        stat_diff_list.append(col_name)
    else:
        print('Samples are not different (fail to reject null hypothesis)')
    print()

## Word Clouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords

# Read the LIWC spreadsheet again for the text-based analyses.
df = pd.read_csv('LIWCDATA-4.10.24.csv')

# Partition rows by the parent indicator: 0 -> fathers, 1 -> mothers.
father_df = df[df['Father (0) or Mother(1)'].eq(0)]
mother_df = df[df['Father (0) or Mother(1)'].eq(1)]

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


# preprocessing function
def preprocess_text(text):
    """Clean one text entry for word-frequency analysis.

    Keeps only ASCII letters and whitespace, lowercases the result and removes
    English stopwords.

    Args:
        text: The raw text. Anything that is not a string (for example the
            NaN pandas uses for an empty cell) is treated as empty text.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing is
        left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # remove everything except ASCII letters and whitespace (digits and
    # punctuation are dropped), then convert to lowercase
    # NOTE(review): apostrophes are removed here, so a contraction such as
    # "don't" is compared against the stopword list as "dont" -- confirm this
    # is the intended filtering.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # the stopword set is cached; rebuilding it for every row is wasted work
    stop_words = _english_stop_words()

    # split on whitespace, filter out stopwords, join back into a string
    return ' '.join(word for word in text.split() if word not in stop_words)

# generate word cloud function
def generate_wordcloud(text):
    """Build a word cloud from ``text``.

    The cloud is 800x800 with a white background and a minimum font size
    of 10. Returns whatever ``WordCloud(...).generate(text)`` returns.
    """
    cloud_settings = {
        'width': 800,
        'height': 800,
        'background_color': 'white',
        'min_font_size': 10,
    }
    return WordCloud(**cloud_settings).generate(text)

# One cleaned, space-joined corpus per parent group.
all_mother_text = ' '.join(map(preprocess_text, mother_df['Text']))
all_father_text = ' '.join(map(preprocess_text, father_df['Text']))

# Build a word cloud from each corpus.
mother_wordcloud = generate_wordcloud(all_mother_text)
father_wordcloud = generate_wordcloud(all_father_text)

# Draw the two clouds side by side: mothers on the left, fathers on the right.
plt.figure(figsize=(10, 8))
panels = (
    (mother_wordcloud, 'Word Cloud for Mothers'),
    (father_wordcloud, 'Word Cloud for Fathers'),
)
for position, (cloud, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
plt.show()

## Top 10 NGrams for Each Group

In [None]:
from collections import Counter
from nltk import word_tokenize
from nltk.util import ngrams

# One cleaned, space-joined corpus per parent group.
all_mother_text = ' '.join(map(preprocess_text, mother_df['Text']))
all_father_text = ' '.join(map(preprocess_text, father_df['Text']))

# English stopwords used to filter the tokens.
stop_words = set(stopwords.words('english'))

# Tokenize each corpus and keep only the tokens that are not stopwords.
mother_tokens = [
    token for token in word_tokenize(all_mother_text)
    if token.lower() not in stop_words
]
father_tokens = [
    token for token in word_tokenize(all_father_text)
    if token.lower() not in stop_words
]

# Frequency tables for single tokens and for adjacent token pairs.
mother_unigrams, father_unigrams = Counter(mother_tokens), Counter(father_tokens)
mother_bigrams = Counter(ngrams(mother_tokens, 2))
father_bigrams = Counter(ngrams(father_tokens, 2))

# Ten most frequent entries of each table.
top_unigrams_M, top_unigrams_F = (
    mother_unigrams.most_common(10),
    father_unigrams.most_common(10),
)
top_bigrams_M, top_bigrams_F = (
    mother_bigrams.most_common(10),
    father_bigrams.most_common(10),
)

# (heading, ranked entries, how to render one n-gram) in print order.
report_sections = (
    ("Top 10 most frequent unigrams in Mothers:", top_unigrams_M, str),
    ("\nTop 10 most frequent bigrams in Mothers:", top_bigrams_M, ' '.join),
    ("\nTop 10 most frequent unigrams in Fathers:", top_unigrams_F, str),
    ("\nTop 10 most frequent bigrams in Fathers:", top_bigrams_F, ' '.join),
)
for heading, ranked, render in report_sections:
    print(heading)
    for gram, count in ranked:
        print(f"{render(gram)}: {count}")