# Use of Words across Age Span in the United States

## Environment Setting

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from collections import Counter
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Goal of the project: 
## 1. remove all the stopping words and normalize the age frame
## 2. Focus on individuals from the US and divide genders into males and females
## 3. Merge the dataset between demography and cleaned_hm. 
## 4. Calculate the percentage of frequency on use of sentiment words based on males and females.
## 5. Extract word values based on sentiment words by percentage of frequency and set target words.
## 6. Line-chart visualization on frequency of words value by age between male and female. 
## 7. Insights from some target words according to certain ages

## Data Manipulation for age normalization and removal of stop words

### 1. Text Simple Preprocessing

In [2]:
def simple_preprocess(text):
    """
    Perform simple preprocessing on the given text.
    - Convert text to lowercase.
    - Remove non-alphabetic characters, keeping only letters and spaces.
    - Split text into individual words.
    
    Parameters:
    - text (str): The text to be preprocessed.
    
    Returns:
    - list: A list of preprocessed words from the text.
    """
    text = text.lower()  # Convert text into lowercase
    text = ''.join(char for char in text if char.isalpha() or char.isspace())  # Remove non-alphabetic characters
    words = text.split()  # Split text into words
    return words

### 2. Removal of Stop Words

In [3]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('vader_lexicon')

def process_data(data_path, age_column, text_column, gender_prefix):
    """
    Process data by performing text preprocessing, removing stop words, sentiment analysis setup, and age normalization.
    
    Parameters:
    - data_path (str): Path to the CSV file.
    - age_column (str): Name of the column for age data.
    - text_column (str): Name of the column for text data.
    - gender_prefix (str): Prefix for gender-specific processing.
    
    Returns:
    - Tuple: Processed DataFrame, most common filtered words, sentiment word frequency by age.
    """
    data_df = pd.read_csv(data_path)
    data_df[f'{gender_prefix}_processed_text'] = data_df[text_column].apply(simple_preprocess)

    # Remove stop words
    all_words = [word for text in data_df[f'{gender_prefix}_processed_text'] for word in text]
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [word for word in all_words if word not in english_stopwords]
    filtered_word_freq = Counter(filtered_words)
    most_common_filtered_words = filtered_word_freq.most_common(20)

    # Sentiment analysis setup
    sia = SentimentIntensityAnalyzer()
    sentiment_words_from_filtered = [word for word in filtered_words if word in sia.lexicon]
    unique_sentiment_words = list(set(sentiment_words_from_filtered))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacksonzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jacksonzhao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### 3. Age Normalization and Sentiment Word frequency Combination

In [4]:
def normalize_age(age):
    """
    Normalize age values by converting them to integers, or return None for invalid inputs.
    
    Parameters:
    - age (str): Age value to be normalized.
    
    Returns:
    - int or None: Normalized age as an integer, or None for invalid inputs.
    """
    try:
        return int(float(age))
    except (ValueError, TypeError):
        return None

# Continue within the process_data function for age normalization application
    data_df[f'normalized_age_{gender_prefix}'] = data_df[age_column].apply(normalize_age)
    valid_age_data_df = data_df.dropna(subset=[f'normalized_age_{gender_prefix}'])

# Group texts by normalized age
    grouped_texts = valid_age_data_df.groupby(f'normalized_age_{gender_prefix}')[f'{gender_prefix}_processed_text']
    age_grouped_text_valid = grouped_texts.apply(lambda texts: ' '.join(' '.join(text) for text in texts))

    # Word frequency analysis by valid age groups
    word_freq_by_valid_age = {}
    for age, text in age_grouped_text_valid.items():
        words = text.split()
        filtered_words = [word for word in words if word not in english_stopwords]
        word_freq = Counter(filtered_words)
        word_freq_by_valid_age[age] = word_freq

    # Sentiment word frequency analysis by age
    sentiment_word_freq_by_valid_age = pd.DataFrame({
        word: [word_freq_by_valid_age[age][word] for age in word_freq_by_valid_age] for word in unique_sentiment_words
    }, index=word_freq_by_valid_age.keys())

    return data_df, most_common_filtered_words, sentiment_word_freq_by_valid_age

## - Manipulate data and separated by gender

### 1. Data Loading and Merging

In [5]:
# Define the path for input and output data
input_path = '../data/'
output_path = '../output/'

# Load demographic and cleaned happiness moments (hm) data
demographic_df = pd.read_csv(f'{input_path}demographic.csv')
cleaned_hm_df = pd.read_csv(f'{input_path}cleaned_hm.csv')

# Merge the datasets on 'wid' (writer ID) and filter for US participants
merged_data = pd.merge(cleaned_hm_df, demographic_df, on='wid', how='inner')
us_data = merged_data[merged_data['country'] == 'USA']

### 2. Gender-specific Data Segregation and Saving

In [6]:
for gender in ['m', 'f']:
    gender_data = us_data[us_data['gender'] == gender]
    gender_data.to_csv(f'{output_path}{gender}_data.csv', index=False)

### 3. Sentiment Word Frequency Analysis

In [7]:
# Process the data for male and female participants
processed_data = {}
for gender_prefix in ['male', 'female']:
    file_path = f'{output_path}{gender_prefix[0]}_data.csv'
    processed_data[gender_prefix] = process_data(file_path, 'age', 'cleaned_hm', gender_prefix)

# Calculate sentiment word frequency percentages for each gender
sentiment_word_freq_percentages = {}
for gender_prefix, (data_df, most_common, sentiment_word_freq_by_valid_age) in processed_data.items():
    total_words_per_age_group = sentiment_word_freq_by_valid_age.sum(axis=1)
    sentiment_word_freq_percentages[gender_prefix] = sentiment_word_freq_by_valid_age.div(total_words_per_age_group, axis=0) * 100

# Combine the lists of words from both genders into a single set to eliminate duplicates
words = list(set(
    sentiment_word_freq_percentages['male'].columns.to_list() +
    sentiment_word_freq_percentages['female'].columns.to_list()
))

TypeError: cannot unpack non-iterable NoneType object

In [None]:
# Step 1: Rename columns except for 'age'
sentiment_word_freq_percentages_male.columns = [f'{col}_male' if col != 'age' else col for col in sentiment_word_freq_percentages_male.columns]
sentiment_word_freq_percentages_female.columns = [f'{col}_female' if col != 'age' else col for col in sentiment_word_freq_percentages_female.columns]

# Step 2: Set 'age' as index for both DataFrames
sentiment_word_freq_percentages_male.index.name = 'Age'
sentiment_word_freq_percentages_female.index.name = 'Age'

# Step 3: Perform an outer merge on the index (age)
merged_df = pd.merge(sentiment_word_freq_percentages_male, sentiment_word_freq_percentages_female, left_index=True, right_index=True, how='outer')

In [None]:
# Step 1: Reset index to convert 'age' into a column
merged_df.reset_index(inplace=True)
# Show the modified DataFrame
merged_df['Age'] = merged_df['Age'].astype(int)
merged_df

In [None]:
def plot_word_frequency(word, merged_df, output_directory='../figs/word_frequency_compare/'):
    """
    Plots word frequency comparison by age and gender.

    Parameters:
    - word (str): The word to plot frequencies for.
    - merged_df (DataFrame): The DataFrame containing word frequencies.
    - output_directory (str): Directory path to save the output plots.
    """

    # Ensure the word columns exist in the DataFrame; if not, initialize them to 0
    for gender in ['male', 'female']:
        column_name = f'{word}_{gender}'
        if column_name not in merged_df.columns:
            merged_df[column_name] = 0

    # Prepare data for plotting, filling NaN values with 0
    plot_data = merged_df[['Age', f'{word}_male', f'{word}_female']].fillna(0)

    # Set figure size for better visibility
    plt.figure(figsize=(14, 6))  # Wider chart for clarity

    # Positions of bars on the x-axis and bar width
    bar_width = 0.35
    r1 = range(len(plot_data))
    r2 = [x + bar_width for x in r1]

    # Making the plot
    plt.bar(r1, plot_data[f'{word}_male'], color='blue', width=bar_width, edgecolor='grey', label='Male')
    plt.bar(r2, plot_data[f'{word}_female'], color='red', width=bar_width, edgecolor='grey', label='Female')

    # Add labels, title, and customize x-axis ticks
    plt.xlabel('Age', fontweight='bold')
    plt.xticks([r + bar_width / 2 for r in r1], plot_data['Age'].astype(str), rotation='vertical')
    plt.ylabel(f'Frequency of "{word.capitalize()}"', fontweight='bold')
    plt.title(f'Comparison of "{word.capitalize()}" Frequency by Age and Gender', fontweight='bold')

    plt.legend()
    plt.tight_layout()  # Adjust layout

    # Check and create output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Save the plot
    plt.savefig(os.path.join(output_directory, f'{word}.png'))
    plt.close()  # Close the plot to free up memory

# Example usage, assuming 'merged_df' and 'words' are defined
for word in words:
    plot_word_frequency(word, merged_df)

#### Extract top 10 words for each age

In [None]:
# Re-process the merged data with corrected approach for basic stopwords
word_freq_percentage_by_age_corrected_total = process_data_for_age_with_basic_stopwords_corrected(merged_data, 'cleaned_hm', 'age')
word_freq_percentage_by_age_corrected_male = process_data_for_age_with_basic_stopwords_corrected(male_data, 'cleaned_hm', 'age')
word_freq_percentage_by_age_corrected_female = process_data_for_age_with_basic_stopwords_corrected(female_data, 'cleaned_hm', 'age')

In [None]:
def extract_word_values(word_freq_data, target_word):
    """
    Extracts and returns the values associated with the target word for each age group
    in the provided data structure.

    Parameters:
    - word_freq_data: A dictionary with age groups as keys and lists of (word, value) tuples as values.
    - target_word: The word for which values are to be extracted across all age groups.

    Returns:
    - results_df: A pandas DataFrame with two columns: 'Age' and '{target_word} Value', where each row corresponds
      to an age group and its value for the target word. If the target word is not present, the value will be None.
    """
    # Initialize an empty dictionary to store the results
    results = {}
    
    # Iterate over each age group in the data
    for age, word_values in word_freq_data.items():
        # Initialize the value for the target word as None for each age group
        value_for_target_word = None
        
        # Search for the target word entry
        for word, value in word_values:
            if word == target_word:
                value_for_target_word = value
                break  # Stop searching once the target word is found
        
        # Assign the found value or None to the results dictionary
        results[age] = value_for_target_word

    # Convert the dictionary to a DataFrame and dynamically name the 'Value' column based on the target word
    results_df = pd.DataFrame(list(results.items()), columns=['Age', f'{target_word} Value'])

    # Return the DataFrame
    return results_df

def merge_word_values_by_age(word_freq_data, words):
    """
    Merges the values for a list of words across age groups into a single DataFrame.

    Parameters:
    - word_freq_data: A dictionary with age groups as keys and lists of (word, value) tuples as values.
    - words: A list of words to extract and merge values for.

    Returns:
    - merged_results_df: A pandas DataFrame with age groups as rows and each word's values as columns.
    """
    # Initialize an empty DataFrame to hold the merged results
    merged_results_df = pd.DataFrame()

    # Iterate over each word to extract its values and merge the results
    for word in words:
        # Extract values for the current word
        results_df = extract_word_values(word_freq_data, word)
        
        # Rename the column to reflect the current word's values
        results_df.rename(columns={f'{word} Value': f'{word}_value'}, inplace=True)
        
        # If it's the first word, initialize the merged DataFrame with the age and word's value
        if merged_results_df.empty:
            merged_results_df = results_df
        else:
            # For subsequent words, merge on 'Age' to ensure alignment across age groups
            merged_results_df = pd.merge(merged_results_df, results_df, on='Age', how='outer')

    # Return the final merged DataFrame
    return merged_results_df

### Extract word frequency table for total, male, and female

In [None]:
word_frequency_total = merge_word_values_by_age(word_freq_percentage_by_age_corrected_total, words)
word_frequency_male = merge_word_values_by_age(word_freq_percentage_by_age_corrected_male, words)
word_frequency_female = merge_word_values_by_age(word_freq_percentage_by_age_corrected_female, words)

In [None]:
word_frequency_female