# Feature Engineering

In [6]:
import math
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords

# Download stopwords if not already present
nltk.download('stopwords')

# Load the dataset
file_path = "sentiment_data.csv"  # Update the file path if needed
df = pd.read_csv(file_path)

# Ensure all statements are strings. Missing values are replaced with an empty
# string first: astype(str) alone would turn NaN into the literal text "nan",
# which would then be measured and keyword-matched like a real statement.
df["statement"] = df["statement"].fillna("").astype(str)

# Step 1: Fix sentiment column name
# "sentiment" takes precedence; "sentiment_score" is the fallback. If neither
# exists, fail here with a clear message instead of later with a bare KeyError.
print("\nIdentifying the correct sentiment column...")
if "sentiment" in df.columns:
    sentiment_column = "sentiment"
elif "sentiment_score" in df.columns:
    sentiment_column = "sentiment_score"
else:
    raise KeyError(
        f"Neither 'sentiment' nor 'sentiment_score' found in {file_path!r}; "
        f"available columns: {list(df.columns)}"
    )
print(f"Using column: {sentiment_column} for sentiment scores.\n")

# Step 2: Categorize sentiment scores
print("Categorizing sentiment scores into Positive, Neutral, and Negative...")
def categorize_sentiment(score, threshold=0.1):
    """Map a numeric sentiment score to a categorical label.

    Args:
        score: Numeric sentiment score.
        threshold: Half-width of the neutral band around zero. Scores strictly
            below ``-threshold`` are negative and scores strictly above
            ``threshold`` are positive. Defaults to 0.1.

    Returns:
        ``"negative"``, ``"positive"`` or ``"neutral"``. Scores exactly on a
        band edge are neutral. NaN compares false against both edges, so it is
        also labelled neutral.
    """
    if score < -threshold:
        return "negative"
    if score > threshold:
        return "positive"
    return "neutral"

# Label every row using the sentiment column selected above.
df["sentiment_category"] = df[sentiment_column].apply(categorize_sentiment)
print("Sentiment categorization complete.\n")

# Step 3: Feature Engineering - Compute before IQR filtering
print("Computing text features...")

# Word lists used by the counting helpers below.
stop_words = set(stopwords.words('english'))
first_person_pronouns = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}


def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


def _avg_word_length(text):
    """Return the mean token length of ``text``, rounded up to an integer.

    The small epsilon in the denominator avoids division by zero when the
    text contains no tokens.
    """
    tokens = text.split()
    total_chars = sum(map(len, tokens))
    return math.ceil(total_chars / (len(tokens) + 1e-5))


def _count_stopwords(text):
    """Return how many tokens of ``text`` are stopwords, ignoring case."""
    return sum(token.lower() in stop_words for token in text.split())


def _count_first_person_pronouns(text):
    """Return how many tokens of ``text`` are first-person pronouns, ignoring case."""
    return sum(token in first_person_pronouns for token in text.lower().split())


statements = df["statement"]
df["text_length"] = statements.apply(len)
df["word_count"] = statements.apply(_count_words)
df["avg_word_length"] = statements.apply(_avg_word_length)
df["stopword_count"] = statements.apply(_count_stopwords)
df["first_person_pronoun_count"] = statements.apply(_count_first_person_pronouns)

print("Text features computed successfully.\n")

# Step 4: Keyword Matching Feature

# These keywords were selected based on three sources
print("Matching statements with mental health-related keywords...")

# Element order is kept as-is: match_keywords reports hits in list order.
keywords = [
    'anxiety', 'anxious', 'tiredness', 'tired', 'fatigue',
    'sadness', 'sad', 'concentrate', 'concentrating', 'trembling',
    'depression', 'ocd', 'adhd', 'bipolar', 'psychosis',
    'disorder', 'ptsd', 'eating', 'flashbacks', 'panic',
    'panic attack', 'irritable', 'irritability', 'avoid', 'avoidance',
    'avoiding', 'suicidal', 'trauma', 'traumatic', 'nightmares',
]

def match_keywords(statement, keyword_list):
    """Find which keywords occur in ``statement`` as whole words or phrases.

    A keyword only counts when it is not embedded in a longer word, so
    "eating" does not match "beating" and "sad" is not counted a second time
    inside "sadness". Matching is case-sensitive; callers are expected to
    normalise case beforehand.

    Args:
        statement: Text to search.
        keyword_list: Iterable of keywords or multi-word phrases.

    Returns:
        A ``(found, count)`` tuple: ``found`` is the matched keywords joined
        with ``", "`` in ``keyword_list`` order, ``count`` is how many matched.
    """
    found_keywords = [
        keyword
        for keyword in keyword_list
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character are still bounded correctly.
        if re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", statement)
    ]
    return ', '.join(found_keywords), len(found_keywords)

# Run the matcher once per lower-cased statement, then split the resulting
# (found, count) pairs into two columns. Plain lists avoid constructing a
# pandas Series for every row, which is slow on large frames, and also work
# when the frame is empty.
keyword_matches = [match_keywords(text.lower(), keywords) for text in df['statement']]
df['keywords_found'] = [found for found, _ in keyword_matches]
df['keyword_count'] = [count for _, count in keyword_matches]

print("Keyword extraction complete.\n")

# Snapshot row count and length ranges now so the effect of the outlier
# filter can be reported afterwards.
print("Capturing statistics before removing outliers...")
initial_row_count = len(df)
initial_text_length_min, initial_text_length_max = (
    df["text_length"].min(),
    df["text_length"].max(),
)
initial_word_count_min, initial_word_count_max = (
    df["word_count"].min(),
    df["word_count"].max(),
)
print(f"Initial row count: {initial_row_count}")
print(
    "   Text Length Before Outlier Removal: "
    f"Min = {initial_text_length_min}, Max = {initial_text_length_max}"
)
print(
    "   Word Count Before Outlier Removal: "
    f"Min = {initial_word_count_min}, Max = {initial_word_count_max}\n"
)

# Step 5: Removing outliers using IQR for text length and word count
def remove_outliers_iqr(data, column, multiplier=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    both inclusive. Rows where ``column`` is NaN fail the range check and are
    dropped as well.

    Args:
        data: DataFrame to filter; it is not modified.
        column: Name of the numeric column to test.
        multiplier: How many IQRs beyond the quartiles a value may lie.
            Defaults to 1.5.

    Returns:
        A ``(filtered, lower_bound, upper_bound)`` tuple. ``filtered`` is an
        independent copy, so assigning columns on it later does not raise
        SettingWithCopyWarning.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    in_range = values.between(lower_bound, upper_bound)
    return data[in_range].copy(), lower_bound, upper_bound

print("Removing outliers using IQR method...")

# The filters run one after the other, so the word_count bounds are computed
# on the rows that already passed the text_length filter.
df, text_length_lower, text_length_upper = remove_outliers_iqr(df, "text_length")
df, word_count_lower, word_count_upper = remove_outliers_iqr(df, "word_count")

# Capture final statistics after outlier removal
final_row_count = len(df)
final_text_length_min, final_text_length_max = (
    df["text_length"].min(),
    df["text_length"].max(),
)
final_word_count_min, final_word_count_max = (
    df["word_count"].min(),
    df["word_count"].max(),
)

# Step 6: Print Changes After Outlier Removal
print("Outliers removed successfully.\n")
print("FINAL STATISTICS AFTER OUTLIER REMOVAL:")
print(
    f"   Rows before removal: {initial_row_count}, "
    f"Rows after removal: {final_row_count}, "
    f"Rows removed: {initial_row_count - final_row_count}"
)
print(f"   Text Length IQR Bounds: [{text_length_lower}, {text_length_upper}]")
print(f"   Text Length After Removal: Min = {final_text_length_min}, Max = {final_text_length_max}")
print(f"   Word Count IQR Bounds: [{word_count_lower}, {word_count_upper}]")
print(f"   Word Count After Removal: Min = {final_word_count_min}, Max = {final_word_count_max}\n")

# Step 7: Write the filtered, feature-enriched frame to disk.
# index=False keeps the DataFrame index out of the CSV.
feature_engineered_file_path = "feature_engineered_data.csv"
df.to_csv(path_or_buf=feature_engineered_file_path, index=False)
print(f"Feature engineering complete. Dataset saved as '{feature_engineered_file_path}'.")

# Preview the first five rows; this stays the last expression so the notebook displays it.
df.head(5)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Identifying the correct sentiment column...
Using column: sentiment_score for sentiment scores.

Categorizing sentiment scores into Positive, Neutral, and Negative...
Sentiment categorization complete.

Computing text features...
Text features computed successfully.

Matching statements with mental health-related keywords...
Keyword extraction complete.

Capturing statistics before removing outliers...
Initial row count: 51093
   Text Length Before Outlier Removal: Min = 2, Max = 31499
   Word Count Before Outlier Removal: Min = 1, Max = 6300

Removing outliers using IQR method...
Outliers removed successfully.

FINAL STATISTICS AFTER OUTLIER REMOVAL:
   Rows before removal: 51093, Rows after removal: 46147, Rows removed: 4946
   Text Length IQR Bounds: [-897.5, 1698.5]
   Text Length After Removal: Min = 2, Max = 1669
   Word Count IQR Bounds: [-146.5, 281.5]
   Word Count After Removal: Min = 1, Max = 281

Feature engineering complete. Dataset saved as 'feature_engineered_data.csv'.

Unnamed: 0,statement,status,sentiment_score,sentiment_category,text_length,word_count,avg_word_length,stopword_count,first_person_pronoun_count,keywords_found,keyword_count
0,oh my gosh,Anxiety,0.0,neutral,10,3,3,1,1,,0
1,trouble sleeping confused mind restless heart ...,Anxiety,-0.7269,negative,61,10,6,3,0,,0
2,all wrong back off dear forward doubt stay in ...,Anxiety,-0.7351,negative,75,14,5,5,0,,0
3,ive shifted my focus to something else but im ...,Anxiety,-0.4215,negative,59,11,5,3,1,,0
4,im restless and restless its been a month now ...,Anxiety,-0.4939,negative,66,14,4,8,0,,0
