# **Importing Libraries**

In [None]:
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import plotly.express as px
import numpy as np
nltk.download('punkt')
nltk.download('punkt_tab')

# **Loading Data**

In [None]:
# Path of the posts CSV, resolved relative to the notebook's working directory.
csv_file = "mental_health_reddit_posts.csv"
# Raw posts frame; the analysis functions below are called with this frame and
# read their text from its "preprocessed_content" column by default.
df = pd.read_csv(csv_file)
# Empty dict; nothing in the visible cells writes to or reads from it.
results = {}

# **Topic-based regional analysis**

In [None]:
def regional_topic_analysis(df, text_column="preprocessed_content", top_n=20):
    """
    Group posts by broad geographic region and list each region's frequent words.

    A post is assigned to a region when its text contains at least one of the
    region's keywords as a whole word (case-insensitive). A post that mentions
    several regions is counted once in each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts to scan; must contain ``text_column``.
    text_column : str
        Column holding the post text. Every value is converted with ``str()``.
    top_n : int
        Maximum number of words reported per region.

    Returns
    -------
    tuple[dict, dict]
        ``region_counts`` maps every region to its number of matching posts.
        ``region_top_words`` maps each region that has at least one matching
        post to up to ``top_n`` of its most frequent tokens, keeping only
        alphabetic tokens longer than three characters.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    print("Performing regional topic analysis...")

    # Define regions (a simplified approach without precise geocoding)
    us_regions = {
        "Northeast": ["NY", "NYC", "New York", "Boston", "Philadelphia", "NJ", "Jersey", "CT", "Maine", "NH", "VT", "RI", "MA"],
        "Midwest": ["Chicago", "IL", "Michigan", "Ohio", "Minnesota", "Wisconsin", "Iowa", "Missouri", "Indiana", "Detroit"],
        "South": ["TX", "Texas", "FL", "Florida", "Georgia", "NC", "SC", "Tennessee", "Kentucky", "Alabama", "Louisiana", "Virginia"],
        "West": ["CA", "California", "LA", "Oregon", "Washington", "CO", "Colorado", "Arizona", "Nevada", "Utah", "Portland", "Seattle"],
        "International": ["UK", "London", "Canada", "Australia", "Germany", "France", "Europe", "Asia", "Africa", "Mexico"]
    }

    # Compile one whole-word alternation per region up front instead of
    # rebuilding a regex for every (post, keyword) pair.
    # NOTE(review): matching is case-insensitive, so two-letter keywords such
    # as "LA", "MA" or "CO" also match the lower-case tokens "la", "ma", "co";
    # confirm this is acceptable for the preprocessed text.
    region_patterns = {
        region: re.compile(
            r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b',
            re.IGNORECASE,
        )
        for region, keywords in us_regions.items()
    }

    region_counts = {region: 0 for region in us_regions}
    region_texts = {region: [] for region in us_regions}

    # Scan posts for region mentions; only the text column is needed.
    for raw_text in df[text_column]:
        text = str(raw_text)
        for region, pattern in region_patterns.items():
            if pattern.search(text):
                region_counts[region] += 1
                region_texts[region].append(text)

    # Analyze common words in each region
    region_top_words = {}
    for region, texts in region_texts.items():
        if not texts:
            continue
        words = nltk.word_tokenize(" ".join(texts).lower())
        fdist = nltk.FreqDist(words)
        # Filter BEFORE truncating: cutting to the top tokens first lets
        # punctuation and short tokens crowd out the informative words.
        content_words = [
            word for word, _ in fdist.most_common()
            if len(word) > 3 and word.isalpha()
        ]
        region_top_words[region] = content_words[:top_n]

    return region_counts, region_top_words

In [None]:
region_counts, region_topics = regional_topic_analysis(df)

# How many posts were assigned to each region
print("\n Regional distribution of posts:")
for region in region_counts:
    count = region_counts[region]
    print(f"  {region}: {count} posts")

# Up to ten frequent words per region; regions with no words are skipped
print("\n Top topics by region:")
for region, words in region_topics.items():
    if not words:
        continue
    print(f"  {region}: {', '.join(words[:10])}")

Performing regional topic analysis...

 Regional distribution of posts:
  Northeast: 8 posts
  Midwest: 0 posts
  South: 6 posts
  West: 7 posts
  International: 22 posts

 Top topics by region:
  Northeast: panic, like, feel, attacks, anxiety, time, going, heart, symptoms, also
  South: help, dont, time, really, told, still, everything, know, years, like
  West: dont, like, time, anxiety, life, help, know, feel, really, much
  International: dont, like, know, time, years, want, feel, would, alcohol, fucking


# **US State Frequency Analysis with Visualization**

In [None]:
# Two-letter postal abbreviation -> full name for the 50 US states plus the
# District of Columbia (51 entries). Read by us_state_analysis below.
states = {
        "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
        "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
        "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
        "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas",
        "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
        "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi",
        "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada",
        "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York",
        "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
        "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
        "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah",
        "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia",
        "WI": "Wisconsin", "WY": "Wyoming", "DC": "District of Columbia"
}

In [None]:
# US State Frequency Analysis with Visualization

def us_state_analysis(df, text_column="preprocessed_content", state_names=None):
    """
    Count, for every US state, the number of posts that mention it.

    A post mentions a state when its text contains the full state name as a
    whole word or phrase (case-insensitive) or the state's abbreviation as a
    whole word (case-sensitive). Each post contributes at most 1 to a state,
    however often and in whichever form the state appears. A full name that is
    part of a longer matched name is not counted separately, so
    "West Virginia" does not also count as "Virginia".

    Parameters
    ----------
    df : pandas.DataFrame
        Posts to scan; must contain ``text_column``.
    text_column : str
        Column holding the post text. Every value is converted with ``str()``.
    state_names : dict or None
        Mapping of abbreviation -> full state name. Defaults to the
        notebook-level ``states`` mapping.

    Returns
    -------
    dict
        Full state name -> number of posts mentioning that state. Every state
        in the mapping is present, with 0 when it is never mentioned.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    print("Analyzing US state mentions...")

    if state_names is None:
        state_names = states

    # Initialize state counts
    state_counts = {state: 0 for state in state_names.values()}
    if not state_names:
        return state_counts

    # One capture group per state, longest first. `lastindex` then identifies
    # which state matched, and a longer name ("West Virginia") consumes the
    # text before a shorter name it contains ("Virginia") can match inside it.
    names_by_length = sorted(state_names.values(), key=len, reverse=True)
    name_pattern = re.compile(
        r'\b(?:' + '|'.join('(' + re.escape(name) + ')' for name in names_by_length) + r')\b',
        re.IGNORECASE,
    )

    # Abbreviations stay case-sensitive, as lower-case "in", "or", "me", "ok"
    # would match almost every post.
    # NOTE(review): upper-case "OK", "IN", "OR", "ME", "HI" written as ordinary
    # words are still counted as states; interpret those counts with care.
    abbrs_by_length = sorted(state_names, key=len, reverse=True)
    abbr_pattern = re.compile(
        r'\b(?:' + '|'.join('(' + re.escape(abbr) + ')' for abbr in abbrs_by_length) + r')\b'
    )

    # Count state mentions in posts
    for raw_text in df[text_column]:
        text = str(raw_text)

        # A set makes each post count once per state, whether the state is
        # named in full, abbreviated, or both.
        mentioned = {
            names_by_length[match.lastindex - 1]
            for match in name_pattern.finditer(text)
        }
        mentioned.update(
            state_names[abbrs_by_length[match.lastindex - 1]]
            for match in abbr_pattern.finditer(text)
        )

        for state in mentioned:
            state_counts[state] += 1

    return state_counts

In [None]:
state_counts = us_state_analysis(df)
print("\n Top 5 locations with the highest crisis discussions:")

# Rank (state, count) pairs from most to fewest mentions
sorted_states = list(state_counts.items())
sorted_states.sort(key=lambda pair: pair[1], reverse=True)

# Show the five leading states, leaving out any with no mentions
for state, count in sorted_states[:5]:
    if count <= 0:
        continue
    print(f"  {state}: {count} mentions")


Analyzing US state mentions...

 Top 5 locations with the highest crisis discussions:
  Oklahoma: 7 mentions
  Connecticut: 3 mentions
  Florida: 3 mentions
  California: 2 mentions
  Louisiana: 2 mentions


# **Generating a heatmap**

In [None]:
# `pd` and `px` come from the notebook's import cell at the top.

# Derive the name -> abbreviation lookup from the notebook's `states` mapping
# so the two tables cannot drift apart.
state_to_abbr = {name: abbr for abbr, name in states.items()}

# Tabulate the ranked per-state counts under a dedicated name. Rebinding `df`
# here would discard the raw posts frame and break re-running earlier cells.
state_counts_df = pd.DataFrame(sorted_states, columns=['State', 'Value']).assign(
    # Plotly's 'USA-states' location mode expects two-letter state codes.
    Code=lambda frame: frame['State'].map(state_to_abbr)
)

# Create map with abbreviations
fig = px.choropleth(
    state_counts_df,
    locations='Code',
    locationmode='USA-states',
    color='Value',
    scope="usa",
    color_continuous_scale="Viridis",
    title='US State Values Heat Map',
    hover_name='State',
    # Show only the count in the tooltip; the state name is already its title.
    hover_data={'Value': True, 'State': False, 'Code': False}
)

fig.show()