In [1]:
#Using spacy for NER
import spacy
import pandas as pd
import re
import os

In [2]:
# Read the Reddit posts CSV into `df` (relative path, resolved from the current working directory).
df = pd.read_csv("./../../data/final_reddit_data.csv")
# Display only the first row as a quick look at the available columns.
df.head(1)

Unnamed: 0,title,score,id,subreddit,url,num_comments,num_upvotes,selftext,created,vader_title_sentiment,vader_selftext_sentiment,vader_sentiment,final_risk_score,risk_classification
0,mom died 3 hour ago,2861,kbqsnq,SuicideWatch,https://www.reddit.com/r/SuicideWatch/comments...,161,2861,thought id ready day realized thats impossible...,2020-12-12 15:22:24,-0.5574,0.846,positive,3.34,Medium Risk


In [6]:
import pandas as pd
import numpy as np
import re
import spacy
import geocoder
from collections import Counter
import folium
from folium.plugins import HeatMap
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import time

# Load the data
def load_data(file_path):
    """Read the posts CSV at ``file_path`` into a DataFrame.

    Two progress lines are printed to stdout: one before reading and one
    reporting how many rows were read.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    print("Loading data...")
    posts = pd.read_csv(file_path)
    row_count = len(posts)
    print(f"Loaded {row_count} posts")
    return posts

# Extract locations using spaCy's named entity recognition
def extract_locations_spacy(df):
    """Extract place mentions from every post with spaCy named-entity recognition.

    The title and selftext of each row are joined into one text and run through
    the ``en_core_web_lg`` pipeline. Every entity labelled ``GPE`` or ``LOC``
    becomes one output row, so a place mentioned twice in a post yields two rows.

    Args:
        df: DataFrame with the columns ``title``, ``selftext``, ``id``,
            ``final_risk_score`` and ``risk_classification``.

    Returns:
        DataFrame with the columns ``post_id``, ``location``, ``risk_score`` and
        ``risk_class``. The columns exist even when no location was found, so
        callers can test ``len(result) == 0`` safely.
    """
    print("Loading spaCy model...")
    # Use a more accurate model for NER
    nlp = spacy.load("en_core_web_lg")

    print("Extracting locations using spaCy...")

    def _as_text(value):
        # An empty CSV field arrives as NaN; formatting it directly would put
        # the literal word "nan" into the text handed to the model.
        return '' if pd.isna(value) else str(value)

    # Combine title and selftext for processing
    combined_texts = [
        f"{_as_text(title)} {_as_text(selftext)}"
        for title, selftext in zip(df['title'], df['selftext'])
    ]

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp(text) for every single post.
    docs = nlp.pipe(combined_texts)
    post_fields = zip(df['id'], df['final_risk_score'], df['risk_classification'], docs)

    locations = []
    for post_id, risk_score, risk_class, doc in tqdm(post_fields, total=len(df)):
        for ent in doc.ents:
            # Keep location entities only (GPE, LOC)
            if ent.label_ in ("GPE", "LOC"):
                # Associate the location with the post's risk information
                locations.append({
                    'post_id': post_id,
                    'location': ent.text,
                    'risk_score': risk_score,
                    'risk_class': risk_class
                })

    # Fix the column set explicitly: a DataFrame built from an empty list would
    # otherwise have no columns and the summary line below would raise KeyError.
    columns = ['post_id', 'location', 'risk_score', 'risk_class']
    loc_df = pd.DataFrame(locations, columns=columns)
    print(f"Found {len(loc_df)} location mentions in {len(set(loc_df['post_id']))} posts")
    return loc_df

# Alternative location extraction using regex and gazetteer approach
def extract_locations_regex(df):
    """Find mentions of a fixed list of well-known places with regular expressions.

    Each post's title and selftext are joined and searched, case-insensitively
    and on word boundaries, for every entry of a built-in gazetteer. A post
    contributes at most one row per gazetteer entry. Overlapping entries are
    reported independently: text containing "south america" also matches
    "america", and "new york state" also matches "new york".

    Args:
        df: DataFrame with the columns ``title``, ``selftext``, ``id``,
            ``final_risk_score`` and ``risk_classification``.

    Returns:
        DataFrame with the columns ``post_id``, ``location``, ``risk_score`` and
        ``risk_class``. The columns exist even when no location was found.
    """
    print("Extracting locations using regex patterns...")

    # List of common locations to check (countries, states, major cities)
    common_locations = [
        # Major US cities
        'new york', 'los angeles', 'chicago', 'houston', 'phoenix', 'philadelphia',
        'san antonio', 'san diego', 'dallas', 'austin', 'seattle', 'boston', 'miami',
        'denver', 'atlanta', 'portland', 'san francisco', 'nashville', 'baltimore',
        # US states
        'california', 'texas', 'florida', 'new york state', 'pennsylvania', 'illinois',
        'ohio', 'georgia', 'michigan', 'north carolina', 'new jersey',
        # Countries
        'usa', 'united states', 'america', 'canada', 'mexico', 'uk', 'united kingdom',
        'england', 'australia', 'india', 'germany', 'france', 'japan', 'china',
        # Regions
        'europe', 'asia', 'africa', 'south america', 'middle east'
    ]

    # Compile regex patterns for each location (word boundary to avoid partial matches)
    patterns = {loc: re.compile(r'\b' + re.escape(loc) + r'\b', re.IGNORECASE) for loc in common_locations}

    def _as_text(value):
        # An empty CSV field arrives as NaN; do not let it become the word "nan".
        return '' if pd.isna(value) else str(value)

    post_fields = zip(
        df['id'], df['title'], df['selftext'],
        df['final_risk_score'], df['risk_classification']
    )

    locations = []

    # Process each post
    for post_id, title, selftext, risk_score, risk_class in tqdm(post_fields, total=len(df)):
        combined_text = f"{_as_text(title)} {_as_text(selftext)}"

        # Check for location matches
        for loc, pattern in patterns.items():
            if pattern.search(combined_text):
                locations.append({
                    'post_id': post_id,
                    'location': loc,
                    'risk_score': risk_score,
                    'risk_class': risk_class
                })

    # Fix the column set explicitly: a DataFrame built from an empty list would
    # otherwise have no columns and the summary line below would raise KeyError.
    columns = ['post_id', 'location', 'risk_score', 'risk_class']
    loc_df = pd.DataFrame(locations, columns=columns)
    print(f"Found {len(loc_df)} location mentions in {len(set(loc_df['post_id']))} posts")
    return loc_df

# Geocode locations to get coordinates
def geocode_locations(loc_df):
    """Attach coordinates to every location mention.

    Each distinct value of ``loc_df['location']`` is looked up once with
    ``geocoder.arcgis``. When the lookup fails (either a not-ok response or an
    exception), a small table of hard-coded coordinates for a few very common
    names is consulted. Mentions that still have no coordinates are dropped.

    Args:
        loc_df: DataFrame with a ``location`` column (one row per mention).

    Returns:
        A copy of ``loc_df`` with the extra columns ``lat``, ``lng`` and
        ``full_address``, restricted to rows that could be geocoded. An empty
        input yields an empty frame that still carries the three new columns.
    """
    print("Geocoding locations...")

    # Nothing to look up; also covers a column-less empty frame, for which
    # loc_df['location'] would raise KeyError.
    if loc_df.empty:
        geocoded_df = loc_df.assign(
            lat=pd.Series(dtype='float64'),
            lng=pd.Series(dtype='float64'),
            full_address=pd.Series(dtype='object'),
        )
        print(f"Successfully geocoded {len(geocoded_df)} out of {len(loc_df)} location mentions")
        return geocoded_df

    # Fallback for some common locations, keyed by lower-cased name.
    # NOTE(review): the UK and Europe entries look like city-level points rather
    # than region centroids - confirm this is the intended placement.
    united_kingdom = {'lat': 51.5074, 'lng': -0.1278, 'address': 'United Kingdom'}
    united_states = {'lat': 39.8283, 'lng': -98.5795, 'address': 'United States'}
    europe = {'lat': 48.8566, 'lng': 2.3522, 'address': 'Europe'}
    fallback_coordinates = {
        'uk': united_kingdom,
        'united kingdom': united_kingdom,
        'usa': united_states,
        'united states': united_states,
        'america': united_states,
        'europe': europe,
    }

    # Group by location to avoid redundant geocoding
    unique_locations = loc_df['location'].unique()

    # Dictionary to store geocoded results
    geo_dict = {}

    for loc in tqdm(unique_locations):
        result = None
        errored = False
        try:
            # Use geocoder to get coordinates
            g = geocoder.arcgis(loc)
            if g.ok:
                result = {'lat': g.lat, 'lng': g.lng, 'address': g.address}
        except Exception as e:
            # Best effort: one failing lookup must not abort the whole run.
            errored = True
            print(f"Error geocoding {loc}: {e}")
        finally:
            # Add a small delay to avoid rate limiting (also after a failure,
            # so that repeated errors do not hammer the service).
            time.sleep(0.2)

        if result is None:
            # Applies to not-ok responses and to exceptions alike.
            result = fallback_coordinates.get(str(loc).lower())

        if result is not None:
            geo_dict[loc] = result
        elif not errored:
            print(f"Could not geocode: {loc}")

    # Add coordinates to the location dataframe
    geocoded_df = loc_df.copy()
    for column, key in (('lat', 'lat'), ('lng', 'lng'), ('full_address', 'address')):
        geocoded_df[column] = geocoded_df['location'].map(
            lambda name, key=key: geo_dict.get(name, {}).get(key)
        )

    # Remove rows with failed geocoding
    geocoded_df = geocoded_df.dropna(subset=['lat', 'lng'])

    print(f"Successfully geocoded {len(geocoded_df)} out of {len(loc_df)} location mentions")
    return geocoded_df

# Create a folium heatmap
def create_folium_heatmap(geocoded_df):
    """Build a Folium heatmap of geocoded mentions and save it as HTML.

    Every mention contributes one heat point weighted by its ``risk_score``.
    The five locations with the most mentions additionally get a marker whose
    popup shows the mention count and the mean risk score.

    Note that the ``count`` column of the returned table (shown as "Posts" in
    the popup) counts rows of ``geocoded_df``, i.e. mentions, not distinct posts.

    Args:
        geocoded_df: DataFrame with the columns ``location``, ``lat``, ``lng``,
            ``full_address`` and ``risk_score``.

    Returns:
        Tuple ``(map, top_locations)`` where ``top_locations`` has the columns
        ``location``, ``lat``, ``lng``, ``full_address``, ``count`` and ``mean``.

    Side effects:
        Writes ``reddit_crisis_heatmap.html`` to the current working directory.
    """
    import html  # local import: only needed to escape popup text

    print("Creating Folium heatmap...")

    top_columns = ['location', 'lat', 'lng', 'full_address', 'count', 'mean']

    if geocoded_df.empty:
        # The mean of no rows is NaN and cannot be used as a map centre; fall
        # back to (0, 0) and produce a map without layers.
        m = folium.Map(location=[0, 0], zoom_start=3)
        top_locations = pd.DataFrame(columns=top_columns)
    else:
        # Create a map centered on the average coordinates
        center_lat = geocoded_df['lat'].mean()
        center_lng = geocoded_df['lng'].mean()
        m = folium.Map(location=[center_lat, center_lng], zoom_start=3)

        # Prepare data for heatmap, weighted by risk score. Rows without a
        # risk score are left out because NaN is not a usable heat weight.
        heat_rows = geocoded_df.dropna(subset=['risk_score'])
        heat_data = heat_rows[['lat', 'lng', 'risk_score']].values.tolist()

        # Add heatmap layer
        if heat_data:
            HeatMap(heat_data, radius=15, blur=10, gradient={0.4: 'blue', 0.65: 'lime', 0.8: 'orange', 1: 'red'}).add_to(m)

        # Add markers for top locations
        top_locations = geocoded_df.groupby(['location', 'lat', 'lng', 'full_address'])['risk_score'].agg(['count', 'mean']).reset_index()
        top_locations = top_locations.sort_values('count', ascending=False).head(5)

        for _, row in top_locations.iterrows():
            # The location text originates from user-written posts, so escape
            # it before embedding it in the popup's HTML.
            safe_name = html.escape(str(row['location']))
            folium.Marker(
                location=[row['lat'], row['lng']],
                popup=f"<b>{safe_name}</b><br>Posts: {row['count']}<br>Avg Risk: {row['mean']:.2f}",
                icon=folium.Icon(color='darkred', icon='info-sign')
            ).add_to(m)

    # Save the map
    m.save('reddit_crisis_heatmap.html')
    print("Saved Folium heatmap to 'reddit_crisis_heatmap.html'")
    return m, top_locations

# Create Plotly visualization
def create_plotly_map(geocoded_df):
    """Plot one bubble per location on a Plotly world map and save it as HTML.

    Mentions are grouped by ``(location, lat, lng)``. The bubble size is the
    number of distinct posts mentioning the location; the colour is a risk
    category derived from the mean risk score of its mentions:
    at most 3 is "Low Risk", above 3 up to 5 is "Medium Risk", above 5 is
    "High Risk".

    Args:
        geocoded_df: DataFrame with the columns ``location``, ``lat``, ``lng``,
            ``post_id`` and ``risk_score``.

    Returns:
        Tuple ``(figure, location_summary)`` where ``location_summary`` has the
        columns ``location``, ``lat``, ``lng``, ``count``, ``avg_risk`` and
        ``risk_category``.

    Side effects:
        Writes ``reddit_crisis_plotly_map.html`` to the current working directory.
    """
    print("Creating Plotly map visualization...")

    # Group by location
    location_summary = geocoded_df.groupby(['location', 'lat', 'lng']).agg(
        count=pd.NamedAgg(column='post_id', aggfunc='nunique'),
        avg_risk=pd.NamedAgg(column='risk_score', aggfunc='mean')
    ).reset_index()

    # Create risk categories for color coding (Low, Medium, High).
    # The outer bin edges are open-ended: with the closed edges 0 and 10 an
    # average of exactly 0 (pd.cut intervals exclude their left edge) or any
    # value above 10 would get no category and lose its colour on the map.
    # Averages inside (0, 10] are categorised exactly as before.
    location_summary['risk_category'] = pd.cut(
        location_summary['avg_risk'],
        bins=[float('-inf'), 3, 5, float('inf')],
        labels=['Low Risk', 'Medium Risk', 'High Risk']
    )

    # Create figure
    fig = px.scatter_geo(
        location_summary,
        lat='lat',
        lon='lng',
        color='risk_category',
        size='count',
        hover_name='location',
        hover_data={
            'lat': False,
            'lng': False,
            'count': True,
            'avg_risk': ':.2f',
            'risk_category': True
        },
        projection='natural earth',
        title='Crisis Mentions by Location in Reddit Posts',
        color_discrete_map={
            'Low Risk': 'green',
            'Medium Risk': 'orange',
            'High Risk': 'red'
        }
    )

    fig.update_layout(
        height=600,
        geo=dict(
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            coastlinecolor='rgb(255, 255, 255)',
            showocean=True,
            oceancolor='rgb(220, 230, 255)'
        )
    )

    # Save as HTML file
    fig.write_html('reddit_crisis_plotly_map.html')
    print("Saved Plotly map to 'reddit_crisis_plotly_map.html'")

    return fig, location_summary

# Generate top locations report
def generate_top_locations_report(geocoded_df):
    """Print and return three rankings of the mentioned locations.

    Per location the report computes:

    * ``post_count``: number of distinct posts mentioning it,
    * ``avg_risk_score``: mean ``risk_score`` over its mentions,
    * ``high_risk_count``: number of distinct posts classified ``'High Risk'``
      that mention it.

    Args:
        geocoded_df: DataFrame with the columns ``location``, ``post_id``,
            ``risk_score`` and ``risk_class`` (one row per mention).

    Returns:
        Tuple of three DataFrames, each limited to ten rows: locations ranked
        by post count, by average risk score (only locations with at least two
        posts), and by high-risk post count.
    """
    print("Generating top locations report...")

    # Group by location and calculate statistics
    location_stats = geocoded_df.groupby('location').agg(
        post_count=pd.NamedAgg(column='post_id', aggfunc='nunique'),
        avg_risk_score=pd.NamedAgg(column='risk_score', aggfunc='mean')
    ).reset_index()

    # Count distinct high-risk posts, not high-risk rows: a post that mentions
    # the same place several times has several rows, and counting rows could
    # make high_risk_count larger than post_count.
    high_risk_rows = geocoded_df[geocoded_df['risk_class'] == 'High Risk']
    high_risk_posts = high_risk_rows.groupby('location')['post_id'].nunique()
    location_stats['high_risk_count'] = (
        location_stats['location'].map(high_risk_posts).fillna(0).astype(int)
    )

    # Sort by post count
    top_locations_by_count = location_stats.sort_values('post_count', ascending=False).head(10)

    # Sort by average risk score (considering only locations with at least 2 posts)
    top_locations_by_risk = location_stats[location_stats['post_count'] >= 2].sort_values('avg_risk_score', ascending=False).head(10)

    # Sort by high risk posts
    top_locations_by_high_risk = location_stats.sort_values('high_risk_count', ascending=False).head(10)

    print("\nTop 5 Locations by Post Count:")
    print(top_locations_by_count[['location', 'post_count', 'avg_risk_score']].head(5))

    print("\nTop 5 Locations by Average Risk Score (min 2 posts):")
    print(top_locations_by_risk[['location', 'post_count', 'avg_risk_score']].head(5))

    print("\nTop 5 Locations by High Risk Post Count:")
    print(top_locations_by_high_risk[['location', 'high_risk_count', 'post_count']].head(5))

    return top_locations_by_count, top_locations_by_risk, top_locations_by_high_risk

# Main function to run the complete analysis pipeline
def main(data_file='./../../data/final_reddit_data.csv'):
    """Run the complete location analysis pipeline.

    Steps: load the posts, extract location mentions (spaCy first, regex as a
    fallback when spaCy fails or finds nothing), geocode them, write the Folium
    and Plotly maps, and print the top-locations report. The pipeline stops
    early with a message when no location is found or none can be geocoded,
    since the map and report steps need at least one geocoded mention.

    Args:
        data_file: Path of the posts CSV. Defaults to the path that was
            previously hard-coded, so ``main()`` behaves as before.
    """
    # Step 1: Load the data
    df = load_data(data_file)

    # Step 2: Extract locations
    # Option 1: Using spaCy NER (more accurate but slower)
    try:
        loc_df = extract_locations_spacy(df)
    except Exception as e:
        print(f"Error with spaCy extraction: {e}")
        print("Falling back to regex extraction...")
        loc_df = extract_locations_regex(df)
    else:
        # Only retry with regex when spaCy itself ran and came back empty;
        # after an error the regex extraction has already been performed.
        if len(loc_df) == 0:
            print("No locations found with spaCy, trying regex approach...")
            loc_df = extract_locations_regex(df)

    if len(loc_df) == 0:
        print("No locations found; skipping geocoding, maps and report.")
        return

    # Step 3: Geocode the locations
    geocoded_df = geocode_locations(loc_df)

    if len(geocoded_df) == 0:
        print("None of the locations could be geocoded; skipping maps and report.")
        return

    # Step 4: Create visualizations
    create_folium_heatmap(geocoded_df)
    create_plotly_map(geocoded_df)

    # Step 5: Generate report on top locations
    generate_top_locations_report(geocoded_df)

    print("\nAnalysis complete! Output files:")
    print("- reddit_crisis_heatmap.html (Folium heatmap)")
    print("- reddit_crisis_plotly_map.html (Plotly interactive map)")
    
# Entry point: run the pipeline when this code executes as the main module,
# which includes running the cell in the notebook kernel (see the output below).
if __name__ == "__main__":
    main()

Loading data...
Loaded 1186 posts
Loading spaCy model...
Extracting locations using spaCy...


100%|██████████| 1186/1186 [00:31<00:00, 37.74it/s]


Found 117 location mentions in 80 posts
Geocoding locations...


100%|██████████| 80/80 [01:20<00:00,  1.01s/it]

Successfully geocoded 117 out of 117 location mentions
Creating Folium heatmap...
Saved Folium heatmap to 'reddit_crisis_heatmap.html'
Creating Plotly map visualization...
Saved Plotly map to 'reddit_crisis_plotly_map.html'
Generating top locations report...

Top 5 Locations by Post Count:
      location  post_count  avg_risk_score
77          uk           7        4.318889
20      europe           5        5.446000
9   california           4        2.215000
42      mexico           4        4.952500
54          ny           3        7.190000

Top 5 Locations by Average Risk Score (min 2 posts):
   location  post_count  avg_risk_score
32    japan           2          7.8250
54       ny           3          7.1900
20   europe           5          5.4460
72    texas           2          5.4050
42   mexico           4          4.9525

Top 5 Locations by High Risk Post Count:
     location  high_risk_count  post_count
54         ny                2           3
52  nova rock                




In [10]:
import pandas as pd
import spacy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium
from folium.plugins import HeatMap

# Load dataset (relative path, resolved from the current working directory)
data_path = './../../data/final_reddit_data.csv'
df = pd.read_csv(data_path)

# Initialize spaCy model for NER (place recognition)
# en_core_web_trf is more accurate but larger; fallback to en_core_web_sm if needed.
# The fallback is taken when spacy.load raises OSError (presumably because the
# larger model is not installed - confirm). The results further down depend on
# which of the two models ended up in `nlp`.
try:
    nlp = spacy.load('en_core_web_trf')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Function to extract GPE entities from text
def extract_locations(text):
    """Return the text of every ``GPE`` entity that the loaded model finds in ``text``.

    Args:
        text: The post text. Missing values (``None``/NaN, as pandas produces
            for empty CSV fields) are accepted.

    Returns:
        List of entity strings in document order; an empty list for missing
        text. Repeated mentions are kept.
    """
    # Without this guard str(NaN) would hand the literal word "nan" to the
    # model for every post that has no body.
    if pd.isna(text):
        return []
    doc = nlp(str(text))
    return [ent.text for ent in doc.ents if ent.label_ == 'GPE']

# Apply extraction to posts (only the post body is scanned, not the title)
df['locations'] = df['selftext'].apply(extract_locations)

# Explode to one location mention per row; posts whose list is empty become
# NaN in the exploded column and are dropped here
df_exploded = df.explode('locations').dropna(subset=['locations'])

# Count mentions per location. A post that names the same place twice is
# counted twice, so 'count' is a number of mentions rather than of posts.
# Assuming all posts are crisis-related; if not, filter by a crisis keyword column
location_counts = df_exploded['locations'].value_counts().rename_axis('location').reset_index(name='count')

# Initialize geocoder with rate limiter
geolocator = Nominatim(user_agent='crisis_heatmap_app')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Geocode unique locations
location_counts['geocode'] = location_counts['location'].apply(geocode)
# Drop failed geocodes; .copy() makes the result an independent frame so the
# column assignments below do not trigger pandas' SettingWithCopyWarning
location_counts = location_counts.dropna(subset=['geocode']).copy()
# Extract lat/lon
location_counts['latitude'] = location_counts['geocode'].apply(lambda x: x.latitude)
location_counts['longitude'] = location_counts['geocode'].apply(lambda x: x.longitude)

# Get top 5 locations
top5 = location_counts.nlargest(5, 'count')
print("Top 5 locations with highest crisis discussions:")
print(top5[['location', 'count']])

# Create base Folium map
# Center on the mean coordinates; with no geocoded location the mean would be
# NaN, which is not a usable map centre, so fall back to (0, 0)
if location_counts.empty:
    map_center = [0, 0]
else:
    map_center = [location_counts['latitude'].mean(), location_counts['longitude'].mean()]
folium_map = folium.Map(location=map_center, zoom_start=2)

# Prepare heat data
heat_data = list(zip(location_counts['latitude'], location_counts['longitude'], location_counts['count']))

# Add heatmap layer (skipped when there is nothing to draw)
if heat_data:
    HeatMap(heat_data, radius=15, max_zoom=10).add_to(folium_map)

# Save map to HTML
output_map = 'crisis_heatmap.html'
folium_map.save(output_map)
print(f"Heatmap saved to {output_map}")

# Optionally, display within a Jupyter environment.
# `display` is imported explicitly: it only exists as a builtin inside an
# IPython kernel, so relying on it would raise NameError elsewhere.
try:
    from IPython.display import IFrame, display
except ImportError:
    pass
else:
    display(IFrame(output_map, width=800, height=600))


Top 5 locations with highest crisis discussions:
  location  count
0   phobia      6
1    kinda      5
2     meth      5
3    china      5
4  florida      4
Heatmap saved to crisis_heatmap.html
