In [3]:
import pandas as pd

In [4]:
# Load the revised geoparsed JMU Reddit CSV; the path is relative to the notebook's working directory.
jmu_reddit_geoparsed_clean= pd.read_csv('jmu_reddit_geoparsed_long_revised.csv')
# Preview the first five rows as a quick sanity check of the loaded columns.
jmu_reddit_geoparsed_clean.head()


Unnamed: 0,type,date,score,year_month,Unnamed: 4,toponyms,place,latitude,longitude,revised_place,revised_latitude,revised_longitude,place_type,false_positive,checked_by
0,comment,2014-07-18 19:50:58,1.0,2014-07,The only downside is that because it is locate...,,Kaarela,60.25174,24.88111,33,38.4144850825387,-78.8340032732009,Road,,
1,comment,2013-09-13 12:46:11,5.0,2013-09,I think I remember actually seeing his house o...,,Alytaus rajonas,54.3996,24.1252,33 near Blue Hole,38.5130275516444,-79.0497270033376,Road,,"Bailey,Kendall"
2,post,2020-02-23 12:34:56,34.0,2020-02,"Before I exit on 66, she is right behind me ag...",,66,53.55325,9.96323,81 in PA,,,,,
3,comment,2024-09-25 17:44:42,4.0,2024-09,Yeah it just works somehow A&A kebab near EMU ...,,Murcia Bus Station,37.98591,-1.13947,A&A Kabob Grill,38.4678509766741,-78.8727098744419,Building,,"Jackson,Angeline Del Rosario"
4,post,2022-12-30 16:56:44,74.0,2022-12,Spotted at the Albert Icehouse and Dance Hall ...,['Albert'],Texas,31.25044,-99.25061,Albert Icehouse and Dance Hall,30.1940358956693,-98.6007530238816,Building,,"Siegmund,Hannah Rose"


In [121]:
# Check current data types and basic info
print("Current data types:")
print(jmu_reddit_geoparsed_clean.dtypes)
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
jmu_reddit_geoparsed_clean.info()
print("\nColumn names:")
print(jmu_reddit_geoparsed_clean.columns.tolist())

Current data types:
type                  object
date                  object
score                float64
year_month            object
Unnamed: 4            object
toponyms              object
place                 object
latitude             float64
longitude             object
revised_place         object
revised_latitude      object
revised_longitude     object
place_type            object
false_positive        object
checked_by            object
dtype: object

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786 entries, 0 to 1785
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   type               1784 non-null   object 
 1   date               1784 non-null   object 
 2   score              1784 non-null   float64
 3   year_month         1784 non-null   object 
 4   Unnamed: 4         1786 non-null   object 
 5   toponyms           1044 non-null   object 
 6   place             

In [5]:
# Data type cleaning and conversions
#
# Works on a copy of the loaded frame, converts every column to its intended
# dtype, and finally rebinds `jmu_reddit_geoparsed_clean` to the converted copy.
import pandas as pd
from datetime import datetime

# Create a copy to work with
df_clean = jmu_reddit_geoparsed_clean.copy()

print("Starting data type conversions...")

# 0. Add school_name column in first position.
# Guarded so the cell can be re-run: the last line of this cell rebinds
# jmu_reddit_geoparsed_clean to the converted frame, and DataFrame.insert raises
# ValueError when the column already exists.
if 'school_name' not in df_clean.columns:
    df_clean.insert(0, 'school_name', 'JMU')
df_clean['school_name'] = df_clean['school_name'].astype(pd.StringDtype())

# 0.1. Add unique identifier based on school_name and 1-based row number (e.g. "JMU1")
df_clean['unique_id'] = df_clean['school_name'] + (df_clean.index + 1).astype(str)
df_clean['unique_id'] = df_clean['unique_id'].astype(pd.StringDtype())

# 1. type: convert to pandas string dtype
df_clean['type'] = df_clean['type'].astype(pd.StringDtype())

# 2. date: convert to datetime object (unparseable values become NaT)
df_clean['date'] = pd.to_datetime(df_clean['date'], errors='coerce')

# 3. score: convert to integer.
# NOTE(review): missing scores are replaced by 0 here, which makes them
# indistinguishable from a real score of 0 - confirm that is intended.
df_clean['score'] = df_clean['score'].fillna(0).astype('int64')

# 4. year_month: convert to datetime in year-month format (day defaults to the 1st)
df_clean['year_month'] = pd.to_datetime(df_clean['year_month'], format='%Y-%m', errors='coerce')

# 5. Unnamed: 4 -> sentences: rename and convert to pandas string dtype
if 'Unnamed: 4' in df_clean.columns:
    df_clean = df_clean.rename(columns={'Unnamed: 4': 'sentences'})
df_clean['sentences'] = df_clean['sentences'].astype(pd.StringDtype())

# 6. Delete toponyms column (if it exists)
if 'toponyms' in df_clean.columns:
    df_clean = df_clean.drop('toponyms', axis=1)

# 7. place: convert to pandas string dtype
df_clean['place'] = df_clean['place'].astype(pd.StringDtype())

# 8-9, 11-12. coordinate columns: coerce to float (non-numeric text becomes NaN)
for coordinate_column in ('latitude', 'longitude', 'revised_latitude', 'revised_longitude'):
    df_clean[coordinate_column] = pd.to_numeric(df_clean[coordinate_column], errors='coerce')

# 10. revised_place: convert to pandas string dtype
df_clean['revised_place'] = df_clean['revised_place'].astype(pd.StringDtype())

# 13. place_type: convert to pandas string dtype
df_clean['place_type'] = df_clean['place_type'].astype(pd.StringDtype())

# 14. false_positive: convert to boolean
# Handle various possible boolean representations. Python bools hash like the
# ints 1/0, so real True/False cells are matched by the `1` and `0` keys.
false_positive_lookup = {
    'True': True, 'true': True, 'TRUE': True, '1': True, 1: True,
    'False': False, 'false': False, 'FALSE': False, '0': False, 0: False,
    None: False, 'nan': False, '': False
}
# Go through the nullable 'boolean' dtype before filling: calling fillna() on
# the object-dtype result of map() relies on silent downcasting, which pandas
# has deprecated (it emits a FutureWarning). Missing or unrecognised values end
# up False, exactly as before.
df_clean['false_positive'] = (
    df_clean['false_positive']
    .map(false_positive_lookup)
    .astype('boolean')
    .fillna(False)
    .astype('bool')
)

# 15. checked_by: convert to pandas string dtype
df_clean['checked_by'] = df_clean['checked_by'].astype(pd.StringDtype())

print("Data type conversions completed!")

# Update the main dataframe
jmu_reddit_geoparsed_clean = df_clean

Starting data type conversions...
Data type conversions completed!


  }).fillna(False).astype('bool')


In [6]:
# Fill missing values with fallback data
print("Filling missing values with fallback data...")

# Each revised_* column falls back to the matching original geoparser column
# wherever the revised value is missing.
# NOTE(review): a row whose revised_place was corrected by hand but whose revised
# coordinates are blank inherits the ORIGINAL coordinates (the preview output
# shows "81 in PA" carrying the coordinates of the original match "66").
# Confirm that this fallback is wanted for such rows.
fallback_source = {
    'revised_place': 'place',
    'revised_latitude': 'latitude',
    'revised_longitude': 'longitude',
}
for revised_col, original_col in fallback_source.items():
    jmu_reddit_geoparsed_clean[revised_col] = jmu_reddit_geoparsed_clean[revised_col].fillna(
        jmu_reddit_geoparsed_clean[original_col]
    )

# Columns without a sensible per-row fallback get a fixed placeholder instead.
placeholder_value = {'place_type': "Unknown", 'checked_by': "None"}
for column_name, placeholder in placeholder_value.items():
    jmu_reddit_geoparsed_clean[column_name] = jmu_reddit_geoparsed_clean[column_name].fillna(placeholder)

print("Missing value filling completed!")

# Report how many nulls remain in each filled column.
report_labels = {
    'revised_place': 'Revised place',
    'revised_latitude': 'Revised latitude',
    'revised_longitude': 'Revised longitude',
    'place_type': 'Place type',
}
for column_name, label in report_labels.items():
    print(f"{label} null values: {jmu_reddit_geoparsed_clean[column_name].isnull().sum()}")

Filling missing values with fallback data...
Missing value filling completed!
Revised place null values: 0
Revised latitude null values: 0
Revised longitude null values: 0
Place type null values: 0


In [124]:
# Preview the first five rows after the dtype conversion and fallback-filling cells.
jmu_reddit_geoparsed_clean.head()

Unnamed: 0,school_name,type,date,score,year_month,sentences,place,latitude,longitude,revised_place,revised_latitude,revised_longitude,place_type,false_positive,checked_by,unique_id
0,JMU,comment,2014-07-18 19:50:58,1,2014-07-01,The only downside is that because it is locate...,Kaarela,60.25174,24.88111,33,38.414485,-78.834003,Road,False,,JMU1
1,JMU,comment,2013-09-13 12:46:11,5,2013-09-01,I think I remember actually seeing his house o...,Alytaus rajonas,54.3996,24.1252,33 near Blue Hole,38.513028,-79.049727,Road,False,"Bailey,Kendall",JMU2
2,JMU,post,2020-02-23 12:34:56,34,2020-02-01,"Before I exit on 66, she is right behind me ag...",66,53.55325,9.96323,81 in PA,53.55325,9.96323,Unknown,False,,JMU3
3,JMU,comment,2024-09-25 17:44:42,4,2024-09-01,Yeah it just works somehow A&A kebab near EMU ...,Murcia Bus Station,37.98591,-1.13947,A&A Kabob Grill,38.467851,-78.87271,Building,False,"Jackson,Angeline Del Rosario",JMU4
4,JMU,post,2022-12-30 16:56:44,74,2022-12-01,Spotted at the Albert Icehouse and Dance Hall ...,Texas,31.25044,-99.25061,Albert Icehouse and Dance Hall,30.194036,-98.600753,Building,False,"Siegmund,Hannah Rose",JMU5


In [125]:
# Check for places with inconsistent coordinates
print("Checking for places with multiple coordinate pairs...")

# Build the false-positive-free frame here as well. The cell that normally
# creates `jmu_reddit_clean_no_fp` sits further down the notebook, so without
# this line a fresh "Restart & Run All" fails with a NameError. The expression
# is the same one that later cell uses, so re-running either is harmless.
jmu_reddit_clean_no_fp = jmu_reddit_geoparsed_clean[jmu_reddit_geoparsed_clean['false_positive'] == False].copy()

# Group by revised_place and check for coordinate consistency
coordinate_check = jmu_reddit_clean_no_fp.groupby('revised_place').agg({
    'revised_latitude': ['nunique', 'min', 'max'],
    'revised_longitude': ['nunique', 'min', 'max'],
    'place_type': 'first'
}).round(6)

# Flatten the two-level column index produced by the multi-function agg
coordinate_check.columns = ['lat_unique_count', 'lat_min', 'lat_max', 'lon_unique_count', 'lon_min', 'lon_max', 'place_type']
coordinate_check = coordinate_check.reset_index()

# Find places with multiple coordinates (inconsistent mapping)
inconsistent_places = coordinate_check[
    (coordinate_check['lat_unique_count'] > 1) |
    (coordinate_check['lon_unique_count'] > 1)
].copy()

# Calculate coordinate ranges (in degrees) for inconsistent places
inconsistent_places['lat_range'] = inconsistent_places['lat_max'] - inconsistent_places['lat_min']
inconsistent_places['lon_range'] = inconsistent_places['lon_max'] - inconsistent_places['lon_min']

print(f"Total unique places: {len(coordinate_check)}")
print(f"Places with inconsistent coordinates: {len(inconsistent_places)}")

if len(inconsistent_places) > 0:
    print("\nPlaces with coordinate inconsistencies:")
    print(inconsistent_places.sort_values('lat_range', ascending=False))

    # Show specific examples with large coordinate differences
    major_issues = inconsistent_places[
        (inconsistent_places['lat_range'] > 1) |
        (inconsistent_places['lon_range'] > 1)
    ]

    if len(major_issues) > 0:
        print("\nMajor coordinate issues (>1 degree difference):")
        for _, row in major_issues.iterrows():
            print(f"- {row['revised_place']}: Lat range {row['lat_range']:.4f}, Lon range {row['lon_range']:.4f}")

            # Show all distinct coordinate pairs recorded for this place
            place_coords = jmu_reddit_clean_no_fp[
                jmu_reddit_clean_no_fp['revised_place'] == row['revised_place']
            ][['revised_place', 'revised_latitude', 'revised_longitude']].drop_duplicates()
            print("  Coordinate pairs:")
            for _, coord in place_coords.iterrows():
                print(f"    Lat: {coord['revised_latitude']:.6f}, Lon: {coord['revised_longitude']:.6f}")
            print()
else:
    print("No coordinate inconsistencies found!")

# Store the results for further analysis
inconsistent_coordinates_report = inconsistent_places

Checking for places with multiple coordinate pairs...
Total unique places: 410
Places with inconsistent coordinates: 14

Places with coordinate inconsistencies:
              revised_place  lat_unique_count    lat_min    lat_max  \
308               Spotswood                 2  40.391770  45.900110   
232                New York                 2  40.709166  43.000350   
193               Lynchburg                 3  35.283140  37.413750   
234             New Zealand                 2 -42.835787 -42.000000   
217             Miller Hall                 2  42.409260  42.979440   
291       Shenandoah Valley                 3  38.483460  38.597600   
304       South Main Street                 2  38.421648  38.428375   
277                   Salem                 2  37.293500  37.294959   
343               The Union                 3  38.437726  38.437957   
159         JMU Warren Hall                 2  38.437889  38.438084   
268                 Roanoke                 2  37.270909  

In [126]:

# Keep only the rows that are NOT flagged as false positives.
# `false_positive` is a plain bool column after the dtype-conversion cell, so the
# mask can simply be negated.
jmu_reddit_clean_no_fp = jmu_reddit_geoparsed_clean.loc[
    ~jmu_reddit_geoparsed_clean['false_positive']
].copy()

# Summarise how the flag is distributed in the unfiltered frame.
print("\nFalse positive distribution in original data:")
false_positive_counts = jmu_reddit_geoparsed_clean['false_positive'].value_counts()
print(false_positive_counts)



False positive distribution in original data:
false_positive
False    1676
True      110
Name: count, dtype: int64


In [127]:
# Summarise mentions per revised_place: one row per place with its first-seen
# attributes and the number of rows that mention it.
print("Counting occurrences by revised_place and keeping all other columns...")

# The sentence text is not needed for the per-place summary
df_no_sentences = jmu_reddit_clean_no_fp.drop(columns='sentences')

# For every place keep the first value of each descriptive column
first_value_columns = [
    'place',              # original geoparser match
    'latitude',           # original latitude
    'longitude',          # original longitude
    'revised_latitude',
    'revised_longitude',
    'place_type',
]
grouped_by_place = df_no_sentences.groupby('revised_place')
place_counts_with_data = grouped_by_place.agg(dict.fromkeys(first_value_columns, 'first'))

# Attach the group sizes; both objects are indexed by revised_place, so the
# assignment lines up by place name
place_counts_with_data['count'] = grouped_by_place.size()
place_counts_with_data = place_counts_with_data.reset_index()

# Most frequently mentioned places first
place_counts_with_data_sorted = place_counts_with_data.sort_values('count', ascending=False)

print(f"Total unique places: {len(place_counts_with_data_sorted)}")
print(f"Columns in result: {list(place_counts_with_data_sorted.columns)}")

# Display the results
print("\nPlace data with counts (top 10):")
print(place_counts_with_data_sorted.head(10))

# Show full dataset
place_counts_with_data_sorted

Counting occurrences by revised_place and keeping all other columns...
Total unique places: 410
Columns in result: ['revised_place', 'place', 'latitude', 'longitude', 'revised_latitude', 'revised_longitude', 'place_type', 'count']

Place data with counts (top 10):
                revised_place                          place  latitude  \
121              Harrisonburg                   Burg-Grambke  53.15039   
371                  Virginia                   Commonwealth  14.69690   
354  United States of America   Embassy of the United States -35.30653   
230                New Jersey            Bailiwick of Jersey  49.21667   
153                JMU D-Hall  D and L Plaza Shopping Center  42.89950   
152                       JMU       Arizona State University  33.41728   
93                    Florida                        Florida  28.75054   
38              Chandler Hall                         Basalt  39.36887   
306      Southview Apartments                          Hills  41.6457

Unnamed: 0,revised_place,place,latitude,longitude,revised_latitude,revised_longitude,place_type,count
121,Harrisonburg,Burg-Grambke,53.15039,8.71398,38.446873,-78.864233,City,295
371,Virginia,Commonwealth,14.69690,121.08006,37.548120,-77.446750,State,155
354,United States of America,Embassy of the United States,-35.30653,149.11623,37.090200,-95.712900,Country,82
230,New Jersey,Bailiwick of Jersey,49.21667,-2.11667,39.559308,-74.851703,State,43
153,JMU D-Hall,D and L Plaza Shopping Center,42.89950,-78.68447,38.437603,78.872241,Building,41
...,...,...,...,...,...,...,...,...
389,West Side of the Mountain Cemetery,West Side of the Mountain Cemetery,41.99926,-72.74509,41.999260,-72.745090,Unknown,1
392,Western,Western,46.86870,-123.94767,46.868700,-123.947670,Unknown,1
393,Westover,Westover,38.48207,-78.02055,38.482070,-78.020550,Unknown,1
8,Arcadia,Arcadia,40.78118,-78.85253,38.426101,-78.861880,Building,1


In [128]:
# Create an interactive bubble map of the per-place counts with Plotly Express
# (px.scatter_map).
import plotly.express as px
import plotly.graph_objects as go

print("Creating interactive scattermap...")

# NOTE(review): the aggregated table displayed earlier lists "JMU D-Hall" with a
# POSITIVE revised_longitude (78.872241) while every other JMU location is near
# -78.8; that point will be drawn far away from Virginia. Verify the source row.

# Hover tooltip content: show count and type as-is, coordinates with 4 decimals
hover_columns = {
    'count': True,
    'place_type': True,
    'revised_latitude': ':.4f',
    'revised_longitude': ':.4f'
}

# Bubble size encodes the mention count, colour encodes the place type
fig = px.scatter_map(
    place_counts_with_data_sorted,
    lat='revised_latitude',
    lon='revised_longitude',
    size='count',
    color='place_type',
    hover_name='revised_place',
    hover_data=hover_columns,
    size_max=50,  # Maximum bubble size
    zoom=6,       # Initial zoom level
    title='Reddit Posts by Location: Size = Count, Color = Type'
)

# Initial viewport (the original comment describes this centre as Virginia)
initial_view = dict(center=dict(lat=38.5, lon=-78.5), zoom=6)

# Update layout for better appearance
fig.update_layout(
    map_style="carto-positron",  # CARTO Positron basemap (original note: no token required)
    map=initial_view,
    margin={"r": 0, "t": 50, "l": 0, "b": 0},
    height=700,
    width=1000,
)

# Show the map
fig.show()

# TODO: print summary statistics (announced by the original cell but never implemented)

Creating interactive scattermap...


In [1]:
# Module 1: Import Libraries and Initialize
import os
import glob
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import re
from pathlib import Path

# Import libraries for geoparsing and sentiment analysis
# The imports are wrapped in try/except so that a missing package produces a
# readable message instead of a traceback. Note that on failure execution
# continues, so any name that was not imported stays undefined and later cells
# that use it will raise NameError.
try:
    from geoparser import Geoparser
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from scipy.special import softmax
    from tqdm import tqdm
    import nltk
    import warnings
    # Silences ALL Python warnings for the rest of the kernel session.
    warnings.filterwarnings('ignore')
    print("✅ All required libraries imported successfully!")
except ImportError as e:
    print(f"❌ Missing library: {e}")
    print("Please install required libraries first")

# Status messages; these are printed whether or not the imports above succeeded.
print("📋 Module 1: Library imports complete")
print("💡 Use the modular pipeline (Cells 13-20) for processing files")
print("💡 This legacy monolithic approach is kept for reference only")

✅ All required libraries imported successfully!
📋 Module 1: Library imports complete
💡 Use the modular pipeline (Cells 13-20) for processing files
💡 This legacy monolithic approach is kept for reference only


In [2]:
# Module 2: Data Ingestion and Cleaning Functions
import nltk

def download_nltk_requirements():
    """Make sure the NLTK sentence-tokenizer resources are installed.

    Checks for the ``punkt`` and ``punkt_tab`` tokenizer resources in turn and
    downloads whichever one ``nltk.data.find`` cannot locate. Progress is
    reported with one printed line per resource. Returns nothing.
    """
    for resource in ('punkt', 'punkt_tab'):
        try:
            nltk.data.find(f'tokenizers/{resource}')
        except LookupError:
            # Resource not found locally: fetch it.
            print(f"  📥 Downloading NLTK {resource} tokenizer...")
            nltk.download(resource)
        else:
            print(f"  ✅ NLTK {resource} tokenizer already available")

def remove_emojis(text):
    """Strip emoji characters from ``text`` (lesson 3 protocol).

    Args:
        text: Value to clean. Missing values (anything ``pd.isna`` reports as
            missing) and the empty string are returned untouched.

    Returns:
        ``str(text)`` with every character in the four Unicode ranges below
        removed, or the original value when there was nothing to clean.
        Characters outside those ranges are left in place.
    """
    has_no_content = pd.isna(text) or text == ''
    if has_no_content:
        return text

    # Character class of the four emoji ranges used in lesson 3: emoticons,
    # 1F300-1F5FF, 1F680-1F6FF and the regional-indicator (flag) letters.
    emoji_matcher = re.compile(
        r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]'
    )
    return emoji_matcher.sub('', str(text))

def split_into_sentences_nltk(text):
    """Split ``text`` into sentences with NLTK (lesson 4 protocol).

    Args:
        text: Text to split. Missing values and the empty string yield ``[]``.

    Returns:
        list[str]: Whitespace-stripped sentences that are longer than 10
        characters. If NLTK tokenization raises, a warning is printed and the
        text is split on runs of ``.``, ``!`` and ``?`` instead.
    """
    if pd.isna(text) or text == '':
        return []

    def keep_substantial(candidates):
        # Strip each candidate and drop the very short ones (<= 10 characters).
        stripped = (candidate.strip() for candidate in candidates)
        return [sentence for sentence in stripped if len(sentence) > 10]

    try:
        # Preferred path: NLTK's sentence tokenizer (from lesson 4)
        return keep_substantial(nltk.sent_tokenize(str(text)))
    except Exception as e:
        print(f"    ⚠️ NLTK tokenization error: {e}")
        # Fallback: basic punctuation-based splitting
        return keep_substantial(re.split(r'[.!?]+', str(text)))

def clean_datatypes(df, school_name):
    """Return a typed copy of ``df`` tagged with the school name (lesson 3 protocol).

    Args:
        df: Raw DataFrame as loaded from CSV. It is copied, not modified.
        school_name: Label stored in the new ``school_name`` column and used as
            the prefix of ``unique_id``.

    Returns:
        pandas.DataFrame: Copy of ``df`` with
        - ``school_name`` inserted as the first column (string dtype),
        - ``unique_id`` appended as ``"<school_name>_<index + 1>"`` (string dtype),
        - ``type`` as category, ``title``/``text`` as string, ``date`` as
          datetime, ``score`` as numeric and ``year_month`` parsed with
          ``%Y-%m`` (unparseable values become NaT), each only if present,
        - ``Unnamed: 4`` renamed to ``sentences`` and ``sentences`` cast to string,
        - the ``toponyms`` column removed if present.
    """
    print(f"  🧹 Cleaning data types for {school_name}...")

    typed = df.copy()

    # Identification columns: school label first, row-based id last.
    typed.insert(0, 'school_name', school_name)
    typed['school_name'] = typed['school_name'].astype(pd.StringDtype())

    row_numbers = (typed.index + 1).astype(str)
    typed['unique_id'] = typed['school_name'] + '_' + row_numbers
    typed['unique_id'] = typed['unique_id'].astype(pd.StringDtype())

    # Per-column converters (lesson 3 protocol); applied only to columns that exist.
    as_string = lambda column: column.astype(pd.StringDtype())
    converters = {
        'type': lambda column: column.astype('category'),
        'title': as_string,
        'text': as_string,
        'date': pd.to_datetime,
        'score': pd.to_numeric,
        'year_month': lambda column: pd.to_datetime(column, format='%Y-%m', errors='coerce'),
    }
    for column_name, convert in converters.items():
        if column_name in typed.columns:
            typed[column_name] = convert(typed[column_name])

    # The sentence text may arrive under the CSV's auto-generated header.
    if 'Unnamed: 4' in typed.columns:
        typed = typed.rename(columns={'Unnamed: 4': 'sentences'})
    if 'sentences' in typed.columns:
        typed['sentences'] = as_string(typed['sentences'])

    # Discard the toponyms column when the input has one.
    if 'toponyms' in typed.columns:
        typed = typed.drop(columns=['toponyms'])

    print(f"  ✅ Data types cleaned: {len(typed)} rows")
    return typed

def clean_text_data(df_clean):
    """Remove emojis from the text columns of ``df_clean`` (lesson 3 protocol).

    Applies ``remove_emojis`` element-wise to whichever of the ``text``,
    ``title`` and ``sentences`` columns exist.

    Args:
        df_clean: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    print(f"  🧼 Cleaning text data (removing emojis)...")

    # Only the candidate text columns that are actually present get cleaned.
    present_text_columns = [
        name for name in ('text', 'title', 'sentences') if name in df_clean.columns
    ]
    for name in present_text_columns:
        print(f"    🔧 Cleaning {name} column...")
        df_clean[name] = df_clean[name].apply(remove_emojis)

    print(f"  ✅ Text data cleaned")
    return df_clean

def process_ingest_phase(csv_path):
    """Load one school's CSV and turn it into sentence-level records.

    Steps: ensure the NLTK resources exist, read the CSV, clean dtypes
    (``clean_datatypes``), strip emojis (``clean_text_data``), tokenize the
    main text column into sentences and explode to one row per sentence.

    Args:
        csv_path: Path of the CSV file. The upper-cased file stem is used as
            the school name.

    Returns:
        tuple: ``(df_sentences, school_name)`` on success;
        ``(None, school_name)`` when there is no text column or no sentence
        survives tokenization; ``(None, None)`` when any step raises (the error
        is printed, not re-raised).
    """
    print(f"📁 INGESTION: Processing {csv_path}")

    try:
        # Download NLTK requirements first
        download_nltk_requirements()

        # Load the CSV file
        df = pd.read_csv(csv_path)
        print(f"  📊 Loaded {len(df)} rows")

        # Extract school name from file path
        school_name = Path(csv_path).stem.upper()

        # Step 1: Clean data types (lesson 3 protocol)
        df_clean = clean_datatypes(df, school_name)

        # Step 2: Clean text data - remove emojis (lesson 3 protocol)
        df_clean = clean_text_data(df_clean)

        # Step 3: Split posts into sentences using NLTK (lesson 4 protocol)
        print(f"  ✂️ Splitting posts into sentences with NLTK...")

        # Determine which column contains the main text content
        if 'text' in df_clean.columns:
            text_column = 'text'
        elif 'sentences' in df_clean.columns:
            text_column = 'sentences'
        else:
            print(f"  ⚠️ No text column found")
            return None, school_name

        # Apply NLTK sentence tokenization (lesson 4 method)
        print(f"  🔍 Tokenizing {text_column} column...")
        df_with_sentence_lists = df_clean.assign(
            sentences=df_clean[text_column].apply(split_into_sentences_nltk)
        )

        # Explode sentence lists into individual rows (lesson 4 method)
        df_sentences = df_with_sentence_lists.explode('sentences').reset_index(drop=True)

        # Filter out empty sentences (rows whose text produced no sentence
        # explode to a missing value)
        df_sentences = df_sentences[df_sentences['sentences'].notna()]
        # .copy() so the column assignment below writes to an independent frame
        # rather than to a filtered view of the exploded one
        df_sentences = df_sentences[df_sentences['sentences'].str.len() > 0].copy()

        # Drop original text columns to save memory (lesson 4 protocol).
        # Only columns that exist are dropped: 'title' is optional (see
        # clean_datatypes), and dropping a missing column raises KeyError, which
        # previously aborted the whole ingestion for CSVs without a title.
        if text_column == 'text':
            columns_to_drop = [col for col in ('text', 'title') if col in df_sentences.columns]
            if columns_to_drop:
                df_sentences = df_sentences.drop(columns=columns_to_drop)

        # Update unique IDs for sentence-level data.
        # NOTE(review): numbering starts at 0 here, whereas clean_datatypes
        # numbers rows from 1 - kept as-is so existing IDs do not change.
        df_sentences['unique_id'] = [f"{school_name}_{i}" for i in range(len(df_sentences))]

        if len(df_sentences) == 0:
            print(f"  ⚠️ No valid sentences found after tokenization")
            return None, school_name

        print(f"  📝 Created {len(df_sentences)} sentence records")

        return df_sentences, school_name

    except Exception as e:
        print(f"  ❌ Ingestion error: {e}")
        return None, None

In [3]:
# Module 3: Geoparsing Functions
def initialize_geoparser():
    """Build the ``Geoparser`` instance used by the geoparsing phase.

    Returns:
        The constructed ``Geoparser``, or ``None`` if construction raised (the
        error is printed).
    """
    # Configurable geoparser settings
    geoparser_settings = {
        # Options: en_core_web_sm, en_core_web_md, en_core_web_lg, en_core_web_trf
        'spacy_model': 'en_core_web_trf',
        # Geographic transformer model
        'transformer_model': 'dguzh/geo-all-distilroberta-v1',
        # Options: geonames, osm
        'gazetteer': 'geonames',
    }

    try:
        print("  🌍 Initializing geoparser...")
        geoparser_instance = Geoparser(**geoparser_settings)
    except Exception as e:
        print(f"  ❌ Error initializing geoparser: {e}")
        return None

    print("  ✅ Geoparser initialized successfully!")
    return geoparser_instance

def geoparse_sentences(df_sentences, geo):
    """Geoparse every sentence and return one row per resolved location.

    Args:
        df_sentences: DataFrame with a ``sentences`` column.
        geo: Geoparser object; its ``parse`` method is called with the list of
            sentences.

    Returns:
        pandas.DataFrame | None: ``df_sentences`` restricted to sentences with
        at least one resolved toponym and exploded to one row per location,
        with ``place``, ``latitude``, ``longitude`` and ``feature_name`` columns
        added. ``None`` when there is no input, no geoparser, no location was
        found, or an exception occurred (the error is printed).
    """
    if not geo or df_sentences is None or len(df_sentences) == 0:
        print("  ⚠️ No data or geoparser not initialized")
        return None

    print(f"  🗺️ Geoparsing {len(df_sentences)} sentences...")

    # Output column -> key read from each toponym's location mapping
    location_key_for = {
        'place': 'name',
        'latitude': 'latitude',
        'longitude': 'longitude',
        'feature_name': 'feature_name',
    }

    try:
        # Parse all sentences in one call
        docs = geo.parse(df_sentences['sentences'].tolist())

        # One list per output column; each entry is the per-sentence list of values
        collected = {column: [] for column in location_key_for}
        for doc in tqdm(docs, desc="  Extracting locations"):
            # Toponyms without a resolved location are skipped
            resolved = [toponym.location for toponym in doc.toponyms if toponym.location]
            for column, key in location_key_for.items():
                collected[column].append([location.get(key) for location in resolved])

        # Add location data to a copy of the input
        df_with_locations = df_sentences.copy()
        for column, per_sentence_values in collected.items():
            df_with_locations[column] = per_sentence_values

        # Keep only sentences that produced at least one location
        has_location = df_with_locations['place'].apply(lambda found: len(found) > 0)
        df_geo = df_with_locations[has_location].copy()
        print(f"  📍 Found locations in {len(df_geo)}/{len(df_sentences)} sentences")

        if len(df_geo) == 0:
            print("  ⚠️ No locations found")
            return None

        # Explode the parallel lists so each location gets its own row
        df_geo_long = df_geo.explode(list(location_key_for)).reset_index(drop=True)
        print(f"  🎯 Created {len(df_geo_long)} location records")

        return df_geo_long

    except Exception as e:
        print(f"  ❌ Geoparsing error: {e}")
        return None

def process_geoparsing_phase(df_sentences):
    """Run the geoparsing phase on sentence-level data.

    Args:
        df_sentences: DataFrame of sentences (see ``geoparse_sentences``), or
            ``None``.

    Returns:
        pandas.DataFrame | None: Result of ``geoparse_sentences``; ``None`` when
        there is nothing to parse or the geoparser could not be initialized.
    """
    print(f"🗺️  GEOPARSING: Processing {len(df_sentences) if df_sentences is not None else 0} sentences")

    # Nothing to parse: return before loading the geoparser models.
    # geoparse_sentences would return None for this input anyway, and
    # process_sentiment_phase short-circuits on missing input the same way.
    if df_sentences is None or len(df_sentences) == 0:
        print("  ⚠️ No sentences to geoparse; skipping geoparser initialization")
        return None

    # Initialize geoparser
    geo = initialize_geoparser()
    if not geo:
        return None

    # Apply geoparsing
    return geoparse_sentences(df_sentences, geo)

In [4]:
# Module 4: RoBERTa Sentiment Analysis Functions
def initialize_roberta_sentiment():
    """Load the tokenizer and model used for RoBERTa sentiment scoring.

    Returns:
        tuple: ``(tokenizer, model)`` loaded with ``from_pretrained``, or
        ``(None, None)`` if loading raised (the error is printed).
    """
    # Configurable model setting - can be changed to other sentiment models
    checkpoint_name = "cardiffnlp/twitter-roberta-base-sentiment"

    try:
        print("  🤖 Initializing RoBERTa sentiment model...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)
    except Exception as e:
        print(f"  ❌ Error initializing RoBERTa model: {e}")
        return None, None

    print("  ✅ RoBERTa model initialized successfully!")
    return loaded_tokenizer, loaded_model

def roberta_sentiment_analysis(text, tokenizer, model):
    """Score one text with the RoBERTa sentiment model.

    Args:
        text: Text to score; it is truncated to at most 512 tokens.
        tokenizer: Object providing ``encode_plus``.
        model: Callable accepting the encoded inputs as keyword arguments; the
            scores are read from ``output[0][0]``.

    Returns:
        dict: ``roberta_neg``, ``roberta_neu``, ``roberta_pos`` (softmax
        probabilities of indices 0, 1, 2) and ``roberta_compound``, computed as
        ``(pos - neg) * (1 - neu)``. All four values are ``None`` if anything
        raised (a warning with the first 50 characters of the text is printed).
    """
    try:
        # Tokenize and truncate to max length
        model_inputs = tokenizer.encode_plus(
            text,
            max_length=512,
            truncation=True,
            return_tensors='pt'
        )

        # Run the model and turn the raw scores of the first item into probabilities
        raw_scores = model(**model_inputs)[0][0].detach().numpy()
        probabilities = softmax(raw_scores)

        negative, neutral, positive = probabilities[0], probabilities[1], probabilities[2]
        return {
            'roberta_neg': float(negative),
            'roberta_neu': float(neutral),
            'roberta_pos': float(positive),
            'roberta_compound': float((positive - negative) * (1 - neutral))  # Compound score
        }
    except Exception as e:
        print(f"    ⚠️ Sentiment analysis error for text: {str(text)[:50]}... Error: {e}")
        return dict.fromkeys(
            ('roberta_neg', 'roberta_neu', 'roberta_pos', 'roberta_compound')
        )

def process_sentiment_analysis(df_geo, batch_size=50):
    """Append RoBERTa sentiment scores to a geoparsed DataFrame.

    Args:
        df_geo: DataFrame with a ``sentences`` column.
        batch_size: Number of rows handled per progress-bar step; sentences are
            still scored one at a time.

    Returns:
        pandas.DataFrame | None: ``df_geo`` (index reset) with the four
        ``roberta_*`` columns appended; rows with a missing sentence get
        ``None`` scores. ``None`` when there is no input, the model could not be
        initialized, or an exception occurred (the error is printed).
    """
    if df_geo is None or len(df_geo) == 0:
        print("  ⚠️ No geoparsed data to analyze")
        return None

    print(f"  😊 Running RoBERTa sentiment analysis on {len(df_geo)} records...")

    # Initialize RoBERTa model
    tokenizer, model = initialize_roberta_sentiment()
    if not tokenizer or not model:
        return None

    score_columns = ('roberta_neg', 'roberta_neu', 'roberta_pos', 'roberta_compound')

    try:
        total_rows = len(df_geo)
        roberta_results = []

        # Walk the frame in consecutive slices of `batch_size` rows
        for start in tqdm(range(0, total_rows, batch_size), desc="  Processing sentiment batches"):
            stop = min(start + batch_size, total_rows)
            for sentence in df_geo['sentences'].iloc[start:stop]:
                if pd.notna(sentence):
                    roberta_results.append(roberta_sentiment_analysis(sentence, tokenizer, model))
                else:
                    # Missing sentence: placeholder row of empty scores
                    roberta_results.append(dict.fromkeys(score_columns))

        # Convert results to dataframe and attach them side by side (positional match)
        sentiment_df = pd.DataFrame(roberta_results)
        df_final = pd.concat(
            [df_geo.reset_index(drop=True), sentiment_df.reset_index(drop=True)],
            axis=1,
        )

        print(f"  ✅ Sentiment analysis complete: {len(df_final)} records processed")
        return df_final

    except Exception as e:
        print(f"  ❌ Sentiment analysis error: {e}")
        return None

def process_sentiment_phase(df_geo, batch_size=50):
    """Run the sentiment phase on geoparsed data.

    Args:
        df_geo: Geoparsed DataFrame, or ``None``.
        batch_size: Forwarded to ``process_sentiment_analysis``.

    Returns:
        Whatever ``process_sentiment_analysis`` returns, or ``None`` when
        ``df_geo`` is ``None``.
    """
    record_count = 0 if df_geo is None else len(df_geo)
    print(f"😊 SENTIMENT: Analyzing {record_count} records")

    # Guard clause: nothing to analyze without geoparsed input
    return None if df_geo is None else process_sentiment_analysis(df_geo, batch_size)

In [5]:
# Module 5: Final Processing and Export Functions
def final_cleanup_datatypes(df_final):
    """Shape the sentiment-scored frame into the export layout.

    Works on a copy of ``df_final`` and, in order:
      1. drops the intermediate columns that are not exported,
      2. adds the empty manual-verification columns when they are missing,
      3. moves the known columns into the standard order (unknown columns
         keep their relative order at the end),
      4. converts each known column to its export dtype.

    Args:
        df_final: DataFrame produced by the sentiment phase, or None.

    Returns:
        The cleaned copy; None when the input is None or has no rows. If any
        step raises, the error is printed and the unmodified ``df_final`` is
        returned instead.
    """
    if df_final is None or len(df_final) == 0:
        return None

    print(f"  🧼 Final cleanup and data type conversion...")

    try:
        cleaned = df_final.copy()

        # 1. Intermediate columns that are not part of the export
        #    (date and year_month are deliberately kept).
        unwanted = ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
        present_unwanted = [name for name in unwanted if name in cleaned.columns]
        if present_unwanted:
            print(f"    🗑️ Dropping columns: {present_unwanted}")
            cleaned = cleaned.drop(columns=present_unwanted)

        # 2. Blank columns that reviewers fill in by hand later on.
        placeholders = {
            'revised_place': pd.Series(dtype=pd.StringDtype()),
            'revised_latitude': pd.Series(dtype='float64'),
            'revised_longitude': pd.Series(dtype='float64'),
            'place_type': pd.Series(dtype=pd.StringDtype()),
            # Nullable boolean so "not reviewed yet" stays pd.NA rather than False.
            'false_positive': pd.Series([pd.NA] * len(cleaned), dtype='boolean'),
            'checked_by': pd.Series(dtype=pd.StringDtype()),
        }
        for name, blank in placeholders.items():
            if name in cleaned.columns:
                continue
            cleaned[name] = blank
            print(f"    ➕ Added column: {name}")

        # 3. Standard layout: identifiers, dates, text + score, geoparsed
        #    location, then the verification columns.
        print(f"    📋 Reordering columns...")
        preferred_order = [
            'school_name', 'unique_id',
            'date', 'year_month',
            'sentences', 'roberta_compound',
            'place', 'latitude', 'longitude',
            'revised_place', 'revised_latitude', 'revised_longitude',
            'place_type', 'false_positive', 'checked_by',
        ]
        leading = [name for name in preferred_order if name in cleaned.columns]
        trailing = [name for name in cleaned.columns if name not in leading]
        cleaned = cleaned[leading + trailing]

        # 4. Dtype conversions, applied group by group in this order:
        #    strings, floats, nullable boolean, datetimes.
        conversions = (
            (['school_name', 'unique_id', 'sentences', 'place', 'revised_place', 'place_type', 'checked_by'],
             lambda column: column.astype(pd.StringDtype())),
            (['latitude', 'longitude', 'roberta_compound', 'revised_latitude', 'revised_longitude'],
             lambda column: pd.to_numeric(column, errors='coerce')),
            (['false_positive'],
             lambda column: column.astype('boolean')),
            (['date', 'year_month'],
             lambda column: pd.to_datetime(column, errors='coerce')),
        )
        for names, convert in conversions:
            for name in names:
                if name in cleaned.columns:
                    cleaned[name] = convert(cleaned[name])

        print(f"  ✅ Data types finalized: {len(cleaned)} records")
        print(f"  📊 Final columns: {list(cleaned.columns)}")
        return cleaned

    except Exception as err:
        # Best effort: report the problem and hand back the untouched input.
        print(f"  ❌ Final cleanup error: {err}")
        return df_final

def save_processed_data(df_final, csv_path, school_name):
    """Save the processed dataframe as both pickle and CSV files.

    Both outputs are written next to the source file. Their names are built
    from ``csv_path`` by swapping only its final extension for
    ``_processed.pickle`` / ``_processed.csv``. Directory names that happen to
    contain ``.csv`` are left alone, and a source path without a lowercase
    ``.csv`` suffix (for example ``X.CSV``) can no longer resolve to the
    source file itself and be overwritten.

    Args:
        df_final: DataFrame to write; None or an empty frame is rejected.
        csv_path: Path of the source CSV (str or path-like) that the output
            names are derived from.
        school_name: Label used only in the "no data" warning.

    Returns:
        True when both files were written, False when there was nothing to
        save or when writing failed (the error is printed, not raised).
    """
    # Local import so this cell does not rely on another cell importing os.
    import os

    if df_final is None or len(df_final) == 0:
        print(f"  ⚠️ No data to save for {school_name}")
        return False

    try:
        # Split off just the trailing extension; str.replace would rewrite
        # every '.csv' occurrence anywhere in the path.
        base_path, _extension = os.path.splitext(os.fspath(csv_path))
        pickle_path = f"{base_path}_processed.pickle"
        csv_output_path = f"{base_path}_processed.csv"

        # Save as pickle (preserves all data types including pd.NA)
        df_final.to_pickle(pickle_path)
        print(f"  💾 Saved pickle: {pickle_path}")

        # Save as CSV (for easy viewing/sharing)
        df_final.to_csv(csv_output_path, index=False)
        print(f"  💾 Saved CSV: {csv_output_path}")

        print(f"  📊 Final dataset: {len(df_final)} records with {len(df_final.columns)} columns")

        # Column summary, capped at ten names; labels are stringified so that
        # non-string column labels cannot break the join after a good save.
        shown_columns = ', '.join(map(str, df_final.columns[:10]))
        print(f"  📋 Columns: {shown_columns}{'...' if len(df_final.columns) > 10 else ''}")

        return True

    except Exception as e:
        print(f"  ❌ Save error: {e}")
        return False

def process_final_phase(df_sentiment, csv_path, school_name):
    """Run the last pipeline stage: dtype cleanup followed by export.

    Args:
        df_sentiment: DataFrame returned by the sentiment phase, or None.
        csv_path: Source CSV path; passed on to ``save_processed_data``.
        school_name: Label used in the progress messages.

    Returns:
        The boolean returned by ``save_processed_data``; False when there is
        no sentiment frame to work on.
    """
    print(f"🧼 FINAL: Cleaning and exporting {school_name}")

    if df_sentiment is not None:
        # Clean first, then write whatever the cleanup step hands back.
        return save_processed_data(final_cleanup_datatypes(df_sentiment), csv_path, school_name)

    print(f"  ⚠️ No sentiment data to process")
    return False

In [6]:
# Module 6: Batch Processing Orchestrator
def process_single_file_modular(csv_path):
    """Run one CSV through ingestion, geoparsing, sentiment and export.

    Each phase signals failure by returning None; the first failing phase is
    reported and the run stops there. Any exception raised along the way is
    caught, printed as a critical error and turned into a False result.

    Args:
        csv_path: Path of the CSV file to process.

    Returns:
        True when the final phase reports success, otherwise False.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"🎯 PROCESSING: {csv_path}")
    print(f"{rule}")

    try:
        # Phase 1: ingestion also yields the school label used further down.
        frame, school_name = process_ingest_phase(csv_path)
        if frame is None:
            print(f"❌ Failed at ingestion phase")
            return False

        # Phases 2 and 3 share one contract: frame in, frame (or None) out.
        later_stages = (
            ("geoparsing", lambda data: process_geoparsing_phase(data)),
            ("sentiment analysis", lambda data: process_sentiment_phase(data, batch_size=50)),
        )
        for label, run_stage in later_stages:
            frame = run_stage(frame)
            if frame is None:
                print(f"❌ Failed at {label} phase")
                return False

        # Phase 4: cleanup + export; its boolean is the overall result.
        success = process_final_phase(frame, csv_path, school_name)
        verdict = "✅ COMPLETED" if success else "❌ FAILED"
        print(f"{verdict}: {school_name}")
        return success

    except Exception as err:
        print(f"❌ CRITICAL ERROR: {err}")
        return False

def batch_process_all_files():
    """Discover every packet CSV and push the unprocessed ones through the pipeline.

    Files are looked up with the glob ``group_data_packets/**/python/*.csv``
    relative to the current working directory. Any file whose name contains
    ``_processed`` is treated as pipeline output and skipped. Progress and a
    final success/failure summary are printed; nothing is returned.
    """
    print("🚀 STARTING MODULAR BATCH PROCESSING")
    print("=" * 60)

    discovered = glob.glob("group_data_packets/**/python/*.csv", recursive=True)
    if not discovered:
        print("❌ No CSV files found in group_data_packets directory")
        return

    # Split the discovered files into pipeline outputs and real inputs.
    pending, already_done = [], []
    for candidate in discovered:
        bucket = already_done if '_processed' in Path(candidate).name else pending
        bucket.append(candidate)

    if not pending:
        print("✅ All CSV files have already been processed!")
        print(f"Found {len(discovered)} total CSV files, all contain '_processed' in name")
        return

    if already_done:
        print(f"⏭️ Skipping {len(already_done)} already processed files:")
        for done_path in already_done:
            print(f"  - {Path(done_path).name}")
        print()

    total = len(pending)
    print(f"📋 Found {total} CSV files to process:")
    for position, path in enumerate(pending, 1):
        print(f"  {position}. {path}")

    # Run the pipeline file by file, sorting paths by outcome.
    outcomes = {True: [], False: []}
    for position, csv_path in enumerate(pending, 1):
        print(f"\n🏃‍♂️ Processing file {position}/{total}")
        outcomes[bool(process_single_file_modular(csv_path))].append(csv_path)
    succeeded, failed = outcomes[True], outcomes[False]

    # Final summary
    print(f"\n🎉 BATCH PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"✅ Successfully processed: {len(succeeded)}/{total} files")
    if failed:
        print(f"❌ Failed files:")
        for path in failed:
            print(f"  - {path}")

    if succeeded:
        print(f"✅ Successful files:")
        for path in succeeded:
            print(f"  - {Path(path).stem}_processed.pickle")

# Settings banner: prints a human-readable summary of the pipeline settings.
# Every line below is a plain string literal; nothing printed here is read by
# the pipeline. To change a setting, edit the module cell that uses it (for
# example the batch_size=50 argument in process_single_file_modular) and then
# update this text to match.
# NOTE(review): the run log reports "Splitting posts into sentences with NLTK",
# while this banner describes a regex splitter — confirm which one is current.
print("🔧 CONFIGURATION SETTINGS:")
print("="*40)
print("📝 Sentence splitting: Regex on [.!?]+ with min 10 chars")
print("🗺️  Geoparser model: en_core_web_trf")
print("🤖 Sentiment model: cardiffnlp/twitter-roberta-base-sentiment")
print("📦 Batch size: 50 sentences per sentiment batch")
# Not an f-string: the braces are printed literally as a naming template.
print("💾 Output format: {school_name}_processed.pickle")
print("\n💡 To modify settings, edit the respective module cells above!")

🔧 CONFIGURATION SETTINGS:
📝 Sentence splitting: Regex on [.!?]+ with min 10 chars
🗺️  Geoparser model: en_core_web_trf
🤖 Sentiment model: cardiffnlp/twitter-roberta-base-sentiment
📦 Batch size: 50 sentences per sentiment batch
💾 Output format: {school_name}_processed.pickle

💡 To modify settings, edit the respective module cells above!


In [136]:
# Module 7: Run the Batch Processing
# Execute this cell to start processing all CSV files
# Every CSV under group_data_packets/**/python/ whose name does not contain
# '_processed' is run through the full pipeline; results are written next to
# each source file as <name>_processed.pickle and <name>_processed.csv.
batch_process_all_files()

🚀 STARTING MODULAR BATCH PROCESSING
⏭️ Skipping 1 already processed files:
  - GMU_processed.csv

📋 Found 5 CSV files to process:
  1. group_data_packets\group_1\python\GMU.csv
  2. group_data_packets\group_2\python\ODU.csv
  3. group_data_packets\group_3\python\UVA.csv
  4. group_data_packets\group_4\python\VCU.csv
  5. group_data_packets\group_5\python\VirginiaTech.csv

🏃‍♂️ Processing file 1/5

🎯 PROCESSING: group_data_packets\group_1\python\GMU.csv
📁 INGESTION: Processing group_data_packets\group_1\python\GMU.csv
  ✅ NLTK punkt tokenizer already available
  ✅ NLTK punkt_tab tokenizer already available
  📊 Loaded 5149 rows
  🧹 Cleaning data types for GMU...
  ✅ Data types cleaned: 5149 rows
  🧼 Cleaning text data (removing emojis)...
    🔧 Cleaning text column...
    🔧 Cleaning title column...
  ✅ Text data cleaned
  ✂️ Splitting posts into sentences with NLTK...
  🔍 Tokenizing text column...
  📝 Created 9582 sentence records
🗺️  GEOPARSING: Processing 9582 sentences
  🌍 Initializin

Batches:   0%|          | 0/9582 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/587 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 9582/9582 [00:01<00:00, 7454.97it/s]



  📍 Found locations in 328/9582 sentences
  🎯 Created 424 location records
😊 SENTIMENT: Analyzing 424 records
  😊 Running RoBERTa sentiment analysis on 424 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 9/9 [00:30<00:00,  3.42s/it]



  ✅ Sentiment analysis complete: 424 records processed
🧼 FINAL: Cleaning and exporting GMU
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 424 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_1\python\GMU_processed.pickle
  💾 Saved CSV: group_data_packets\group_1\python\GMU_processed.csv
  📊 Final dataset: 424 records with 14 columns
  📋 Columns: school_name, unique_id, date, sentences, roberta_compound, p

Batches:   0%|          | 0/5831 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 5831/5831 [00:01<00:00, 5754.93it/s]



  📍 Found locations in 265/5831 sentences
  🎯 Created 326 location records
😊 SENTIMENT: Analyzing 326 records
  😊 Running RoBERTa sentiment analysis on 326 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 7/7 [00:17<00:00,  2.55s/it]



  ✅ Sentiment analysis complete: 326 records processed
🧼 FINAL: Cleaning and exporting ODU
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 326 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_2\python\ODU_processed.pickle
  💾 Saved CSV: group_data_packets\group_2\python\ODU_processed.csv
  📊 Final dataset: 326 records with 14 columns
  📋 Columns: school_name, unique_id, date, sentences, roberta_compound, p

Batches:   0%|          | 0/12426 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/831 [00:00<?, ?it/s]

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 12426/12426 [00:02<00:00, 5247.09it/s]



  📍 Found locations in 703/12426 sentences
  🎯 Created 872 location records
😊 SENTIMENT: Analyzing 872 records
  😊 Running RoBERTa sentiment analysis on 872 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 18/18 [00:44<00:00,  2.47s/it]



  ✅ Sentiment analysis complete: 872 records processed
🧼 FINAL: Cleaning and exporting UVA
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 872 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_3\python\UVA_processed.pickle
  💾 Saved CSV: group_data_packets\group_3\python\UVA_processed.csv
  📊 Final dataset: 872 records with 14 columns
  📋 Columns: school_name, unique_id, date, sentences, roberta_compound, p

Batches:   0%|          | 0/10209 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/532 [00:00<?, ?it/s]

Batches:   0%|          | 0/83 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 10209/10209 [00:01<00:00, 6215.75it/s]



  📍 Found locations in 476/10209 sentences
  🎯 Created 577 location records
😊 SENTIMENT: Analyzing 577 records
  😊 Running RoBERTa sentiment analysis on 577 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 12/12 [00:38<00:00,  3.17s/it]



  ✅ Sentiment analysis complete: 577 records processed
🧼 FINAL: Cleaning and exporting VCU
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 577 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_4\python\VCU_processed.pickle
  💾 Saved CSV: group_data_packets\group_4\python\VCU_processed.csv
  📊 Final dataset: 577 records with 14 columns
  📋 Columns: school_name, unique_id, date, sentences, roberta_compound, p

Batches:   0%|          | 0/12846 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/1814 [00:00<?, ?it/s]

Batches:   0%|          | 0/144 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 12846/12846 [00:02<00:00, 4986.04it/s]
  Extracting locations: 100%|██████████| 12846/12846 [00:02<00:00, 4986.04it/s]


  📍 Found locations in 778/12846 sentences
  🎯 Created 963 location records
😊 SENTIMENT: Analyzing 963 records
  😊 Running RoBERTa sentiment analysis on 963 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 20/20 [00:48<00:00,  2.43s/it]

  ✅ Sentiment analysis complete: 963 records processed
🧼 FINAL: Cleaning and exporting VIRGINIATECH
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 963 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_5\python\VirginiaTech_processed.pickle
  💾 Saved CSV: group_data_packets\group_5\python\VirginiaTech_processed.csv
  📊 Final dataset: 963 records with 14 columns
  📋 Columns: school_name, unique_id, date, sen




In [7]:
# Module 7a: Run Single School Processing
def batch_process_single_school(school_name):
    """Run the pipeline only for the CSV file(s) belonging to one school.

    Candidate files come from the glob ``group_data_packets/**/python/*.csv``
    relative to the current working directory. A file is selected when its
    stem equals ``school_name`` ignoring case (e.g. 'GMU', 'ODU').

    Args:
        school_name: School label to match against the CSV file stems.

    Returns:
        None. Progress and a success/failure summary are printed; when no
        file matches, the stems of all discovered files are listed instead.
    """
    wanted = school_name.upper()
    print(f"🚀 STARTING PROCESSING FOR {wanted}")
    print("=" * 60)

    candidates = glob.glob("group_data_packets/**/python/*.csv", recursive=True)
    matches = [path for path in candidates if Path(path).stem.upper() == wanted]

    if not matches:
        # Help the caller spot a typo by listing what is actually on disk.
        print(f"❌ No CSV files found for {school_name}")
        print(f"Available files:")
        for path in candidates:
            print(f"  - {Path(path).stem}")
        return

    total = len(matches)
    print(f"📋 Found {total} file(s) for {school_name}:")
    for position, path in enumerate(matches, 1):
        print(f"  {position}. {path}")

    # Run the pipeline file by file, sorting paths by outcome.
    outcomes = {True: [], False: []}
    for position, csv_path in enumerate(matches, 1):
        print(f"\n🏃‍♂️ Processing {school_name} file {position}/{total}")
        outcomes[bool(process_single_file_modular(csv_path))].append(csv_path)
    succeeded, failed = outcomes[True], outcomes[False]

    # Final summary
    print(f"\n🎉 {wanted} PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"✅ Successfully processed: {len(succeeded)}/{total} files")
    if failed:
        print(f"❌ Failed files:")
        for path in failed:
            print(f"  - {path}")

    if succeeded:
        print(f"✅ Successful files:")
        for path in succeeded:
            print(f"  - {Path(path).stem}_processed.pickle")

# Run processing for UNC only
# The school name is compared with each CSV file stem ignoring case, so this
# selects any discovered file named UNC.csv.
batch_process_single_school("UNC")

🚀 STARTING PROCESSING FOR UNC
📋 Found 1 file(s) for UNC:
  1. group_data_packets\group_6\python\UNC.csv

🏃‍♂️ Processing UNC file 1/1

🎯 PROCESSING: group_data_packets\group_6\python\UNC.csv
📁 INGESTION: Processing group_data_packets\group_6\python\UNC.csv
  ✅ NLTK punkt tokenizer already available
  ✅ NLTK punkt_tab tokenizer already available
  📊 Loaded 4929 rows
  🧹 Cleaning data types for UNC...
  ✅ Data types cleaned: 4929 rows
  🧼 Cleaning text data (removing emojis)...
    🔧 Cleaning text column...
    🔧 Cleaning title column...
  ✅ Text data cleaned
  ✂️ Splitting posts into sentences with NLTK...
  🔍 Tokenizing text column...
  📝 Created 12441 sentence records
🗺️  GEOPARSING: Processing 12441 sentences
  🌍 Initializing geoparser...
  ✅ Geoparser initialized successfully!
  🗺️ Geoparsing 12441 sentences...
Toponym Recognition...


Batches:   0%|          | 0/12441 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/559 [00:00<?, ?it/s]

Batches:   0%|          | 0/88 [00:00<?, ?it/s]

  Extracting locations: 100%|██████████| 12441/12441 [00:01<00:00, 6465.52it/s]


  📍 Found locations in 534/12441 sentences
  🎯 Created 623 location records
😊 SENTIMENT: Analyzing 623 records
  😊 Running RoBERTa sentiment analysis on 623 records...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!


  Processing sentiment batches: 100%|██████████| 13/13 [00:31<00:00,  2.42s/it]

  ✅ Sentiment analysis complete: 623 records processed
🧼 FINAL: Cleaning and exporting UNC
  🧼 Final cleanup and data type conversion...
    🗑️ Dropping columns: ['type', 'score', 'feature_name', 'roberta_neg', 'roberta_neu', 'roberta_pos']
    ➕ Added column: revised_place
    ➕ Added column: revised_latitude
    ➕ Added column: revised_longitude
    ➕ Added column: place_type
    ➕ Added column: false_positive
    ➕ Added column: checked_by
    📋 Reordering columns...
  ✅ Data types finalized: 623 records
  📊 Final columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
  💾 Saved pickle: group_data_packets\group_6\python\UNC_processed.pickle
  💾 Saved CSV: group_data_packets\group_6\python\UNC_processed.csv
  📊 Final dataset: 623 records with 14 columns
  📋 Columns: school_name, unique_id, date, sentences, roberta_compound, p




In [9]:
# Reload the UNC output written by the pipeline to spot-check its columns.
# Only unpickle files this notebook produced itself: loading a pickle from an
# untrusted source can execute arbitrary code.
unc_pickle=pd.read_pickle("group_data_packets/group_6/python/UNC_processed.pickle")
unc_pickle.head()

Unnamed: 0,school_name,unique_id,date,sentences,roberta_compound,place,latitude,longitude,revised_place,revised_latitude,revised_longitude,place_type,false_positive,checked_by
0,UNC,UNC_25,2025-03-01 02:32:32,Why would any ambitious scientist stay in the ...,-0.875106,Embassy of the United States,-35.30653,149.11623,,,,,,
1,UNC,UNC_34,2025-03-04 13:03:26,The whole USA will suffer the loss.,-0.806157,United States,39.76,-98.5,,,,,,
2,UNC,UNC_35,2025-03-06 17:09:49,"America's enemies are laughing, as was the plan.",-0.434685,United States,39.76,-98.5,,,,,,
3,UNC,UNC_37,2025-03-06 17:09:49,"Whether it were Cruz, Desantis or whomever tha...",-0.004488,Oval,51.48235,-0.11321,,,,,,
4,UNC,UNC_42,2025-03-03 22:35:44,Exactly as russia demands.,-0.373305,Russian Federation,60.0,100.0,,,,,,


In [138]:
# Check current structure of both DataFrames
# Read-only inspection: prints column lists and shapes, modifies nothing.
# NOTE(review): gmu_pickle is not created in this cell; it has to be loaded by
# an earlier cell before this one runs — confirm for a fresh-kernel run.
print("=== Current jmu_reddit_geoparsed_clean columns ===")
print(f"Columns ({len(jmu_reddit_geoparsed_clean.columns)}): {list(jmu_reddit_geoparsed_clean.columns)}")
print(f"Shape: {jmu_reddit_geoparsed_clean.shape}")

print("\n=== GMU processed file columns ===")
print(f"Columns ({len(gmu_pickle.columns)}): {list(gmu_pickle.columns)}")
print(f"Shape: {gmu_pickle.shape}")

print("\n=== Target column order ===")
# Layout the JMU frame is about to be brought into. The same list is repeated
# in the restructuring cell, so keep the two in sync when editing.
target_columns = ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 
                  'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 
                  'revised_longitude', 'place_type', 'false_positive', 'checked_by']
print(f"Target columns ({len(target_columns)}): {target_columns}")

=== Current jmu_reddit_geoparsed_clean columns ===
Columns (16): ['school_name', 'type', 'date', 'score', 'year_month', 'sentences', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by', 'unique_id']
Shape: (1786, 16)

=== GMU processed file columns ===
Columns (14): ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
Shape: (424, 14)

=== Target column order ===
Target columns (15): ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']


In [12]:
# Restructure jmu_reddit_geoparsed_clean to match processed file format
# Three steps on a copy: add the sentiment column, drop the columns the
# pipeline also drops, then select the target columns in the target order.
# The last statement rebinds jmu_reddit_geoparsed_clean to the result, so the
# pre-restructure frame is no longer reachable under that name afterwards.
print("🔄 Restructuring jmu_reddit_geoparsed_clean to match processed file format...")

# Create a copy to work with
jmu_restructured = jmu_reddit_geoparsed_clean.copy()

# Step 1: Add missing roberta_compound column (filled with NaN for now, since we don't have sentiment analysis)
if 'roberta_compound' not in jmu_restructured.columns:
    # Assigning an empty float64 Series leaves every row of the new column missing.
    jmu_restructured['roberta_compound'] = pd.Series(dtype='float64')
    print("  ➕ Added roberta_compound column (empty - no sentiment analysis yet)")

# Step 2: Drop unwanted columns that are not in the target format
columns_to_drop = ['type', 'score']  # These are dropped in the processing pipeline
existing_columns_to_drop = [col for col in columns_to_drop if col in jmu_restructured.columns]
if existing_columns_to_drop:
    jmu_restructured = jmu_restructured.drop(columns=existing_columns_to_drop)
    print(f"  🗑️ Dropped columns: {existing_columns_to_drop}")

# Step 3: Reorder columns to match target format
target_columns = ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 
                  'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 
                  'revised_longitude', 'place_type', 'false_positive', 'checked_by']

# Only include columns that exist in the DataFrame
# NOTE: this selection is also a filter. Any column of the frame that is not
# listed in target_columns is dropped here without a message, and a target
# column that is absent from the frame is skipped without a message.
available_columns = [col for col in target_columns if col in jmu_restructured.columns]
jmu_restructured = jmu_restructured[available_columns]

print(f"  📋 Reordered columns to match target format")
print(f"  📊 Final shape: {jmu_restructured.shape}")
print(f"  📋 Final columns: {list(jmu_restructured.columns)}")

# Update the original DataFrame
jmu_reddit_geoparsed_clean = jmu_restructured

print("✅ Restructuring complete! jmu_reddit_geoparsed_clean now matches processed file format.")

🔄 Restructuring jmu_reddit_geoparsed_clean to match processed file format...
  ➕ Added roberta_compound column (empty - no sentiment analysis yet)
  🗑️ Dropped columns: ['type', 'score']
  📋 Reordered columns to match target format
  📊 Final shape: (1786, 15)
  📋 Final columns: ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
✅ Restructuring complete! jmu_reddit_geoparsed_clean now matches processed file format.


In [140]:
# Verify the restructured DataFrame matches the target format
# Compares column names and their order only; dtypes and values are not checked.
# NOTE(review): gmu_pickle is not created in this cell; it has to be loaded by
# an earlier cell before this one runs — confirm for a fresh-kernel run.
print("🔍 VERIFICATION: Comparing restructured DataFrame with processed file format")
print("="*70)

print("✅ jmu_reddit_geoparsed_clean (restructured):")
print(f"   Shape: {jmu_reddit_geoparsed_clean.shape}")
print(f"   Columns: {list(jmu_reddit_geoparsed_clean.columns)}")

print("\n📋 gmu_pickle (processed file reference):")
print(f"   Shape: {gmu_pickle.shape}")  
print(f"   Columns: {list(gmu_pickle.columns)}")

# Check column alignment
print("\n🎯 Column Comparison:")
jmu_cols = list(jmu_reddit_geoparsed_clean.columns)
gmu_cols = list(gmu_pickle.columns)

# The GMU file doesn't have year_month, so let's compare without it
jmu_cols_no_ym = [col for col in jmu_cols if col != 'year_month']

# List equality: passes only when both the names and their order are identical.
if jmu_cols_no_ym == gmu_cols:
    print("✅ Column order matches perfectly (excluding year_month)!")
    print(f"   JMU columns: {jmu_cols}")
    print(f"   GMU columns: {gmu_cols}")
    print(f"   Extra in JMU: year_month (as requested)")
else:
    print("❌ Column order differs:")
    print(f"   JMU: {jmu_cols_no_ym}")
    print(f"   GMU: {gmu_cols}")

# Show first few rows to verify structure
print(f"\n📊 Sample of restructured jmu_reddit_geoparsed_clean (first 3 rows):")
jmu_reddit_geoparsed_clean.head(3)

🔍 VERIFICATION: Comparing restructured DataFrame with processed file format
✅ jmu_reddit_geoparsed_clean (restructured):
   Shape: (1786, 15)
   Columns: ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']

📋 gmu_pickle (processed file reference):
   Shape: (424, 14)
   Columns: ['school_name', 'unique_id', 'date', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']

🎯 Column Comparison:
✅ Column order matches perfectly (excluding year_month)!
   JMU columns: ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']
   GMU columns: ['school_

Unnamed: 0,school_name,unique_id,date,year_month,sentences,roberta_compound,place,latitude,longitude,revised_place,revised_latitude,revised_longitude,place_type,false_positive,checked_by
0,JMU,JMU1,2014-07-18 19:50:58,2014-07-01,The only downside is that because it is locate...,,Kaarela,60.25174,24.88111,33,38.414485,-78.834003,Road,False,
1,JMU,JMU2,2013-09-13 12:46:11,2013-09-01,I think I remember actually seeing his house o...,,Alytaus rajonas,54.3996,24.1252,33 near Blue Hole,38.513028,-79.049727,Road,False,"Bailey,Kendall"
2,JMU,JMU3,2020-02-23 12:34:56,2020-02-01,"Before I exit on 66, she is right behind me ag...",,66,53.55325,9.96323,81 in PA,53.55325,9.96323,Unknown,False,


In [13]:
# 🎉 SUMMARY: jmu_reddit_geoparsed_clean DataFrame Layout Standardization
# Prints a human-readable recap of the standardized layout. Anything that can be
# read off the DataFrame itself (column count, school, sentiment status) is
# computed instead of hard-coded, so the recap stays truthful if this cell is
# re-run after the frame has changed.
print("🎯 COMPLETED: DataFrame Layout Standardization")
print("="*60)
print("✅ The jmu_reddit_geoparsed_clean DataFrame now has the SAME layout as processed CSV files:")
print()

layout_columns = list(jmu_reddit_geoparsed_clean.columns)
print(f"📋 Standard Column Layout ({len(layout_columns)} columns):")
for i, col in enumerate(layout_columns, 1):
    print(f"   {i:2d}. {col}")

n_rows, n_cols = jmu_reddit_geoparsed_clean.shape
# iloc[0] raises IndexError on an empty frame; fall back to a placeholder.
school_label = jmu_reddit_geoparsed_clean['school_name'].iloc[0] if n_rows else "n/a"
# Number of rows that already carry a sentiment score (0 right after restructuring).
n_scored = int(jmu_reddit_geoparsed_clean['roberta_compound'].notna().sum())

print("\n📊 Data Summary:")
print(f"   • Shape: {n_rows:,} rows × {n_cols} columns")
print(f"   • School: {school_label}")
print(f"   • Date range: {jmu_reddit_geoparsed_clean['date'].min()} to {jmu_reddit_geoparsed_clean['date'].max()}")
if n_scored == 0:
    print("   • Sentiment analysis: Not yet populated (roberta_compound = NaN)")
else:
    print(f"   • Sentiment analysis: {n_scored:,} of {n_rows:,} rows have a roberta_compound score")

print("\n🔄 Changes Made:")
print("   ✅ Added: roberta_compound column (empty, ready for sentiment analysis)")
print("   ✅ Removed: type, score columns (matching processed file format)")
print("   ✅ Reordered: All columns now match the standard processed file layout")
print("   ✅ Preserved: year_month column (as requested)")

print("\n💡 Next Steps:")
print("   • The DataFrame is ready for analysis and comparison with other processed files")
print("   • If sentiment analysis is needed, run the modular pipeline on this data")
print("   • The layout now matches: school_name → unique_id → date → year_month → sentences → ... → checked_by")

🎯 COMPLETED: DataFrame Layout Standardization
✅ The jmu_reddit_geoparsed_clean DataFrame now has the SAME layout as processed CSV files:

📋 Standard Column Layout (15 columns):
    1. school_name
    2. unique_id
    3. date
    4. year_month
    5. sentences
    6. roberta_compound
    7. place
    8. latitude
    9. longitude
   10. revised_place
   11. revised_latitude
   12. revised_longitude
   13. place_type
   14. false_positive
   15. checked_by

📊 Data Summary:
   • Shape: 1,786 rows × 15 columns
   • School: JMU
   • Date range: 2011-09-08 14:22:55 to 2025-08-31 13:19:41
   • Sentiment analysis: Not yet populated (roberta_compound = NaN)

🔄 Changes Made:
   ✅ Added: roberta_compound column (empty, ready for sentiment analysis)
   ✅ Removed: type, score columns (matching processed file format)
   ✅ Reordered: All columns now match the standard processed file layout
   ✅ Preserved: year_month column (as requested)

💡 Next Steps:
   • The DataFrame is ready for analysis and comp

In [14]:
# 🤖 Run RoBERTa Sentiment Analysis for jmu_reddit_geoparsed_clean DataFrame
# Reports the pre-scoring state of the frame, then loads the tokenizer/model pair
# that the scoring cell further down consumes.
print("🚀 Starting RoBERTa Sentiment Analysis for JMU DataFrame")
print("=" * 60)

# Pre-scoring snapshot: frame size, usable sentences, and still-empty scores.
print(f"📊 Current DataFrame shape: {jmu_reddit_geoparsed_clean.shape}")
sentence_count = jmu_reddit_geoparsed_clean['sentences'].notna().sum()
print(f"📝 Sentences to analyze: {sentence_count}")
unscored_count = jmu_reddit_geoparsed_clean['roberta_compound'].isna().sum()
print(f"🔍 Current roberta_compound status: {unscored_count} NaN values")

# initialize_roberta_sentiment is defined elsewhere in this notebook (Module 4).
# NOTE(review): presumably it signals failure by returning None for either
# element, which is what the check below tests for — confirm against its definition.
print("\n🤖 Initializing RoBERTa sentiment model...")
tokenizer, model = initialize_roberta_sentiment()

if tokenizer is not None and model is not None:
    print("✅ RoBERTa model initialized successfully!")
else:
    print("❌ Failed to initialize RoBERTa model")

🚀 Starting RoBERTa Sentiment Analysis for JMU DataFrame
📊 Current DataFrame shape: (1786, 15)
📝 Sentences to analyze: 1786
🔍 Current roberta_compound status: 1786 NaN values

🤖 Initializing RoBERTa sentiment model...
  🤖 Initializing RoBERTa sentiment model...
  ✅ RoBERTa model initialized successfully!
✅ RoBERTa model initialized successfully!


In [15]:
# 🎯 Run sentiment analysis on all sentences in batches
# Each sentence is scored individually; the batch size only controls how many
# rows are handled per progress-bar step.
print("🔄 Processing sentiment analysis in batches...")

# Configuration
batch_size = 50  # rows handled per progress-bar tick
total_sentences = len(jmu_reddit_geoparsed_clean)

# Score a copy so the source frame is untouched until results are written back.
df_with_sentiment = jmu_reddit_geoparsed_clean.copy()

# One result dict per row, in row order.
roberta_results = []

print(f"📦 Processing {total_sentences} sentences in batches of {batch_size}")

for start in tqdm(range(0, total_sentences, batch_size), desc="Processing sentiment batches"):
    stop = min(start + batch_size, total_sentences)
    # Build the whole batch first, then append it in one go; missing or
    # whitespace-only sentences get an all-None placeholder instead of a model call.
    roberta_results.extend([
        roberta_sentiment_analysis(text, tokenizer, model)
        if pd.notna(text) and text.strip()
        else {'roberta_neg': None, 'roberta_neu': None, 'roberta_pos': None, 'roberta_compound': None}
        for text in df_with_sentiment['sentences'].iloc[start:stop]
    ])

print(f"✅ Sentiment analysis complete! Processed {len(roberta_results)} sentences")

🔄 Processing sentiment analysis in batches...
📦 Processing 1786 sentences in batches of 50


Processing sentiment batches: 100%|██████████| 36/36 [01:32<00:00,  2.56s/it]

✅ Sentiment analysis complete! Processed 1786 sentences





In [18]:
# 📊 Update DataFrame with RoBERTa sentiment scores
print("📊 Updating DataFrame with sentiment scores...")

# Convert results to DataFrame for easier handling (one row per result dict,
# on a fresh 0..n-1 index).
sentiment_df = pd.DataFrame(roberta_results)

# roberta_results is built one entry per row of df_with_sentiment, in row order.
# A length mismatch means the results are stale (e.g. from an earlier run on a
# different frame), so fail loudly rather than write misaligned scores.
if len(sentiment_df) != len(df_with_sentiment):
    raise ValueError(
        f"Sentiment results ({len(sentiment_df)}) do not match DataFrame rows "
        f"({len(df_with_sentiment)}); re-run the sentiment analysis cell."
    )

# Update only the roberta_compound column in the original DataFrame
# (following the same pattern as the processed files which only keep roberta_compound).
# Assign by position: a Series on the right-hand side is aligned on index labels,
# which would silently misplace or drop scores if df_with_sentiment does not have
# a default 0..n-1 index.
df_with_sentiment['roberta_compound'] = sentiment_df['roberta_compound'].to_numpy()

# Update the main DataFrame
jmu_reddit_geoparsed_clean = df_with_sentiment

# Verify the update
compound_scores = jmu_reddit_geoparsed_clean['roberta_compound']
n_scored = compound_scores.notna().sum()
n_missing = len(compound_scores) - n_scored
print("✅ Sentiment scores updated!")
print(f"📊 DataFrame shape: {jmu_reddit_geoparsed_clean.shape}")
print(f"🔍 Non-null roberta_compound values: {n_scored}")
print(f"📈 Roberta compound score range: {compound_scores.min():.4f} to {compound_scores.max():.4f}")

# Show some sample sentiment scores
print("\n📋 Sample sentiment scores:")
sample_data = jmu_reddit_geoparsed_clean[['sentences', 'roberta_compound']].head()
for idx, row in sample_data.iterrows():
    # Normalise to str once so the length check and the slice agree.
    text = str(row['sentences'])
    sentence = text[:50] + "..." if len(text) > 50 else text
    score = row['roberta_compound']
    print(f"   {score:+.4f}: {sentence}")

# Only claim full coverage when every row actually received a score.
if n_missing == 0:
    print(f"\n🎯 SUCCESS: All {n_scored} sentences now have RoBERTa sentiment scores!")
else:
    print(f"\n⚠️ {n_scored} of {len(compound_scores)} sentences have RoBERTa sentiment scores; {n_missing} are missing (blank or unscored sentences).")

📊 Updating DataFrame with sentiment scores...
✅ Sentiment scores updated!
📊 DataFrame shape: (1786, 15)
🔍 Non-null roberta_compound values: 1786
📈 Roberta compound score range: -0.9648 to 0.9831

📋 Sample sentiment scores:
   -0.4039: The only downside is that because it is located di...
   +0.0318: I think I remember actually seeing his house off o...
   +0.0081: Before I exit on 66, she is right behind me again.
   +0.9434: Yeah it just works somehow A&A kebab near EMU is a...
   +0.0184: Spotted at the Albert Icehouse and Dance Hall in A...

🎯 SUCCESS: All 1786 sentences now have RoBERTa sentiment scores!


In [19]:
# 🎉 FINAL VERIFICATION: Complete DataFrame with Sentiment Analysis
# Read-only report: prints the final structure, summary statistics of the
# roberta_compound column, and a three-row preview. Nothing is modified.
print("🎯 SENTIMENT ANALYSIS COMPLETE!")
print("=" * 60)

# Final structure
n_rows, n_cols = jmu_reddit_geoparsed_clean.shape
print("📋 Final DataFrame Structure:")
print(f"   Shape: {n_rows:,} rows × {n_cols} columns")
print(f"   Columns: {list(jmu_reddit_geoparsed_clean.columns)}")

# Sentiment summary: bind the score column and sign masks once, then report.
compound_scores = jmu_reddit_geoparsed_clean['roberta_compound']
is_positive = compound_scores > 0
is_negative = compound_scores < 0
is_neutral = compound_scores == 0

print("\n🤖 RoBERTa Sentiment Analysis Summary:")
print(f"   Total sentences analyzed: {compound_scores.notna().sum():,}")
print(f"   Compound score range: {compound_scores.min():.4f} to {compound_scores.max():.4f}")
print(f"   Mean sentiment: {compound_scores.mean():.4f}")
# Shares are fractions of all rows (mean of the boolean mask).
print(f"   Positive sentences (>0): {is_positive.sum():,} ({is_positive.mean():.1%})")
print(f"   Negative sentences (<0): {is_negative.sum():,} ({is_negative.mean():.1%})")
print(f"   Neutral sentences (=0): {is_neutral.sum():,} ({is_neutral.mean():.1%})")

# Preview of the finished frame
print("\n📊 Sample of Complete DataFrame (first 3 rows):")
display(jmu_reddit_geoparsed_clean.head(3))

print("\n✅ SUCCESS: jmu_reddit_geoparsed_clean DataFrame is now complete with:")
print("   • Standardized layout matching processed CSV files")
print("   • Full RoBERTa sentiment analysis (roberta_compound scores)")
print("   • Ready for analysis and comparison with other school datasets")
print(f"   • {n_rows:,} sentences from JMU Reddit data")

🎯 SENTIMENT ANALYSIS COMPLETE!
📋 Final DataFrame Structure:
   Shape: 1,786 rows × 15 columns
   Columns: ['school_name', 'unique_id', 'date', 'year_month', 'sentences', 'roberta_compound', 'place', 'latitude', 'longitude', 'revised_place', 'revised_latitude', 'revised_longitude', 'place_type', 'false_positive', 'checked_by']

🤖 RoBERTa Sentiment Analysis Summary:
   Total sentences analyzed: 1,786
   Compound score range: -0.9648 to 0.9831
   Mean sentiment: -0.0260
   Positive sentences (>0): 821 (46.0%)
   Negative sentences (<0): 965 (54.0%)
   Neutral sentences (=0): 0 (0.0%)

📊 Sample of Complete DataFrame (first 3 rows):


Unnamed: 0,school_name,unique_id,date,year_month,sentences,roberta_compound,place,latitude,longitude,revised_place,revised_latitude,revised_longitude,place_type,false_positive,checked_by
0,JMU,JMU1,2014-07-18 19:50:58,2014-07-01,The only downside is that because it is locate...,-0.403942,Kaarela,60.25174,24.88111,33,38.414485,-78.834003,Road,False,
1,JMU,JMU2,2013-09-13 12:46:11,2013-09-01,I think I remember actually seeing his house o...,0.0318,Alytaus rajonas,54.3996,24.1252,33 near Blue Hole,38.513028,-79.049727,Road,False,"Bailey,Kendall"
2,JMU,JMU3,2020-02-23 12:34:56,2020-02-01,"Before I exit on 66, she is right behind me ag...",0.008133,66,53.55325,9.96323,81 in PA,53.55325,9.96323,Unknown,False,



✅ SUCCESS: jmu_reddit_geoparsed_clean DataFrame is now complete with:
   • Standardized layout matching processed CSV files
   • Full RoBERTa sentiment analysis (roberta_compound scores)
   • Ready for analysis and comparison with other school datasets
   • 1,786 sentences from JMU Reddit data


In [20]:
# Persist the scored, standardized frame as a pickle (path is relative to the
# notebook's working directory).
jmu_reddit_geoparsed_clean.to_pickle("assets/data/jmu_reddit_geoparsed_clean.pickle")