In [3]:
# Code 1 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import re
from datetime import datetime
import pytz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import IsolationForest, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import defaultdict
import gc
import warnings
from scipy.stats import entropy
warnings.filterwarnings('ignore')

# Memory optimization - use efficient data types
dtypes = {
    'reply_count': 'int16',
    'repost_count': 'int16',
    'like_count': 'int16',
    'happy': 'int8',
    'sad': 'int8',
    'neutral': 'int8'
}

def process_in_chunks(file_path, chunk_size=100000):
    """Process large datasets in chunks to avoid memory issues"""
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, dtype=dtypes):
        chunks.append(chunk)
    return pd.concat(chunks)

def preprocess_data(df):
    """Preprocess the data and handle missing values"""
    print("Preprocessing data...")
    
    # First, ensure we filter down to only users with 2+ posts
    # Count posts per user
    post_counts = df['author_handle'].value_counts()
    users_with_multiple_posts = post_counts[post_counts >= 2].index
    
    # Filter the dataframe to include only these users
    df = df[df['author_handle'].isin(users_with_multiple_posts)].reset_index(drop=True)
    print(f"Filtered to {len(users_with_multiple_posts)} users with 2 or more posts")
    
    # Fill missing values
    for col in ['reply_count', 'repost_count', 'like_count', 'happy', 'sad']:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    
    # Add neutral sentiment column if not present
    if 'neutral' not in df.columns:
        df['neutral'] = ((df['happy'] == 0) & (df['sad'] == 0)).astype(int)
    
    # Convert timestamp to datetime with improved error handling
    if 'created_at' in df.columns:
        print(f"created_at column type before conversion: {df['created_at'].dtype}")
        print(f"Sample values before conversion: {df['created_at'].head(2).values}")
        
        # Create a copy of original timestamps before conversion attempts
        df['created_at_original'] = df['created_at']
        
        # Try with a more flexible parser first with different formats
        try:
            # Try with coerce to handle errors gracefully
            df['created_at_dt'] = pd.to_datetime(df['created_at'], errors='coerce')
            
            # Count failures after first attempt
            mask_failed = df['created_at_dt'].isna()
            print(f"First attempt conversion failures: {mask_failed.sum()}")
            
            # For failures, try explicit formats sequentially
            formats_to_try = [
                '%Y-%m-%d %H:%M:%S%z',                # 2023-04-13 10:06:17+00:00
                '%Y-%m-%d %H:%M:%S.%f%z',             # 2023-04-14 07:31:51.104000+00:00
                '%Y-%m-%dT%H:%M:%S%z',                # ISO format with timezone
                '%Y-%m-%dT%H:%M:%S.%f%z',             # ISO format with fractional seconds
                '%a %b %d %H:%M:%S %z %Y',            # Twitter-like format
                '%Y-%m-%d %H:%M:%S',                  # Without timezone
                '%Y-%m-%d'                            # Just date
            ]
            
            # Try each format for the failed rows
            for date_format in formats_to_try:
                if mask_failed.sum() > 0:
                    try:
                        # Apply format only to failed rows
                        df.loc[mask_failed, 'created_at_dt'] = pd.to_datetime(
                            df.loc[mask_failed, 'created_at'], 
                            format=date_format,
                            errors='coerce'
                        )
                        # Update mask of failed rows
                        mask_failed = df['created_at_dt'].isna()
                        print(f"After trying format {date_format}: {mask_failed.sum()} failures remain")
                    except Exception as e:
                        print(f"Error with format {date_format}: {e}")
            
            # For any remaining failures, try one more approach - strip timezone info if present
            if mask_failed.sum() > 0:
                try:
                    # Try to extract just the datetime part before the timezone for the remaining failed values
                    failed_values = df.loc[mask_failed, 'created_at'].copy()
                    # Extract datetime part before any + or - (timezone indicators)
                    cleaned_values = failed_values.str.extract(r'(.*?)(?=[+-]|$)', expand=False).str.strip()
                    df.loc[mask_failed, 'created_at_dt'] = pd.to_datetime(cleaned_values, errors='coerce')
                    
                    # Update mask again
                    mask_failed = df['created_at_dt'].isna()
                except Exception as e:
                    print(f"Error in timezone stripping approach: {e}")
            
        except Exception as e:
            print(f"Error in datetime conversion: {e}")
        
        # For any remaining failures, assign a default datetime to avoid missing values
        if 'created_at_dt' in df.columns:
            mask_still_failed = df['created_at_dt'].isna()
            if mask_still_failed.any():
                print(f"Assigning default datetime to {mask_still_failed.sum()} records that failed conversion")
                # Use median date as default for failed conversions
                valid_dates = df.loc[~df['created_at_dt'].isna(), 'created_at_dt']
                if len(valid_dates) > 0:
                    default_date = valid_dates.median()
                else:
                    default_date = pd.Timestamp('2023-01-01')
                
                df.loc[mask_still_failed, 'created_at_dt'] = default_date
        
        # Keep track of conversion success rate
        final_failed = df['created_at_dt'].isna().sum()
        print(f"Final datetime conversion failures: {final_failed} ({final_failed/len(df)*100:.2f}%)")
        
        # Copy the datetime column to the original column name too
        df['created_at'] = df['created_at_dt']
    
    # Clean up memory
    gc.collect()
    
    return df

def engineer_features(df):
    """Extract and engineer features from the data, include behavioral features"""
    print("Engineering features...")
    
    # Ensure we have a working datetime column
    if 'created_at_dt' not in df.columns and 'created_at' in df.columns:
        df['created_at_dt'] = df['created_at']
    
    # Temporal features - use created_at_dt preferentially
    datetime_col = 'created_at_dt' if 'created_at_dt' in df.columns else 'created_at'
    
    if datetime_col in df.columns:
        # Ensure it's datetime type
        if not pd.api.types.is_datetime64_any_dtype(df[datetime_col]):
            print(f"Converting {datetime_col} to datetime...")
            df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
        
        # Only proceed with datetime operations for non-NaN values
        mask = ~df[datetime_col].isna()
        
        if mask.any():  # Only calculate if there are valid datetime values
            # Extract temporal features only for valid dates
            df.loc[mask, 'hour_of_day'] = df.loc[mask, datetime_col].dt.hour
            df.loc[mask, 'day_of_week'] = df.loc[mask, datetime_col].dt.dayofweek
            df.loc[mask, 'month'] = df.loc[mask, datetime_col].dt.month
            df.loc[mask, 'day'] = df.loc[mask, datetime_col].dt.day
            df.loc[mask, 'year'] = df.loc[mask, datetime_col].dt.year
            
            # NEW: Extract minute of hour for more detailed temporal patterns
            df.loc[mask, 'minute_of_hour'] = df.loc[mask, datetime_col].dt.minute
            
            # NEW: Is post during typical working hours (9am-5pm local time)
            df.loc[mask, 'is_working_hours'] = ((df.loc[mask, 'hour_of_day'] >= 9) & 
                                               (df.loc[mask, 'hour_of_day'] < 17)).astype(int)
            
            # NEW: Is post during typical sleeping hours (11pm-5am local time)
            df.loc[mask, 'is_sleeping_hours'] = ((df.loc[mask, 'hour_of_day'] >= 23) | 
                                                (df.loc[mask, 'hour_of_day'] < 5)).astype(int)
            
            # NEW: Is weekend post
            df.loc[mask, 'is_weekend'] = (df.loc[mask, 'day_of_week'] >= 5).astype(int)
        else:
            # Create default columns if no valid dates
            for col in ['hour_of_day', 'day_of_week', 'month', 'day', 'year', 
                       'minute_of_hour', 'is_working_hours', 'is_sleeping_hours', 'is_weekend']:
                df[col] = 0
    
    # Group by author and calculate posting frequency
    if 'author_handle' in df.columns:
        author_post_counts = df.groupby('author_handle').size().reset_index(name='post_count')
        df = df.merge(author_post_counts, on='author_handle', how='left')
    
    # Calculate time differences between posts for each author
    if 'author_handle' in df.columns and datetime_col in df.columns:
        # Only sort if we have valid datetime
        if pd.api.types.is_datetime64_any_dtype(df[datetime_col]):
            # Create a copy to avoid modifying during iteration
            df_sorted = df.sort_values(['author_handle', datetime_col])
            
            # Create prev_post_time
            df_sorted['prev_post_time'] = df_sorted.groupby('author_handle')[datetime_col].shift(1)
            
            # Calculate time difference safely
            valid_times = ~df_sorted[datetime_col].isna() & ~df_sorted['prev_post_time'].isna()
            
            # Initialize the column with zeros
            df_sorted['time_since_last_post'] = 0
            
            # Only calculate for valid rows
            if valid_times.any():
                df_sorted.loc[valid_times, 'time_since_last_post'] = (
                    (df_sorted.loc[valid_times, datetime_col] - 
                     df_sorted.loc[valid_times, 'prev_post_time']).dt.total_seconds() / 60
                )
            
            # Copy these columns back to the original dataframe
            # Match by index to avoid losing data
            df['prev_post_time'] = df_sorted['prev_post_time']
            df['time_since_last_post'] = df_sorted['time_since_last_post']
            
            # NEW: Calculate posting rhythm features
            # Group by author
            author_groups = df.groupby('author_handle')
            
            # Initialize new features at author level
            time_diff_stats = author_groups['time_since_last_post'].agg(['mean', 'std', 'median']).fillna(0)
            time_diff_stats.columns = ['avg_time_between_posts', 'std_time_between_posts', 'median_time_between_posts']
            
            # Add coefficient of variation (std/mean) - more regular posting has lower values
            time_diff_stats['cv_time_between_posts'] = np.divide(
                time_diff_stats['std_time_between_posts'],
                time_diff_stats['avg_time_between_posts'],
                out=np.zeros_like(time_diff_stats['std_time_between_posts']),
                where=time_diff_stats['avg_time_between_posts'] != 0
            )
            
            # Merge back to original dataframe
            df = df.merge(time_diff_stats, on='author_handle', how='left')
            
            # NEW: Calculate timing entropy for each author (more regular = more bot-like)
            def calculate_time_entropy(times):
                if len(times) <= 1:
                    return 0
                
                # Convert to hours and bin to get distribution
                hours = times.dt.hour
                bins = np.bincount(hours, minlength=24)
                probs = bins / len(hours)
                # Filter out zeros
                probs = probs[probs > 0]
                return entropy(probs)
            
            # Apply to each author group
            timing_entropy = author_groups[datetime_col].apply(calculate_time_entropy).fillna(0)
            timing_entropy = timing_entropy.reset_index()
            timing_entropy.columns = ['author_handle', 'posting_time_entropy']
            
            # Merge back
            df = df.merge(timing_entropy, on='author_handle', how='left')
            
        else:
            # If datetime is not proper, create dummy columns
            df['prev_post_time'] = pd.NaT
            df['time_since_last_post'] = 0
            df['avg_time_between_posts'] = 0
            df['std_time_between_posts'] = 0
            df['median_time_between_posts'] = 0
            df['cv_time_between_posts'] = 0
            df['posting_time_entropy'] = 0


    # Content features
    if 'post_text' in df.columns:
        df['text_length'] = df['post_text'].apply(lambda x: len(str(x)))
        df['word_count'] = df['post_text'].apply(lambda x: len(str(x).split()))
        
        # Calculate entropy of text (measure of randomness)
        df['text_entropy'] = df['post_text'].apply(calculate_entropy)
        
        # NEW: Calculate lexical diversity (unique words / total words)
        def lexical_diversity(text):
            if not isinstance(text, str) or len(text) == 0:
                return 0
            words = text.split()
            if len(words) == 0:
                return 0
            return len(set(words)) / len(words)
        
        df['lexical_diversity'] = df['post_text'].apply(lexical_diversity)
        
        # NEW: Hashtag and mention counts
        df['hashtag_count'] = df['post_text'].apply(lambda x: str(x).count('#') if isinstance(x, str) else 0)
        df['mention_count'] = df['post_text'].apply(lambda x: str(x).count('@') if isinstance(x, str) else 0)
        
        # NEW: URL count
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        df['url_count'] = df['post_text'].apply(lambda x: len(re.findall(url_pattern, str(x))) if isinstance(x, str) else 0)
    
    # Engagement features
    if all(col in df.columns for col in ['reply_count', 'repost_count', 'like_count']):
        df['total_engagement'] = df['reply_count'] + df['repost_count'] + df['like_count']
        
        # NEW: Calculate engagement ratios
        df['likes_to_reposts_ratio'] = np.divide(
            df['like_count'], 
            df['repost_count'], 
            out=np.zeros_like(df['like_count'], dtype=float),
            where=df['repost_count'] != 0
        )
        
        df['engagement_ratio'] = np.divide(
            df['total_engagement'],
            df['text_length'] + 1,
            out=np.zeros_like(df['total_engagement'], dtype=float),
            where=(df['text_length'] + 1) != 0
        )
        
        # NEW: Calculate engagement consistency for each author
        engagement_stats = df.groupby('author_handle')['total_engagement'].agg(['mean', 'std']).fillna(0)
        engagement_stats.columns = ['avg_engagement', 'std_engagement']
        
        # Calculate coefficient of variation for engagement
        engagement_stats['cv_engagement'] = np.divide(
            engagement_stats['std_engagement'],
            engagement_stats['avg_engagement'],
            out=np.zeros_like(engagement_stats['std_engagement']),
            where=engagement_stats['avg_engagement'] != 0
        )
        
        # Merge back
        df = df.merge(engagement_stats, on='author_handle', how='left')
    
    # Sentiment features
    if all(col in df.columns for col in ['happy', 'sad', 'neutral']):
        df['sentiment_ratio'] = np.divide(
            (df['happy'] - df['sad']),
            (df['happy'] + df['sad'] + 1),
            out=np.zeros_like(df['happy'], dtype=float),
            where=(df['happy'] + df['sad'] + 1) != 0
        )
        
        # NEW: Calculate sentiment consistency for each author
        sentiment_stats = df.groupby('author_handle')['sentiment_ratio'].agg(['mean', 'std']).fillna(0)
        sentiment_stats.columns = ['avg_sentiment', 'std_sentiment']
        
        # Merge back
        df = df.merge(sentiment_stats, on='author_handle', how='left')
    
    # Content similarity and near-duplicate detection
    if 'author_handle' in df.columns and 'post_text' in df.columns:
        print("Calculating content similarity...")
        # Process in smaller groups to save memory
        author_groups = df.groupby('author_handle')
        similarity_results = []
        duplicate_results = []
        
        for author, group in author_groups:
            if len(group) > 1:
                sim_series, dup_series = calculate_similarity_features(group)
                similarity_results.append(pd.DataFrame({'author_handle': author, 'index': group.index, 'content_similarity': sim_series}))
                duplicate_results.append(pd.DataFrame({'author_handle': author, 'index': group.index, 'has_near_duplicate': dup_series}))
        
        if similarity_results:
            sim_df = pd.concat(similarity_results)
            dup_df = pd.concat(duplicate_results)
            
            df = df.merge(sim_df[['index', 'content_similarity']], left_index=True, right_on='index', how='left')
            df = df.merge(dup_df[['index', 'has_near_duplicate']], left_index=True, right_on='index', how='left')
            
            df.drop('index_x', axis=1, errors='ignore', inplace=True)
            df.drop('index_y', axis=1, errors='ignore', inplace=True)
            df.drop('index', axis=1, errors='ignore', inplace=True)
        
        df['content_similarity'] = df['content_similarity'].fillna(0)
        df['has_near_duplicate'] = df['has_near_duplicate'].fillna(0)
        
        # NEW: Calculate self-similarity ratio for each author
        self_sim_stats = df.groupby('author_handle')['content_similarity'].agg(['mean']).fillna(0)
        self_sim_stats.columns = ['avg_content_similarity']
        
        # Merge back
        df = df.merge(self_sim_stats, on='author_handle', how='left')
    
    # Clean up memory
    gc.collect()
    
    return df

def calculate_entropy(text):
    """Calculate the entropy of text as a measure of randomness"""
    if not isinstance(text, str) or len(text) == 0:
        return 0
    prob = [float(text.count(c)) / len(text) for c in set(text)]
    entropy = -sum([p * np.log2(p) for p in prob])
    return entropy

def calculate_similarity_features(author_group, threshold=0.9):
    """Calculate content similarity and detect near-duplicates for posts by the same author"""
    # Use cleaned_text specifically for similarity calculation
    texts = author_group['cleaned_text'].tolist()
    indices = author_group.index
    
    if len(texts) <= 1:
        return pd.Series([0] * len(author_group), index=indices), pd.Series([0] * len(author_group), index=indices)
    
    try:
        # Clean and validate texts
        valid_texts = []
        for text in texts:
            if isinstance(text, str) and len(str(text).strip()) > 0:
                valid_texts.append(str(text))
            else:
                valid_texts.append("empty_text")  # Placeholder for invalid texts
        
        # Use TF-IDF vectorization for text comparison
        tfidf = TfidfVectorizer(min_df=1, stop_words='english').fit_transform(valid_texts)
        similarity_matrix = cosine_similarity(tfidf)
        
        # Calculate average similarity with other posts by same author
        avg_similarities = [
            np.mean([similarity_matrix[i][j] for j in range(len(valid_texts)) if i != j]) 
            if len(valid_texts) > 1 else 0 for i in range(len(valid_texts))
        ]
        
        # Detect near-duplicates (posts with high similarity to others)
        has_near_duplicate = [(similarity_matrix[i] > threshold).sum() > 1 for i in range(len(valid_texts))]
        
        return pd.Series(avg_similarities, index=indices), pd.Series(has_near_duplicate, index=indices).astype(int)
    except Exception as e:
        print(f"Error in similarity calculation: {e}")
        return pd.Series([0] * len(author_group), index=indices), pd.Series([0] * len(author_group), index=indices)



def analyze_behavioral_patterns(df):
    """Analyze behavioral patterns to identify bot-like activities"""
    print("Analyzing behavioral patterns...")
    
    # NEW: Create user posting behavior profiles
    if 'author_handle' in df.columns and 'created_at_dt' in df.columns:
        # Time-based behavior patterns
        
        # 1. Benford's Law Analysis for time intervals
        # Per the recent research, legitimate users often follow Benford's Law in their activity patterns
        # while bots often deviate from it
        def calculate_benford_deviation(intervals):
            if len(intervals) < 10:  # Need enough data points
                return 0
            
            # Get first digits
            first_digits = [int(str(int(abs(i))).lstrip('0')[0]) if i != 0 else 0 for i in intervals]
            
            # Remove zeros
            first_digits = [d for d in first_digits if d != 0]
            
            if len(first_digits) < 5:  # Need enough non-zero values
                return 0
                
            # Count occurrences of each first digit (1-9)
            observed_counts = np.zeros(9)
            for digit in first_digits:
                if 1 <= digit <= 9:  # Valid first digits
                    observed_counts[digit-1] += 1
            
            # Calculate expected distribution according to Benford's Law
            total_digits = sum(observed_counts)
            if total_digits == 0:
                return 0
                
            # Expected frequencies according to Benford's Law
            benford_dist = np.array([np.log10(1 + 1/d) for d in range(1, 10)])
            expected_counts = benford_dist * total_digits
            
            # Calculate chi-square deviation
            chi_square = np.sum(np.divide(
                np.square(observed_counts - expected_counts),
                expected_counts,
                out=np.zeros_like(observed_counts),
                where=expected_counts != 0
            ))
            
            return chi_square
        
        # Group by author and calculate Benford deviation
        author_groups = df.groupby('author_handle')
        
        benford_results = []
        for author, group in author_groups:
            if len(group) >= 10:  # Need enough posts
                # Sort by time
                group_sorted = group.sort_values('created_at_dt')
                
                # Calculate intervals in seconds
                timestamps = group_sorted['created_at_dt'].astype(np.int64) // 10**9  # Convert to seconds
                intervals = np.diff(timestamps)
                
                # Calculate deviation from Benford's Law
                benford_dev = calculate_benford_deviation(intervals)
                
                benford_results.append({
                    'author_handle': author,
                    'benford_deviation': benford_dev
                })
        
        if benford_results:
            benford_df = pd.DataFrame(benford_results)
            df = df.merge(benford_df, on='author_handle', how='left')
            df['benford_deviation'] = df['benford_deviation'].fillna(0)
        else:
            df['benford_deviation'] = 0
        
        # 2. NEW: Periodicity detection for each author
        # Bots often show highly periodic behavior
        def detect_periodicity(timestamps):
            if len(timestamps) < 10:
                return 0
            
            # Calculate intervals
            intervals = np.diff(timestamps.astype(np.int64) // 10**9)  # Convert to seconds
            
            if len(intervals) < 3:
                return 0
                
            # Calculate standard deviation of intervals
            std_dev = np.std(intervals)
            mean_interval = np.mean(intervals)
            
            if mean_interval == 0:
                return 0
                
            # Coefficient of variation (lower means more regular/periodic)
            cv = std_dev / mean_interval
            
            # Regularization factor (0 means perfectly periodic, 1 means random)
            periodicity_factor = 1 - min(1, cv / 2)  # Normalize to 0-1 range
            
            return periodicity_factor
        
        # Calculate periodicity for each author
        periodicity_results = []
        for author, group in author_groups:
            if len(group) >= 10:
                # Sort by time
                group_sorted = group.sort_values('created_at_dt')
                
                # Calculate periodicity
                periodicity = detect_periodicity(group_sorted['created_at_dt'])
                
                periodicity_results.append({
                    'author_handle': author,
                    'posting_periodicity': periodicity
                })
        
        if periodicity_results:
            periodicity_df = pd.DataFrame(periodicity_results)
            df = df.merge(periodicity_df, on='author_handle', how='left')
            df['posting_periodicity'] = df['posting_periodicity'].fillna(0)
        else:
            df['posting_periodicity'] = 0
    
    # If data includes posts from multiple platforms, we can detect synchronized behavior
    if 'platform' in df.columns and 'author_handle' in df.columns and 'created_at_dt' in df.columns:
        print("Analyzing cross-platform synchronization...")
        
        def calculate_cross_platform_sync(group):
            platforms = group['platform'].unique()
            if len(platforms) <= 1:
                return 0  # Only one platform, no cross-platform sync
                
            # Calculate time differences between platforms
            sync_scores = []
            for i, p1 in enumerate(platforms[:-1]):
                for p2 in platforms[i+1:]:
                    # Get posts from each platform
                    p1_posts = group[group['platform'] == p1]['created_at_dt']
                    p2_posts = group[group['platform'] == p2]['created_at_dt']
                    
                    if len(p1_posts) < 2 or len(p2_posts) < 2:
                        continue
                        
                    # Convert to arrays for faster processing
                    p1_times = p1_posts.astype(np.int64).values // 10**9  # seconds
                    p2_times = p2_posts.astype(np.int64).values // 10**9
                    
                    # Find minimum time differences between posts across platforms
                    min_diffs = []
                    for t1 in p1_times:
                        min_diff = min(abs(t1 - t2) for t2 in p2_times)
                        min_diffs.append(min_diff)
                    
                    # Calculate median of minimum differences (in minutes)
                    median_diff = np.median(min_diffs) / 60
                    
                    # Closer to 0 means more synchronized
                    # Transform to 0-1 scale where 1 is highly synchronized
                    # 5 minutes or less is considered highly synchronized
                    sync_score = max(0, 1 - (median_diff / 5))
                    sync_scores.append(sync_score)
            
            return np.mean(sync_scores) if sync_scores else 0
        
        # Calculate cross-platform sync for each author
        sync_results = []
        for author, group in author_groups:
            sync_score = calculate_cross_platform_sync(group)
            sync_results.append({
                'author_handle': author,
                'cross_platform_sync': sync_score
            })
        
        if sync_results:
            sync_df = pd.DataFrame(sync_results)
            df = df.merge(sync_df, on='author_handle', how='left')
            df['cross_platform_sync'] = df['cross_platform_sync'].fillna(0)
        else:
            df['cross_platform_sync'] = 0
            
    # Graph-based features
    # Building interaction network and extract network features
    if 'author_handle' in df.columns:
        print("Building interaction network and extracting graph features...")
        
        # Build a graph of user interactions
        try:
            # Check if we have interaction data
            has_mentions = 'mention_count' in df.columns and df['mention_count'].sum() > 0
            has_replies = 'reply_count' in df.columns and df['reply_count'].sum() > 0
            
            if has_mentions or has_replies:
                # Extract mentioned users from text if available
                if 'post_text' in df.columns:
                    # Extract mentions (@username) from text
                    mention_pattern = re.compile(r'@(\w+)')
                    
                    # Function to extract mentions
                    def extract_mentions(text):
                        if not isinstance(text, str):
                            return []
                        return mention_pattern.findall(text)
                    
                    # Apply to all posts
                    df['mentioned_users'] = df['post_text'].apply(extract_mentions)
                    
                    # Build interaction graph
                    G = nx.DiGraph()
                    
                    # Add nodes for all users
                    all_users = df['author_handle'].unique()
                    G.add_nodes_from(all_users)
                    
                    # Add edges for mentions
                    for _, row in df.iterrows():
                        source = row['author_handle']
                        for target in row['mentioned_users']:
                            G.add_edge(source, target, type='mention')
                    
                    # Calculate network metrics
                    # 1. In-degree and out-degree
                    in_degree = dict(G.in_degree())
                    out_degree = dict(G.out_degree())
                    
                    # Convert to dataframes
                    in_degree_df = pd.DataFrame.from_dict(in_degree, orient='index', columns=['in_degree']).reset_index()
                    in_degree_df.columns = ['author_handle', 'in_degree']
                    
                    out_degree_df = pd.DataFrame.from_dict(out_degree, orient='index', columns=['out_degree']).reset_index()
                    out_degree_df.columns = ['author_handle', 'out_degree']
                    
                    # Merge with main dataframe
                    df = df.merge(in_degree_df, on='author_handle', how='left')
                    df = df.merge(out_degree_df, on='author_handle', how='left')
                    
                    # Fill NaN values
                    df['in_degree'] = df['in_degree'].fillna(0)
                    df['out_degree'] = df['out_degree'].fillna(0)
                    
                    # 2. Calculate ratios - bots often have skewed ratios
                    df['in_out_ratio'] = np.divide(
                        df['in_degree'],
                        df['out_degree'], 
                        out=np.zeros_like(df['in_degree'], dtype=float),
                        where=df['out_degree'] != 0
                    )
                    
                    # 3. Calculate clustering coefficient - human users tend to have higher clustering
                    if len(G.nodes) > 0:
                        clustering = nx.clustering(G.to_undirected())
                        clustering_df = pd.DataFrame.from_dict(clustering, orient='index', columns=['clustering_coefficient']).reset_index()
                        clustering_df.columns = ['author_handle', 'clustering_coefficient']
                        
                        # Merge with main dataframe
                        df = df.merge(clustering_df, on='author_handle', how='left')
                        df['clustering_coefficient'] = df['clustering_coefficient'].fillna(0)
                    else:
                        df['clustering_coefficient'] = 0
                    
                    # 4. Community detection - bots often form distinctive communities
                    # Use a simple connected components approach for efficiency
                    # More advanced methods like Louvain could be used for larger datasets
                    communities = nx.weakly_connected_components(G)
                    
                    # Map users to communities
                    community_mapping = {}
                    for i, community in enumerate(communities):
                        for user in community:
                            community_mapping[user] = i
                            
                    # Create dataframe from mapping
                    community_df = pd.DataFrame.from_dict(community_mapping, orient='index', columns=['community_id']).reset_index()
                    community_df.columns = ['author_handle', 'community_id']
                    
                    # Merge with main dataframe
                    df = df.merge(community_df, on='author_handle', how='left')
                    df['community_id'] = df['community_id'].fillna(-1)
                    
                    # 5. Calculate community size for each user
                    community_sizes = community_df.groupby('community_id').size().reset_index(name='community_size')
                    community_df = community_df.merge(community_sizes, on='community_id')
                    
                    # Keep only relevant columns
                    community_df = community_df[['author_handle', 'community_size']]
                    
                    # Merge with main dataframe
                    df = df.merge(community_df, on='author_handle', how='left')
                    df['community_size'] = df['community_size'].fillna(1)
                    
                    # Calculate the ratio of connections within vs. outside community
                    # This requires more complex graph analysis, skipping for efficiency
            else:
                # No interaction data available
                for col in ['in_degree', 'out_degree', 'in_out_ratio', 
                           'clustering_coefficient', 'community_id', 'community_size']:
                    df[col] = 0
                
        except Exception as e:
            print(f"Error in graph analysis: {e}")
            # Ensure all graph columns exist
            for col in ['in_degree', 'out_degree', 'in_out_ratio', 
                       'clustering_coefficient', 'community_id', 'community_size']:
                df[col] = 0
            
    # Additional time-series features 
    # Detect automated posting patterns using time series analysis
    if 'author_handle' in df.columns and 'created_at_dt' in df.columns:
        print("Analyzing temporal behavioral patterns...")
        
        def analyze_posting_shape(timestamps):
            """Analyze the temporal shape of posting behavior"""
            if len(timestamps) < 24:  # Need enough data points
                return {
                    'burst_ratio': 0,
                    'shape_entropy': 0,
                    'max_burst_size': 0
                }
            
            # Convert to hours for 24-hour analysis
            hours = pd.Series(timestamps).dt.hour
            
            # Create 24-hour histogram
            hour_counts = hours.value_counts().sort_index()
            
            # Ensure all 24 hours are represented
            for hour in range(24):
                if hour not in hour_counts.index:
                    hour_counts[hour] = 0
            
            hour_counts = hour_counts.sort_index()
            
            # 1. Calculate shape entropy
            total_posts = hour_counts.sum()
            if total_posts == 0:
                return {
                    'burst_ratio': 0,
                    'shape_entropy': 0,
                    'max_burst_size': 0
                }
                
            probabilities = hour_counts / total_posts
            probabilities = probabilities[probabilities > 0]  # Remove zeros
            shape_entropy = -np.sum(probabilities * np.log2(probabilities))
            
            # 2. Detect posting bursts
            # Define a burst as consecutive hours with posts > mean
            mean_posts = hour_counts.mean()
            bursts = []
            current_burst = []
            
            for hour, count in hour_counts.items():
                if count > mean_posts:
                    current_burst.append((hour, count))
                elif current_burst:
                    bursts.append(current_burst)
                    current_burst = []
            
            # Add the last burst if it exists
            if current_burst:
                bursts.append(current_burst)
            
            # Calculate burst statistics
            max_burst_size = max([len(burst) for burst in bursts]) if bursts else 0
            burst_posts = sum([sum(count for _, count in burst) for burst in bursts])
            burst_ratio = burst_posts / total_posts if total_posts > 0 else 0
            
            return {
                'burst_ratio': burst_ratio,
                'shape_entropy': shape_entropy,
                'max_burst_size': max_burst_size
            }
        
        # Calculate posting shape metrics for each author
        shape_results = []
        for author, group in author_groups:
            if len(group) >= 24:  # Need enough posts
                shape_metrics = analyze_posting_shape(group['created_at_dt'])
                shape_metrics['author_handle'] = author
                shape_results.append(shape_metrics)
        
        if shape_results:
            shape_df = pd.DataFrame(shape_results)
            df = df.merge(shape_df, on='author_handle', how='left')
            
            # Fill NaN values
            for col in ['burst_ratio', 'shape_entropy', 'max_burst_size']:
                df[col] = df[col].fillna(0)
        else:
            for col in ['burst_ratio', 'shape_entropy', 'max_burst_size']:
                df[col] = 0
    
    # Calculate posting regularity
    if 'author_handle' in df.columns and 'created_at_dt' in df.columns:
        # Use created_at_dt column
        posting_regularity = df.groupby('author_handle')['created_at_dt'].apply(
            lambda x: calculate_posting_regularity(x) if len(x) > 1 else 0
        )
        df = df.merge(posting_regularity.reset_index(name='posting_regularity'), on='author_handle', how='left')
        
        # Coordination detection
        # Detect coordinated posting activity across multiple users
        print("Detecting coordinated behavior patterns...")
        
        # Group users by posting patterns
        
        # 1. Create a signature for each user's posting pattern using binned activity
        def create_temporal_signature(group, bin_width_minutes=60):
            # Sort by timestamp
            sorted_group = group.sort_values('created_at_dt')
            
            if len(sorted_group) < 5:  # Need enough posts
                return None
                
            # Get timestamps as unix time (seconds)
            timestamps = sorted_group['created_at_dt'].astype(np.int64) // 10**9
            
            # Calculate intervals (in minutes)
            intervals = np.diff(timestamps) / 60
            
            # Bin intervals
            bins = np.arange(0, 24*60+bin_width_minutes, bin_width_minutes)  # 24 hours of bins
            hist, _ = np.histogram(intervals, bins=bins)
            
            # Normalize
            if hist.sum() > 0:
                hist = hist / hist.sum()
                
            return hist
            
        # Create temporal signatures for users with enough posts
        user_signatures = {}
        for author, group in author_groups:
            if len(group) >= 5:
                signature = create_temporal_signature(group)
                if signature is not None:
                    user_signatures[author] = signature
        
        # Calculate coordination scores between user pairs
        if len(user_signatures) > 1:
            coordination_scores = defaultdict(float)
            
            # Limit number of comparisons for very large datasets
            max_users_to_compare = 5000
            user_list = list(user_signatures.keys())
            if len(user_list) > max_users_to_compare:
                # Sample users for comparison
                import random
                random.seed(42)  # For reproducibility
                user_list = random.sample(user_list, max_users_to_compare)
            
            for i, user1 in enumerate(user_list[:-1]):
                sig1 = user_signatures[user1]
                for user2 in user_list[i+1:]:
                    sig2 = user_signatures[user2]
                    
                    # Calculate cosine similarity between temporal signatures
                    similarity = np.sum(sig1 * sig2) / (np.sqrt(np.sum(sig1**2)) * np.sqrt(np.sum(sig2**2)) + 1e-10)
                    
                    # Store coordination score for both users
                    coordination_scores[user1] = max(coordination_scores[user1], similarity)
                    coordination_scores[user2] = max(coordination_scores[user2], similarity)
            
            # Convert to dataframe
            coordination_df = pd.DataFrame([
                {'author_handle': user, 'coordination_score': score}
                for user, score in coordination_scores.items()
            ])
            
            # Merge with main dataframe
            df = df.merge(coordination_df, on='author_handle', how='left')
            df['coordination_score'] = df['coordination_score'].fillna(0)
        else:
            df['coordination_score'] = 0
    
    return df

def calculate_posting_regularity(times):
    """Calculate the regularity of posting times (lower coefficient of variation means more regular)"""
    if len(times) <= 1 or times.isna().all():
        return 0
    
    # Filter out NaN values
    times = times.dropna()
    if len(times) <= 1:
        return 0
    
    try:
        # Calculate time differences between consecutive posts
        time_diffs = np.diff(times.astype(np.int64)) / 10**9  # Convert to seconds
        
        # Calculate coefficient of variation (lower means more regular)
        if len(time_diffs) == 0 or np.mean(time_diffs) == 0:
            return 0
        
        cv = np.std(time_diffs) / np.mean(time_diffs)
        return cv if not np.isnan(cv) and not np.isinf(cv) else 0
    except Exception as e:
        print(f"Error calculating posting regularity: {e}")
        return 0

def detect_anomalies(df, features):
    """Use unsupervised learning to detect anomalies in the data"""
    print("Detecting anomalies...")
    
    try:
        # Scale features
        scaler = StandardScaler()
        X = scaler.fit_transform(df[features].fillna(0))
        
        # Use Isolation Forest for anomaly detection
        isolation_forest = IsolationForest(
            n_estimators=100, 
            contamination=0.1,  # Assume 10% of data points are anomalies
            max_samples='auto',
            random_state=42
        )
        df['anomaly_score'] = isolation_forest.fit_predict(X)
        df['is_anomaly'] = (df['anomaly_score'] == -1).astype(int)
        
        # Use DBSCAN for density-based clustering
        try:
            # First try with default sklearn parameters
            dbscan = DBSCAN(eps=0.5, min_samples=5)
            df['cluster'] = dbscan.fit_predict(X)
        except TypeError as e:
            # If that fails, try older sklearn version compatibility
            print(f"DBSCAN parameter error: {e}. Trying alternative parameters.")
            dbscan = DBSCAN(eps=0.5, min_samples=5)
            df['cluster'] = dbscan.fit_predict(X)
        
        # Use K-means to identify unusual clusters
        # Cap the number of clusters to prevent errors with small datasets
        num_clusters = min(8, len(df))
        if num_clusters < 2:
            num_clusters = 2  # Minimum of 2 clusters required
        
        try:
            # First try with default sklearn parameters
            kmeans = KMeans(n_clusters=num_clusters, random_state=42)
            df['kmeans_cluster'] = kmeans.fit_predict(X)
        except TypeError as e:
            # If that fails, try older sklearn version compatibility
            print(f"KMeans parameter error: {e}. Trying alternative parameters.")
            kmeans = KMeans(n_clusters=num_clusters, random_state=42)
            df['kmeans_cluster'] = kmeans.fit_predict(X)
        
        # Count instances in each cluster
        cluster_counts = df['kmeans_cluster'].value_counts()
        # Consider clusters with less than 5% of the data as small
        small_clusters = cluster_counts[cluster_counts < len(df) * 0.05].index
        
        # Mark as suspicious if in a small cluster
        df['in_small_cluster'] = df['kmeans_cluster'].isin(small_clusters).astype(int)
        
        # Use PCA to identify outliers in the principal component space
        if len(df) > 10 and len(features) > 1:
            # Reduce dimensions for efficiency
            n_components = min(5, len(features), len(df) - 1)
            if n_components > 0:  # Ensure positive number of components
                pca = PCA(n_components=n_components)
                pca_result = pca.fit_transform(X)
                
                # Mahalanobis distance calculation
                mu = np.mean(pca_result, axis=0)
                
                try:
                    # Using covariance calculation with regularization
                    cov = np.cov(pca_result, rowvar=False)
                    # Add small constant to diagonal to ensure positive definiteness
                    cov += np.eye(cov.shape[0]) * 1e-6
                    inv_cov = np.linalg.inv(cov)
                    
                    # Vectorized calculation for speed
                    mahalanobis_dist = []
                    for i in range(len(pca_result)):
                        delta = pca_result[i] - mu
                        dist = np.sqrt(delta.dot(inv_cov).dot(delta))
                        mahalanobis_dist.append(dist)
                    
                except np.linalg.LinAlgError:
                    # Fallback for singular matrix - use Euclidean distance
                    print("Singular matrix in Mahalanobis calculation. Using Euclidean distance.")
                    mahalanobis_dist = []
                    for i in range(len(pca_result)):
                        dist = np.linalg.norm(pca_result[i] - mu)
                        mahalanobis_dist.append(dist)
                
                df['mahalanobis_dist'] = mahalanobis_dist
                
                # Define outliers as points with distance > 3 standard deviations
                threshold = np.mean(mahalanobis_dist) + 3 * np.std(mahalanobis_dist)
                df['pca_outlier'] = (df['mahalanobis_dist'] > threshold).astype(int)
            else:
                df['mahalanobis_dist'] = 0
                df['pca_outlier'] = 0
        else:
            df['mahalanobis_dist'] = 0
            df['pca_outlier'] = 0
        
        # Generate pseudo-labels based on combined anomaly detection
        df['pseudo_label'] = 0  # Default: human
        
        # Mark as potential bots if any detection method flags them
        df.loc[(df['is_anomaly'] == 1) | 
               (df['in_small_cluster'] == 1) | 
               (df['pca_outlier'] == 1), 'pseudo_label'] = 1
        
        # Additional rule for users in the DBSCAN noise cluster (-1) with high Mahalanobis distance
        if 'mahalanobis_dist' in df.columns and df['mahalanobis_dist'].max() > 0:
            threshold = df['mahalanobis_dist'].mean() + 2 * df['mahalanobis_dist'].std()
            df.loc[(df['cluster'] == -1) & 
                   (df['mahalanobis_dist'] > threshold * 0.8), 'pseudo_label'] = 1
    
    except Exception as e:
        import traceback
        print(f"Error in anomaly detection: {e}")
        print(traceback.format_exc())
        # Ensure we still have the necessary columns even if the process fails
        df['pseudo_label'] = df.get('pseudo_label', 0)
        if 'is_anomaly' not in df.columns:
            df['is_anomaly'] = 0
    
    # Clean up memory
    gc.collect()
    
    return df

def train_ensemble_classifier(df, features):
    """Train an ensemble classifier to detect bots using advanced methods"""
    print("Training ensemble classifier...")
    
    try:
        # Prepare final dataset
        X_final = df[features].fillna(0)
        y_final = df['pseudo_label']
        
        # Check if we have enough data for meaningful split
        if len(df) < 10 or len(np.unique(y_final)) < 2:
            print("Warning: Dataset too small or lacks class diversity for proper training.")
            # Simple fallback: assign probability based on anomaly detection
            df['bot_probability'] = df['is_anomaly'].astype(float)
            df['is_bot'] = df['is_anomaly']
            return df, None
        
        # Split data - adaptive test size based on dataset size
        # For very large datasets, use smaller test split to save memory
        if len(df) > 500000:
            test_size = 0.05
        elif len(df) > 100000:
            test_size = 0.1
        else:
            test_size = 0.2
            
        X_train, X_test, y_train, y_test = train_test_split(
            X_final, y_final, test_size=test_size, random_state=42, stratify=y_final if len(np.unique(y_final)) > 1 else None
        )
        
        # Optimize classifier parameters based on dataset size
        if len(X_train) > 100000:
            # For very large datasets, use fewer trees but ensure enough for accuracy
            n_estimators = max(50, min(100, int(len(X_train) / 10000)))
            max_depth = min(8, max(3, int(np.log2(len(X_train)))))
            min_samples_split = max(5, min(20, int(len(X_train) / 50000)))
        else:
            # For smaller datasets, use more trees for better accuracy
            n_estimators = 100
            max_depth = min(10, max(3, int(np.log2(len(X_train)) + 2)))
            min_samples_split = 5
        
        print(f"Training with {n_estimators} trees, max_depth={max_depth}")
        
        # Using ExtraTreesClassifier with optimized parameters
        try:
            et_classifier = ExtraTreesClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                class_weight='balanced',
                random_state=42
            )
            
            # Add batch training for very large datasets to avoid memory issues
            if len(X_train) > 500000:
                # Train in batches
                batch_size = 100000
                for i in range(0, len(X_train), batch_size):
                    end = min(i + batch_size, len(X_train))
                    print(f"Training batch {i//batch_size + 1}/{(len(X_train) + batch_size - 1)//batch_size}")
                    if i == 0:
                        # First batch - fit the classifier
                        et_classifier.fit(X_train[i:end], y_train[i:end])
                    else:
                        # Subsequent batches - can't easily do partial_fit with trees, 
                        # so we'll just use the first batch for large datasets
                        pass
            else:
                # Standard training for normal-sized datasets
                et_classifier.fit(X_train, y_train)
            
            # Make predictions
            y_pred = et_classifier.predict(X_test)
            y_prob = et_classifier.predict_proba(X_test)[:, 1]
            
            # Evaluate
            print(classification_report(y_test, y_pred))
            print(confusion_matrix(y_test, y_pred))
            
            # Apply to full dataset, using batches for very large datasets
            if len(X_final) > 500000:
                bot_probs = []
                batch_size = 100000
                for i in range(0, len(X_final), batch_size):
                    end = min(i + batch_size, len(X_final))
                    batch_probs = et_classifier.predict_proba(X_final[i:end])[:, 1]
                    bot_probs.extend(batch_probs)
                df['bot_probability'] = bot_probs
            else:
                df['bot_probability'] = et_classifier.predict_proba(X_final)[:, 1]
            
            # Find optimal threshold using precision-recall curve
            precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
            f1_scores = 2 * recall * precision / (recall + precision + 1e-10)
            
            # Handle edge case where f1_scores might be all NaN
            if np.all(np.isnan(f1_scores)):
                optimal_threshold = 0.5
            else:
                optimal_idx = np.argmax(f1_scores)
                if len(thresholds) > optimal_idx:
                    optimal_threshold = thresholds[optimal_idx]
                else:
                    optimal_threshold = 0.5
            
            print(f"Optimal threshold: {optimal_threshold}")
            
            df['is_bot'] = (df['bot_probability'] > optimal_threshold).astype(int)
            
        except Exception as e:
            print(f"Error in ExtraTreesClassifier: {e}")
            # Fallback to a simpler classifier if ExtraTrees fails
            print("Falling back to RandomForestClassifier")
            rf_classifier = RandomForestClassifier(
                n_estimators=min(50, n_estimators),
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                class_weight='balanced',
                random_state=42
            )
            rf_classifier.fit(X_train, y_train)
            df['bot_probability'] = rf_classifier.predict_proba(X_final)[:, 1]
            df['is_bot'] = (df['bot_probability'] > 0.5).astype(int)
            et_classifier = rf_classifier  # Return the RF as the classifier
    
    except Exception as e:
        import traceback
        print(f"Error in ensemble training: {e}")
        print(traceback.format_exc())
        # Ensure we have the necessary columns even if training fails
        if 'bot_probability' not in df.columns:
            print("Training failed. Using anomaly scores as fallback.")
            df['bot_probability'] = df['is_anomaly'].astype(float)
        if 'is_bot' not in df.columns:
            df['is_bot'] = df['is_anomaly']
        et_classifier = None
    
    # Clean up memory
    gc.collect()
    
    return df, et_classifier


def aggregate_results(df, optimal_threshold):
    """Aggregated results by author to get author-level bot probabilities with consistent classification"""
    print("Aggregating results by author...")
    
    try:
        # Check if author_handle column exists
        if 'author_handle' not in df.columns:
            if 'author_did' in df.columns:
                df['author_handle'] = df['author_did']  # Use author_did as fallback
                print("Using 'author_did' column as author identifier")
            else:
                # Create dummy author column
                print("No author identifier found. Creating dummy identifiers.")
                df['author_handle'] = [f"user_{i}" for i in range(len(df))]
        
        # Filter to users with 2+ posts
        post_counts = df['author_handle'].value_counts()
        users_with_multiple_posts = post_counts[post_counts >= 2].index
        
        # Filter the dataframe to include only these users
        df_filtered = df[df['author_handle'].isin(users_with_multiple_posts)].reset_index(drop=True)
        
        # If no users have multiple posts, use the original dataset
        if len(df_filtered) == 0:
            print("Warning: No users with 2+ posts found. Using all users.")
            df_filtered = df
        else:
            print(f"Found {len(users_with_multiple_posts)} users with 2+ posts")
        
        # Aggregate results by author
        author_results = df_filtered.groupby('author_handle')['bot_probability'].mean().reset_index()
        
        # Add post count information
        author_post_counts = df_filtered.groupby('author_handle').size().reset_index(name='post_count')
        author_results = author_results.merge(author_post_counts, on='author_handle', how='left')
        
        # Add confidence score based on post count (more posts = higher confidence)
        author_results['confidence'] = 1 - (1 / (1 + np.log1p(author_results['post_count'])))
        
        # Use the optimal threshold on author-level bot probability
        author_results['is_bot'] = (author_results['bot_probability'] > optimal_threshold).astype(int)
        
        # Add additional metrics if available
        for metric in ['content_similarity', 'posting_regularity', 'benford_deviation', 
                      'coordination_score', 'posting_periodicity', 'burst_ratio']:
            if metric in df_filtered.columns:
                metric_agg = df_filtered.groupby('author_handle')[metric].mean().reset_index()
                author_results = author_results.merge(metric_agg, on='author_handle', how='left')
        
        # Add validation to check for inconsistencies
        if 'is_bot' in df.columns:
            # Calculate the original model-based classification for comparison
            is_bot_model = df_filtered.groupby('author_handle')['is_bot'].agg(
                lambda x: x.value_counts().index[0] if len(x) > 0 else 0
            ).reset_index()
            is_bot_model.columns = ['author_handle', 'is_bot_model']
            
            # Merge with results
            temp_results = author_results.merge(is_bot_model, on='author_handle', how='left')
            
            # Check for mismatches
            mismatches = (temp_results['is_bot'] != temp_results['is_bot_model']).sum()
        
        print(f"Final bot detection results: {author_results['is_bot'].sum()} bots out of {len(author_results)} total accounts")
    
    except Exception as e:
        import traceback
        print(f"Error in result aggregation: {e}")
        print(traceback.format_exc())
        
        # Create a minimal result set in case of errors
        if 'author_handle' in df.columns and 'bot_probability' in df.columns:
            print("Creating simplified results due to aggregation error")
            author_results = df.groupby('author_handle')['bot_probability'].mean().reset_index()
            author_results['is_bot'] = (author_results['bot_probability'] > optimal_threshold).astype(int)
            author_results['post_count'] = df.groupby('author_handle').size().reset_index(name='count')['count']
            author_results['confidence'] = 0.5  # Default confidence
        else:
            print("Cannot create valid results due to missing data")
            # Create dummy results
            author_results = pd.DataFrame({
                'author_handle': ['error_state'],
                'bot_probability': [0.0],
                'is_bot': [0],
                'post_count': [0],
                'confidence': [0.0]
            })
    
    return author_results


def validate_and_save_results(author_results, output_file, optimal_threshold):
    """Validate and save the results to ensure consistency"""
    print("Validating and saving results...")
    
    # Final validation check
    mismatch_count = ((author_results['bot_probability'] > optimal_threshold) != 
                      (author_results['is_bot'] == 1)).sum()
    
    if mismatch_count > 0:
        print(f"ERROR: {mismatch_count} accounts have inconsistent is_bot classifications!")
        print("Fixing inconsistencies before saving...")
        
        # Fix inconsistencies
        author_results['is_bot'] = (author_results['bot_probability'] > optimal_threshold).astype(int)
    
    # Save results
    author_results.to_csv(output_file, index=False)
    print(f"Results validated and saved to {output_file}")
    print(f"Detected {author_results['is_bot'].sum()} bot accounts out of {len(author_results)} total accounts")

def visualize_results(df, author_results, classifier=None, features=None):
    """Visualize the results of bot detection with improved visualizations and error handling"""
    print("Generating visualizations...")
    
    try:
        # 1. Distribution of bot probabilities
        plt.figure(figsize=(10, 6))
        sns.histplot(df['bot_probability'], bins=50, kde=True)
        plt.title('Distribution of Bot Probabilities')
        plt.xlabel('Bot Probability')
        plt.ylabel('Count')
        plt.savefig('bot_probability_distribution.png')
        plt.close()
        
        # 2. Feature importance - only if we have a classifier and features
        if classifier is not None and hasattr(classifier, 'feature_importances_') and features is not None:
            # Verify feature_importances_ has the right length
            if len(classifier.feature_importances_) == len(features):
                plt.figure(figsize=(12, 8))
                feature_importance = pd.DataFrame({
                    'Feature': features,
                    'Importance': classifier.feature_importances_
                }).sort_values('Importance', ascending=False)
                
                # Only plot top 20 or fewer if less are available
                top_n = min(20, len(feature_importance))
                sns.barplot(x='Importance', y='Feature', data=feature_importance.head(top_n))
                plt.title(f'Top {top_n} Feature Importances')
                plt.tight_layout()
                plt.savefig('feature_importance.png')
                plt.close()
            else:
                print(f"Feature importance array length ({len(classifier.feature_importances_)}) doesn't match feature count ({len(features)})")
        
        # 3. Bot vs Human comparison
        plt.figure(figsize=(12, 8))
        bot_authors = author_results[author_results['is_bot'] == 1]['author_handle'].values
        
        if len(bot_authors) > 0:
            bot_data = df[df['author_handle'].isin(bot_authors)]
            human_data = df[~df['author_handle'].isin(bot_authors)]
            
            # Select comparison features that exist in the dataframe
            potential_features = ['post_count', 'text_length', 'engagement_ratio', 'content_similarity', 
                                'posting_periodicity', 'benford_deviation']
            comparison_features = [f for f in potential_features if f in df.columns]
            
            # Limit to at most 6 features for visualization clarity
            comparison_features = comparison_features[:6]
            
            if len(comparison_features) > 0:
                for i, feature in enumerate(comparison_features):
                    if i < len(comparison_features):  # Ensure we don't exceed comparison features count
                        plt.subplot(2, (len(comparison_features) + 1) // 2, i+1)
                        
                        # Ensure there's data to plot
                        if len(bot_data) > 0 and feature in bot_data.columns:
                            bot_values = bot_data[feature].fillna(0).values
                            if len(bot_values) > 0 and not np.all(np.isnan(bot_values)):
                                sns.kdeplot(bot_values, label='Bot')
                                
                        if len(human_data) > 0 and feature in human_data.columns:
                            human_values = human_data[feature].fillna(0).values
                            if len(human_values) > 0 and not np.all(np.isnan(human_values)):
                                sns.kdeplot(human_values, label='Human')
                                
                        plt.title(f'{feature} Distribution')
                        plt.legend()
                
                plt.tight_layout()
                plt.savefig('bot_human_comparison.png')
            else:
                print("No valid comparison features found")
        else:
            print("No bot authors found for comparison visualization")
            
        plt.close()
        
        # 4. NEW: Temporal activity heatmap (hour of day vs day of week)
        if 'hour_of_day' in df.columns and 'day_of_week' in df.columns and 'author_handle' in df.columns:
            try:
                plt.figure(figsize=(12, 8))
                
                # Compare hour of day vs day of week for bots vs humans
                if len(bot_authors) > 0:
                    # Create pivot tables for heatmaps
                    bot_pivot = pd.crosstab(
                        df[df['author_handle'].isin(bot_authors)]['day_of_week'],
                        df[df['author_handle'].isin(bot_authors)]['hour_of_day'],
                        normalize='all'
                    )
                    
                    human_pivot = pd.crosstab(
                        df[~df['author_handle'].isin(bot_authors)]['day_of_week'],
                        df[~df['author_handle'].isin(bot_authors)]['hour_of_day'],
                        normalize='all'
                    )
                    
                    # Plot bot heatmap
                    plt.subplot(2, 1, 1)
                    sns.heatmap(bot_pivot, cmap='Reds', vmin=0, vmax=None)
                    plt.title('Bot Posting Activity (Day vs Hour)')
                    plt.xlabel('Hour of Day (0-23)')
                    plt.ylabel('Day of Week (0=Mon, 6=Sun)')
                    
                    # Plot human heatmap
                    plt.subplot(2, 1, 2)
                    sns.heatmap(human_pivot, cmap='Blues', vmin=0, vmax=None)
                    plt.title('Human Posting Activity (Day vs Hour)')
                    plt.xlabel('Hour of Day (0-23)')
                    plt.ylabel('Day of Week (0=Mon, 6=Sun)')
                    
                    plt.tight_layout()
                    plt.savefig('posting_time_heatmap.png')
                    plt.close()
            except Exception as e:
                print(f"Error creating temporal heatmap: {e}")
    except Exception as e:
        import traceback
        print(f"Error in visualization: {e}")
        print(traceback.format_exc())
    
    return

def bot_detection_pipeline(merged_df, output_file):
    """Complete bot detection pipeline"""
    print("Starting enhanced bot detection pipeline...")
    
    try:
        # Preprocess data (filtering to users with 2+ posts is done here)
        df = preprocess_data(merged_df)
        
        # Verify we have enough data to proceed
        if len(df) == 0:
            print("Error: No data remaining after preprocessing")
            return None, merged_df
            
        # Check if we have enough users with 2+ posts
        authors = df['author_handle'].unique()
        post_counts = df['author_handle'].value_counts()
        multi_post_authors = post_counts[post_counts >= 2].index
        print(f"Found {len(authors)} unique authors, {len(multi_post_authors)} with 2+ posts")
        
        if len(multi_post_authors) < 10:
            print("Warning: Very few users with 2+ posts. Results may be less reliable.")
        
        # Engineer features
        df = engineer_features(df)
        
        # Analyze behavioral patterns
        df = analyze_behavioral_patterns(df)
        
        # Define features for model, including all advanced features
        all_features = [
            # Basic features
            'post_count', 'time_since_last_post', 'text_length', 'word_count',
            'text_entropy', 'engagement_ratio', 'content_similarity',
            'has_near_duplicate', 'hour_of_day', 'day_of_week',
            'reply_count', 'repost_count', 'like_count', 'happy', 'sad', 'neutral',
            'posting_regularity',
            
            # Advanced temporal features
            'avg_time_between_posts', 'std_time_between_posts', 'median_time_between_posts',
            'cv_time_between_posts', 'posting_time_entropy', 'is_working_hours',
            'is_sleeping_hours', 'is_weekend', 'posting_periodicity',
            'benford_deviation', 'burst_ratio', 'shape_entropy', 'max_burst_size',
            
            # Content features
            'lexical_diversity', 'hashtag_count', 'mention_count', 'url_count',
            
            # Engagement features
            'likes_to_reposts_ratio', 'avg_engagement', 'std_engagement', 'cv_engagement',
            
            # Sentiment features
            'sentiment_ratio', 'avg_sentiment', 'std_sentiment',
            
            # Coordination features
            'coordination_score', 'avg_content_similarity',
            
            # Network features
            'in_degree', 'out_degree', 'in_out_ratio', 'clustering_coefficient',
            'community_size',
        ]
        
        # Filter to only include features that exist in the dataframe
        features = [f for f in all_features if f in df.columns]
        
        # OPTIMIZATION: Check for highly correlated features to remove redundancy
        if len(features) > 10:
            try:
                # Calculate correlation matrix
                corr_matrix = df[features].corr().abs()
                
                # Create a mask for the upper triangle
                upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                
                # Find features with correlation > 0.95
                to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
                
                if len(to_drop) > 0:
                    print(f"Removing {len(to_drop)} highly correlated features: {to_drop}")
                    features = [f for f in features if f not in to_drop]
            except Exception as e:
                print(f"Error in correlation analysis: {e}")
                # Continue with original features
        
        print(f"Using {len(features)} features for model training")
        
        # Detect anomalies
        df = detect_anomalies(df, features)
        
        # Train ensemble classifier
        df, et_classifier = train_ensemble_classifier(df, features)
        
        # Extract the optimal threshold from the classifier training output
        # (This value is printed by train_ensemble_classifier function)
        import re
        from io import StringIO
        import sys
        
        # Redirect stdout to capture it
        old_stdout = sys.stdout
        new_stdout = StringIO()
        sys.stdout = new_stdout
        
        # Re-run the classification report to capture the threshold
        if et_classifier is not None and hasattr(et_classifier, 'predict_proba'):
            y_test = df['pseudo_label'].iloc[-int(len(df)*0.2):] # Approximate the test set
            y_prob = et_classifier.predict_proba(df[features].iloc[-int(len(df)*0.2):])[:, 1]
            from sklearn.metrics import precision_recall_curve
            precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
            f1_scores = 2 * recall * precision / (recall + precision + 1e-10)
            if np.all(np.isnan(f1_scores)):
                optimal_threshold = 0.5
            else:
                optimal_idx = np.argmax(f1_scores)
                if len(thresholds) > optimal_idx:
                    optimal_threshold = thresholds[optimal_idx]
                else:
                    optimal_threshold = 0.5
            print(f"Optimal threshold: {optimal_threshold}")
        else:
            optimal_threshold = 0.5
        
        # Restore stdout
        sys.stdout = old_stdout
        
        # Extract the threshold from the captured output
        output = new_stdout.getvalue()
        threshold_match = re.search(r"Optimal threshold: ([\d.]+)", output)
        
        if threshold_match:
            optimal_threshold = float(threshold_match.group(1))
        else:
            # If we can't find it in the output, get it from the original output log
            # assuming it's still in memory from the earlier run
            threshold_match = re.search(r"Optimal threshold: ([\d.]+)", " ".join(sys.stdout.buffer))
            if threshold_match:
                optimal_threshold = float(threshold_match.group(1))
            else:
                # Fall back to default if we can't extract it
                optimal_threshold = 0.5
        
        print(f"Using optimal threshold: {optimal_threshold}")
        
        # Aggregate results by author with the correct threshold
        author_results = aggregate_results(df, optimal_threshold)
        
        # Visualize results - only if we have a classifier
        if et_classifier is not None:
            try:
                visualize_results(df, author_results, et_classifier, features)
            except Exception as e:
                print(f"Error in visualization: {e}")
                # Continue even if visualization fails
        
        # Validate and save results with the correct threshold
        validate_and_save_results(author_results, output_file, optimal_threshold)
        
        print(f"Enhanced bot detection complete. Results saved to {output_file}")
        print(f"Detected {author_results['is_bot'].sum()} bot accounts out of {len(author_results)} total accounts")
        
        return author_results, df
        
    except Exception as e:
        import traceback
        print(f"Error in bot detection pipeline: {e}")
        print(traceback.format_exc())
        return None, merged_df
   

# Run the pipeline
# Load the data
df1 = pd.read_csv("Refugee_data_2023_bluesky_sentiment.csv")
df2 = pd.read_csv("Refugee_data_2024_bluesky_sentiment.csv")
# social_data_path = "data.csv"
keywords_data_path = "refugee_immigrant_keyword_list_v3.csv"

data_df = pd.concat([df1,df2])
data_df = data_df.drop('Unnamed: 0', axis=1)
keywords_df = pd.read_csv(keywords_data_path)
keywords_df = keywords_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Merge posts data with keyword-subcategory mapping
merged_df = data_df.merge(keywords_df, left_on="keyword", right_on="term", how="left")
# Convert 'created_at' column to datetime format
merged_df['created_at'] = pd.to_datetime(merged_df['created_at'], format='mixed', utc=True)
merged_df = merged_df[merged_df['created_at']<"2025-01-01 18:18:04.312000+0000"]
# Add 'neutral' column: 1 if both happy and sad are 0, else 0
merged_df['neutral'] = ((merged_df['happy'] == 0) & (merged_df['sad'] == 0)).astype(int)
results, processed_df = bot_detection_pipeline(merged_df, 'bot_detection_results.csv')


Starting enhanced bot detection pipeline...
Preprocessing data...
Filtered to 126091 users with 2 or more posts
created_at column type before conversion: datetime64[ns, UTC]
Sample values before conversion: ['2023-01-01T11:38:00.000000000' '2023-01-05T06:36:56.000000000']
First attempt conversion failures: 0
Final datetime conversion failures: 0 (0.00%)
Found 126091 unique authors, 126091 with 2+ posts
Engineering features...
Calculating content similarity...
Analyzing behavioral patterns...
Building interaction network and extracting graph features...
Analyzing temporal behavioral patterns...
Detecting coordinated behavior patterns...
Removing 4 highly correlated features: ['std_time_between_posts', 'shape_entropy', 'avg_content_similarity', 'out_degree']
Using 44 features for model training
Detecting anomalies...
Training ensemble classifier...
Training with 86 trees, max_depth=8
Training batch 1/9
Training batch 2/9
Training batch 3/9
Training batch 4/9
Training batch 5/9
Training b

In [7]:
#Code 2:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from datetime import datetime, timedelta
from scipy import sparse
import gc
import pickle
import time

# Disable parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set random seeds
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class FeatureEncoder(nn.Module):
    """Neural network for encoding user features"""
    def __init__(self, input_dim, hidden_dims=[128, 64, 32]):
        super(FeatureEncoder, self).__init__()
        layers = []
        prev_dim = input_dim
        
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.BatchNorm1d(dim))
            layers.append(nn.LeakyReLU())
            layers.append(nn.Dropout(0.2))
            prev_dim = dim
            
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)

class GraphEncoder(nn.Module):
    """Simple graph encoder using GNN principles"""
    def __init__(self, input_dim, hidden_dim=32, output_dim=16):
        super(GraphEncoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, adj_matrix):
        # Simple graph convolution: x' = Ax
        x = F.relu(self.fc1(x))
        x = torch.mm(adj_matrix, x)
        x = F.relu(self.fc2(x))
        return x

class BotDetector:
    """
    GMAE2-CGNN Bot Detector - Optimized for Research
    Based on "Unsupervised Social Bot Detection via Structural Information Theory" (2024)
    """
    
    def __init__(self, verbose=True, output_dir="bot_detection_results"):
        self.verbose = verbose
        self.output_dir = output_dir
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize components
        self.feature_encoder = None
        self.graph_encoder = None
        self.embedding_model = None
        self.user_features = None
        self.feature_matrix = None
        self.graph = None
        self.adj_matrix = None
        self.user_to_node = {}
        self.node_to_user = {}
        self.user_embeddings = None
        self.bot_scores = None
        self.unique_users = None
        
        # Statistics tracking
        self.run_stats = {
            'start_time': None,
            'end_time': None,
            'total_users': 0,
            'processed_users': 0,
            'detected_bots': 0,
            'bot_percentage': 0.0,
            'batches': []
        }
        
    def log(self, message):
        """Print log message if verbose is enabled"""
        if self.verbose:
            timestamp = datetime.now().strftime('%H:%M:%S')
            print(f"[{timestamp}] {message}")
    
    def preprocess_data(self, df, max_users=10000, min_posts=2, max_posts=50):
        """Preprocess the data"""
        self.log("Preprocessing data...")
        
        # Ensure created_at is datetime
        if not pd.api.types.is_datetime64_dtype(df['created_at']):
            df['created_at'] = pd.to_datetime(df['created_at'], format='mixed', utc=True)
        
        # Add neutral sentiment if not present
        if 'neutral' not in df.columns:
            df['neutral'] = ((df['happy'] == 0) & (df['sad'] == 0)).astype(int)
        
        # Filter users with enough posts
        user_counts = df['author_handle'].value_counts()
        qualified_users = user_counts[user_counts >= min_posts].index
        
        # Sample users if too many
        if len(qualified_users) > max_users:
            self.log(f"Sampling {max_users} users from {len(qualified_users)} qualified users")
            qualified_users = np.random.choice(qualified_users, max_users, replace=False)
        
        # Filter data to qualified users
        filtered_df = df[df['author_handle'].isin(qualified_users)].copy()
        
        # Sample posts for very active users
        if max_posts:
            user_samples = []
            for user in qualified_users:
                user_posts = filtered_df[filtered_df['author_handle'] == user]
                if len(user_posts) > max_posts:
                    user_posts = user_posts.sample(n=max_posts, random_state=SEED)
                user_samples.append(user_posts)
            filtered_df = pd.concat(user_samples)
        
        # Create user-node mappings
        unique_users = filtered_df['author_handle'].unique()
        self.user_to_node = {user: i for i, user in enumerate(unique_users)}
        self.node_to_user = {i: user for user, i in self.user_to_node.items()}
        self.unique_users = unique_users
        
        self.log(f"Preprocessing complete. {len(filtered_df)} posts from {len(unique_users)} users")
        return filtered_df
    
    def extract_features(self, df):
        """Extract user features with text embedding disabled for efficiency"""
        self.log("Extracting user features...")
        
        # Get unique users
        unique_users = list(self.user_to_node.keys())
        
        user_features = []
        for user in tqdm(unique_users, disable=not self.verbose):
            user_posts = df[df['author_handle'] == user]
            
            # Basic user metrics
            post_count = len(user_posts)
            
            # Temporal features
            post_times = pd.to_datetime(user_posts['created_at'])
            if post_count > 1:
                sorted_times = sorted(post_times)
                time_diffs = [(sorted_times[i] - sorted_times[i-1]).total_seconds() / 60 
                              for i in range(1, len(sorted_times))]
                
                avg_time_diff = np.mean(time_diffs)
                std_time_diff = np.std(time_diffs) if len(time_diffs) > 1 else 0
                
                # Hour distribution
                hours = post_times.dt.hour
                hour_counts = hours.value_counts(normalize=True)
                hour_entropy = -sum(p * np.log2(p) for p in hour_counts if p > 0)
                
                # Day of week pattern
                days = post_times.dt.dayofweek
                day_counts = days.value_counts(normalize=True)
                day_entropy = -sum(p * np.log2(p) for p in day_counts if p > 0)
            else:
                avg_time_diff = std_time_diff = hour_entropy = day_entropy = 0
            
            # Engagement metrics
            avg_replies = user_posts['reply_count'].mean()
            avg_reposts = user_posts['repost_count'].mean()
            avg_likes = user_posts['like_count'].mean()
            
            # Engagement ratios
            if post_count > 0:
                reply_ratio = user_posts['reply_count'].sum() / post_count
                repost_ratio = user_posts['repost_count'].sum() / post_count
                like_ratio = user_posts['like_count'].sum() / post_count
            else:
                reply_ratio = repost_ratio = like_ratio = 0
            
            # Sentiment metrics
            happy_ratio = user_posts['happy'].mean()
            sad_ratio = user_posts['sad'].mean()
            neutral_ratio = user_posts['neutral'].mean()
            
            # Sentiment entropy
            sentiment_counts = np.array([
                np.sum(user_posts['happy'] > 0),
                np.sum(user_posts['sad'] > 0),
                np.sum(user_posts['neutral'] > 0)
            ])
            sentiment_dist = sentiment_counts / np.sum(sentiment_counts) if np.sum(sentiment_counts) > 0 else np.ones(3)/3
            sentiment_entropy = -np.sum(sentiment_dist * np.log2(sentiment_dist + 1e-10))
            
            # Text metrics (using post_text instead of cleaned_text)
            if 'post_text' in user_posts.columns:
                text_lengths = user_posts['post_text'].astype(str).apply(len)
                avg_text_length = text_lengths.mean()
                std_text_length = text_lengths.std() if len(text_lengths) > 1 else 0
                cv_text_length = std_text_length / avg_text_length if avg_text_length > 0 else 0
            else:
                avg_text_length = std_text_length = cv_text_length = 0
            
            # Assemble features
            features = {
                'user': user,
                'post_count': post_count,
                'avg_time_diff': avg_time_diff,
                'std_time_diff': std_time_diff,
                'hour_entropy': hour_entropy,
                'day_entropy': day_entropy,
                'avg_replies': avg_replies,
                'avg_reposts': avg_reposts,
                'avg_likes': avg_likes,
                'reply_ratio': reply_ratio,
                'repost_ratio': repost_ratio,
                'like_ratio': like_ratio,
                'happy_ratio': happy_ratio,
                'sad_ratio': sad_ratio,
                'neutral_ratio': neutral_ratio,
                'sentiment_entropy': sentiment_entropy,
                'avg_text_length': avg_text_length,
                'std_text_length': std_text_length,
                'cv_text_length': cv_text_length
            }
            
            user_features.append(features)
        
        # Convert to DataFrame
        user_df = pd.DataFrame(user_features)
        self.user_features = user_df
        
        # Create feature matrix
        feature_cols = [col for col in user_df.columns if col != 'user']
        scaler = StandardScaler()
        self.feature_matrix = scaler.fit_transform(user_df[feature_cols])
        
        self.log(f"Extracted {len(feature_cols)} features for {len(user_df)} users")
        return user_df
    
    def construct_graph(self, df, temporal_window_days=14):
        """Construct graph using non-parallel approach"""
        self.log("Constructing temporal interaction graph...")
        
        # Create graph
        G = nx.Graph()
        for user, node_id in self.user_to_node.items():
            G.add_node(node_id, user=user)
        
        # Simplified approach to avoid parallelism deadlocks
        temporal_window = timedelta(days=temporal_window_days)
        
        # Get post times by user (non-parallel)
        user_post_times = {}
        for user in self.user_to_node.keys():
            user_posts = df[df['author_handle'] == user]
            user_post_times[user] = pd.to_datetime(user_posts['created_at']).tolist()
        
        # Create edges based on temporal patterns (simplified)
        edges = []
        users = list(self.user_to_node.keys())
        
        # Process in chunks to manage memory
        chunk_size = 100
        num_chunks = (len(users) + chunk_size - 1) // chunk_size
        
        for chunk_idx in tqdm(range(num_chunks), disable=not self.verbose):
            start_idx = chunk_idx * chunk_size
            end_idx = min((chunk_idx + 1) * chunk_size, len(users))
            user_chunk = users[start_idx:end_idx]
            
            for user1 in user_chunk:
                times1 = user_post_times[user1]
                # Only compare with a subset of users to reduce computation
                comparison_users = random.sample(users, min(200, len(users)))
                
                for user2 in comparison_users:
                    if user1 == user2:
                        continue
                        
                    times2 = user_post_times[user2]
                    
                    # Sample times for efficiency
                    max_samples = 10
                    if len(times1) > max_samples:
                        times1_sample = random.sample(times1, max_samples)
                    else:
                        times1_sample = times1
                        
                    if len(times2) > max_samples:
                        times2_sample = random.sample(times2, max_samples)
                    else:
                        times2_sample = times2
                    
                    # Count time-based interactions
                    interactions = 0
                    for t1 in times1_sample:
                        for t2 in times2_sample:
                            if abs(t1 - t2) < temporal_window:
                                interactions += 1
                    
                    # Normalize and add edge if interactions found
                    if interactions > 0:
                        weight = interactions / (len(times1_sample) * len(times2_sample))
                        edges.append((
                            self.user_to_node[user1],
                            self.user_to_node[user2],
                            weight
                        ))
        
        # Add edges to graph
        for u, v, w in edges:
            G.add_edge(u, v, weight=w)
            
        self.graph = G
        self.log(f"Graph constructed with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
        
        # Create adjacency matrix for the graph encoder (using updated API)
        adj = nx.adjacency_matrix(G, weight='weight')
        adj = adj.tocoo()
        
        # Normalize adjacency matrix
        degrees = np.array(adj.sum(axis=1)).flatten()
        degree_mat_inv_sqrt = 1.0 / np.sqrt(np.maximum(degrees, 1e-12))
        
        # Create dense adjacency matrix
        rows, cols = adj.row, adj.col
        adj_data = adj.data * degree_mat_inv_sqrt[rows] * degree_mat_inv_sqrt[cols]
        
        # Convert to dense tensor
        adj_dense = torch.zeros((G.number_of_nodes(), G.number_of_nodes()), device=device)
        for i in range(len(rows)):
            adj_dense[rows[i], cols[i]] = adj_data[i]
        
        self.adj_matrix = adj_dense
        
        return G
    
    def train_model(self, epochs=30, learning_rate=0.001):
        """Train the feature and graph encoders"""
        self.log("Training model...")
        
        # Initialize models
        input_dim = self.feature_matrix.shape[1]
        self.feature_encoder = FeatureEncoder(input_dim=input_dim).to(device)
        self.graph_encoder = GraphEncoder(
            input_dim=32,  # Match the output dimension of feature encoder
            hidden_dim=16,
            output_dim=8
        ).to(device)
        
        # Convert feature matrix to tensor
        X = torch.FloatTensor(self.feature_matrix).to(device)
        
        # Optimizer
        optimizer = Adam(list(self.feature_encoder.parameters()) + 
                         list(self.graph_encoder.parameters()),
                         lr=learning_rate)
        
        # Training loop
        self.feature_encoder.train()
        self.graph_encoder.train()
        
        losses = []
        for epoch in range(epochs):
            # Forward pass through feature encoder
            feature_embeddings = self.feature_encoder(X)
            
            # Forward pass through graph encoder
            graph_embeddings = self.graph_encoder(feature_embeddings, self.adj_matrix)
            
            # Self-supervised contrastive loss
            src, dst = torch.nonzero(self.adj_matrix > 0, as_tuple=True)
            
            if len(src) > 0:
                # Positive pairs (users who post at similar times)
                pos_scores = torch.sum(graph_embeddings[src] * graph_embeddings[dst], dim=1)
                pos_loss = -torch.mean(F.logsigmoid(pos_scores))
                
                # Negative sampling (random user pairs)
                neg_samples = 5
                neg_src = torch.randint(0, X.size(0), (len(src) * neg_samples,), device=device)
                neg_dst = torch.randint(0, X.size(0), (len(src) * neg_samples,), device=device)
                neg_scores = torch.sum(graph_embeddings[neg_src] * graph_embeddings[neg_dst], dim=1)
                neg_loss = -torch.mean(F.logsigmoid(-neg_scores))
                
                graph_loss = pos_loss + neg_loss
            else:
                graph_loss = torch.tensor(0.0, device=device)
            
            # Feature reconstruction loss
            reconstruction_loss = F.mse_loss(feature_embeddings, feature_embeddings.detach())
            
            # Combined loss
            loss = reconstruction_loss + 0.5 * graph_loss
            losses.append(loss.item())
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (epoch + 1) % 5 == 0 or epoch == 0:
                self.log(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")
        
        self.log("Training completed")
        return losses
    
    def generate_embeddings(self):
        """Generate user embeddings"""
        self.log("Generating embeddings...")
        
        # Set models to evaluation mode
        self.feature_encoder.eval()
        self.graph_encoder.eval()
        
        # Generate embeddings
        with torch.no_grad():
            X = torch.FloatTensor(self.feature_matrix).to(device)
            feature_embeddings = self.feature_encoder(X)
            final_embeddings = self.graph_encoder(feature_embeddings, self.adj_matrix)
            embeddings = final_embeddings.cpu().numpy()
        
        # Store user embeddings
        user_embeddings = {}
        for node_id, embedding in enumerate(embeddings):
            user = self.node_to_user.get(node_id)
            if user:
                user_embeddings[user] = embedding
        
        self.user_embeddings = user_embeddings
        return user_embeddings
    
    def detect_bots(self, num_clusters=2):
        """Detect bots using clustering"""
        self.log("Detecting bots...")
        
        if not self.user_embeddings:
            self.log("No embeddings found. Generating embeddings...")
            self.generate_embeddings()
        
        # Prepare data for clustering
        users = list(self.user_embeddings.keys())
        embeddings = np.array([self.user_embeddings[user] for user in users])
        
        # Apply KMeans clustering
        kmeans = KMeans(n_clusters=num_clusters, random_state=SEED, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        
        # Determine which cluster is bots
        # Bots typically form a more cohesive cluster
        cluster_sizes = {label: np.sum(labels == label) for label in range(num_clusters)}
        
        # Calculate cluster cohesion
        cluster_cohesion = {}
        for label in range(num_clusters):
            cluster_points = embeddings[labels == label]
            if len(cluster_points) > 1:
                # Use pairwise distances within cluster
                distances = []
                for i in range(min(len(cluster_points), 100)):  # Sample for efficiency
                    idx = np.random.choice(len(cluster_points))
                    point = cluster_points[idx]
                    dists = np.linalg.norm(cluster_points - point, axis=1)
                    distances.extend(dists.tolist())
                cluster_cohesion[label] = np.mean(distances)
            else:
                cluster_cohesion[label] = float('inf')
        
        # Bot cluster is typically smaller and more cohesive
        bot_scores = {}
        for label in range(num_clusters):
            # Normalized size (smaller = higher score)
            size_score = 1 - (cluster_sizes[label] / sum(cluster_sizes.values()))
            
            # Normalized cohesion (more cohesive = higher score)
            max_cohesion = max(cluster_cohesion.values())
            cohesion_score = 1 - (cluster_cohesion[label] / max_cohesion) if max_cohesion > 0 else 0
            
            # Combined score (higher = more likely to be bots)
            bot_scores[label] = 0.7 * size_score + 0.3 * cohesion_score
        
        # Bot cluster has highest score
        bot_cluster = max(bot_scores.items(), key=lambda x: x[1])[0]
        
        # Map users to bot probabilities
        user_bot_probs = {}
        for i, user in enumerate(users):
            # Calculate distance to centroids
            user_embedding = embeddings[i]
            distances = []
            
            for label in range(num_clusters):
                centroid = kmeans.cluster_centers_[label]
                dist = np.linalg.norm(user_embedding - centroid)
                distances.append(dist)
            
            # If this user's cluster is the bot cluster
            if labels[i] == bot_cluster:
                # Calculate probability based on distance to centroids
                total_dist = sum(distances)
                if total_dist > 0:
                    # Invert distance to centroid (closer = higher probability)
                    bot_prob = 1 - (distances[bot_cluster] / total_dist)
                    bot_prob = min(0.95, max(0.5, bot_prob))  # Clamp between 0.5 and 0.95
                else:
                    bot_prob = 0.75
            else:
                # Not in bot cluster
                total_dist = sum(distances)
                if total_dist > 0:
                    # Calculate probability (closer to bot centroid = higher)
                    human_prob = 1 - (distances[labels[i]] / total_dist)
                    bot_prob = 1 - human_prob
                    bot_prob = min(0.49, max(0.05, bot_prob))  # Clamp between 0.05 and 0.49
                else:
                    bot_prob = 0.25
            
            user_bot_probs[user] = bot_prob
        
        self.bot_scores = user_bot_probs
        
        # Log results
        bot_users = [user for user, prob in user_bot_probs.items() if prob >= 0.5]
        total_users = len(user_bot_probs)
        self.log(f"Detected {len(bot_users)} bots out of {total_users} users ({len(bot_users)/total_users*100:.1f}%)")
        
        return user_bot_probs
    
    def visualize_results(self, output_file=None):
        """Visualize bot detection results"""
        self.log("Visualizing results...")
        
        if not self.user_embeddings or not self.bot_scores:
            self.log("No results to visualize")
            return
        
        # Prepare data
        users = list(self.bot_scores.keys())
        embeddings = np.array([self.user_embeddings[user] for user in users])
        is_bot = np.array([self.bot_scores[user] >= 0.5 for user in users])
        
        # Use PCA for visualization
        pca = PCA(n_components=2)
        embeddings_2d = pca.fit_transform(embeddings)
        
        # Create plot
        plt.figure(figsize=(12, 10))
        
        # Plot human users
        plt.scatter(
            embeddings_2d[~is_bot, 0],
            embeddings_2d[~is_bot, 1],
            c='blue',
            alpha=0.7,
            label='Human'
        )
        
        # Plot bot users
        plt.scatter(
            embeddings_2d[is_bot, 0],
            embeddings_2d[is_bot, 1],
            c='red',
            alpha=0.7,
            label='Bot'
        )
        
        plt.title('Bot Detection Results', fontsize=16)
        plt.legend(fontsize=12)
        plt.grid(alpha=0.3)
        
        if output_file is None:
            output_file = os.path.join(self.output_dir, "bot_detection_results.png")
            
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        self.log(f"Visualization saved to {output_file}")
    
    def analyze_feature_importance(self):
        """Analyze which features are most important for bot detection"""
        self.log("Analyzing feature importance...")
        
        if not hasattr(self, 'user_features') or self.user_features is None or self.bot_scores is None:
            self.log("Feature importance analysis requires user features and bot scores")
            return None
        
        # Add bot labels to user features
        analysis_df = self.user_features.copy()
        analysis_df['is_bot'] = analysis_df['user'].map(
            {user: 1 if prob >= 0.5 else 0 for user, prob in self.bot_scores.items()}
        )
        
        # Calculate mean feature values for bots vs. humans
        feature_cols = [col for col in analysis_df.columns if col not in ['user', 'is_bot']]
        
        bot_means = analysis_df[analysis_df['is_bot'] == 1][feature_cols].mean()
        human_means = analysis_df[analysis_df['is_bot'] == 0][feature_cols].mean()
        
        # Calculate difference
        diff = bot_means - human_means
        abs_diff = diff.abs()
        
        # Sort by importance
        importance = abs_diff.sort_values(ascending=False)
        
        # Visualize top 10 features
        plt.figure(figsize=(14, 8))
        plt.subplot(1, 2, 1)
        importance[:10].plot(kind='bar')
        plt.title('Top 10 Discriminative Features', fontsize=14)
        plt.ylabel('Absolute Difference (Bot - Human)', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        
        # Plot comparison of top 5 features
        plt.subplot(1, 2, 2)
        top_features = importance.index[:5]
        
        bot_vals = bot_means[top_features].values
        human_vals = human_means[top_features].values
        
        x = np.arange(len(top_features))
        width = 0.35
        
        plt.bar(x - width/2, bot_vals, width, label='Bot')
        plt.bar(x + width/2, human_vals, width, label='Human')
        
        plt.title('Bot vs Human Feature Comparison', fontsize=14)
        plt.xticks(x, top_features, rotation=45, ha='right')
        plt.legend()
        plt.tight_layout()
        
        # Save plot
        importance_plot_path = os.path.join(self.output_dir, "feature_importance.png")
        plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
        self.log(f"Feature importance visualization saved to {importance_plot_path}")
        
        # Save detailed analysis to CSV
        feature_comparison = pd.DataFrame({
            'feature': feature_cols,
            'bot_mean': bot_means,
            'human_mean': human_means,
            'difference': diff,
            'abs_difference': abs_diff
        }).sort_values('abs_difference', ascending=False)
        
        importance_csv_path = os.path.join(self.output_dir, "feature_importance.csv")
        feature_comparison.to_csv(importance_csv_path, index=False)
        
        return importance
    
    def compare_with_baselines(self, df):
        """Compare with baseline methods from literature"""
        self.log("Comparing with baseline methods...")
        
        if not self.bot_scores:
            self.log("No bot detection results to compare with baselines")
            return None
        
        # 1. Simple activity-based heuristic (high posting frequency)
        user_post_counts = df['author_handle'].value_counts()
        activity_threshold = np.percentile(user_post_counts.values, 95)  # Top 5% most active
        activity_bots = set(user_post_counts[user_post_counts > activity_threshold].index)
        
        # 2. Temporal regularity (consistent posting patterns)
        temporal_bots = set()
        for user in tqdm(self.unique_users, desc="Analyzing temporal patterns", disable=not self.verbose):
            user_posts = df[df['author_handle'] == user]
            if len(user_posts) < 5:
                continue
                
            post_times = pd.to_datetime(user_posts['created_at'])
            sorted_times = sorted(post_times)
            time_diffs = [(sorted_times[i] - sorted_times[i-1]).total_seconds() / 60 
                         for i in range(1, len(sorted_times))]
            
            if len(time_diffs) > 1:
                # Calculate coefficient of variation (lower = more regular)
                cv = np.std(time_diffs) / np.mean(time_diffs) if np.mean(time_diffs) > 0 else float('inf')
                if cv < 0.5:  # Very regular posting pattern
                    temporal_bots.add(user)
        
        # 3. Sentiment-based (low variance in sentiment)
        sentiment_bots = set()
        for user in tqdm(self.unique_users, desc="Analyzing sentiment patterns", disable=not self.verbose):
            user_posts = df[df['author_handle'] == user]
            if len(user_posts) < 5:
                continue
                
            # Calculate sentiment entropy
            happy_count = np.sum(user_posts['happy'] > 0)
            sad_count = np.sum(user_posts['sad'] > 0)
            neutral_count = np.sum(user_posts['neutral'] > 0)
            
            total = happy_count + sad_count + neutral_count
            if total > 0:
                probs = np.array([happy_count, sad_count, neutral_count]) / total
                # Calculate entropy (higher = more diverse sentiment)
                entropy = -np.sum(probs * np.log2(probs + 1e-10))
                
                # Low entropy = uniform sentiment = suspicious
                if entropy < 0.8:  # Threshold for low sentiment diversity
                    sentiment_bots.add(user)
        
        # 4. Our method (GMAE2-CGNN)
        our_bots = set(user for user, prob in self.bot_scores.items() if prob >= 0.5)
        
        # Compare methods
        self.log(f"Activity-based: {len(activity_bots)} bots")
        self.log(f"Temporal regularity: {len(temporal_bots)} bots")
        self.log(f"Sentiment uniformity: {len(sentiment_bots)} bots")
        self.log(f"Our GMAE2-CGNN: {len(our_bots)} bots")
        
        # Calculate overlap between methods
        overlap_activity = len(our_bots.intersection(activity_bots))
        overlap_temporal = len(our_bots.intersection(temporal_bots))
        overlap_sentiment = len(our_bots.intersection(sentiment_bots))
        
        self.log(f"Overlap with activity-based: {overlap_activity} users ({overlap_activity/len(our_bots)*100:.1f}%)")
        self.log(f"Overlap with temporal: {overlap_temporal} users ({overlap_temporal/len(our_bots)*100:.1f}%)")
        self.log(f"Overlap with sentiment: {overlap_sentiment} users ({overlap_sentiment/len(our_bots)*100:.1f}%)")
        
        # Create comparison visualization
        plt.figure(figsize=(12, 10))
        
        # Venn diagram of bot detections (simplified using sets)
        from matplotlib_venn import venn3
        
        venn = venn3([activity_bots, temporal_bots, our_bots], 
               ('Activity', 'Temporal', 'GMAE2-CGNN'))
        plt.title('Bot Detection Method Comparison', fontsize=16)
        
        # Save comparison
        comparison_path = os.path.join(self.output_dir, "method_comparison.png")
        plt.savefig(comparison_path, dpi=300, bbox_inches='tight')
        self.log(f"Method comparison visualization saved to {comparison_path}")
        
        # Save detailed comparison to CSV
        all_users = set(self.user_to_node.keys())
        comparison_data = []
        
        for user in all_users:
            if user in self.bot_scores:
                comparison_data.append({
                    'user': user,
                    'activity_bot': 1 if user in activity_bots else 0,
                    'temporal_bot': 1 if user in temporal_bots else 0,
                    'sentiment_bot': 1 if user in sentiment_bots else 0,
                    'gmae2_cgnn_bot': 1 if user in our_bots else 0,
                    'gmae2_cgnn_score': self.bot_scores.get(user, 0)
                })
        
        comparison_df = pd.DataFrame(comparison_data)
        comparison_csv_path = os.path.join(self.output_dir, "method_comparison.csv")
        comparison_df.to_csv(comparison_csv_path, index=False)
        
        return {
            'activity_bots': activity_bots,
            'temporal_bots': temporal_bots,
            'sentiment_bots': sentiment_bots,
            'our_bots': our_bots,
            'comparison_df': comparison_df
        }
    
    def visualize_bot_network(self, max_nodes=1000):
        """Visualize the bot subgraph structure"""
        self.log("Visualizing bot network structure...")
        
        if not self.graph or not self.bot_scores:
            self.log("No graph or bot scores available")
            return
        
        # Get bot nodes
        bot_users = [user for user, prob in self.bot_scores.items() if prob >= 0.5]
        bot_nodes = [self.user_to_node[user] for user in bot_users if user in self.user_to_node]
        
        # Sample if too many
        if len(bot_nodes) > max_nodes:
            self.log(f"Sampling {max_nodes} nodes from {len(bot_nodes)} bot nodes for visualization")
            bot_nodes = random.sample(bot_nodes, max_nodes)
        
        # Extract bot subgraph
        bot_subgraph = self.graph.subgraph(bot_nodes)
        
        # Visualize
        plt.figure(figsize=(14, 12))
        
        # Get communities in the bot subgraph
        try:
            communities = nx.community.greedy_modularity_communities(bot_subgraph)
            self.log(f"Detected {len(communities)} communities in the bot network")
            
            # Assign colors to communities
            colors = plt.cm.rainbow(np.linspace(0, 1, len(communities)))
            node_colors = {}
            
            for i, community in enumerate(communities):
                for node in community:
                    node_colors[node] = colors[i]
            
            # Draw with community colors
            pos = nx.spring_layout(bot_subgraph, seed=42)
            
            # Draw nodes colored by community
            for node in bot_subgraph.nodes():
                nx.draw_networkx_nodes(
                    bot_subgraph, pos,
                    nodelist=[node],
                    node_color=[node_colors.get(node, 'gray')],
                    node_size=50,
                    alpha=0.8
                )
            
            # Draw edges
            nx.draw_networkx_edges(
                bot_subgraph, pos,
                edge_color='gray',
                width=0.5,
                alpha=0.5
            )
            
            plt.title(f'Bot Network Structure: {len(bot_nodes)} bots, {len(communities)} communities', fontsize=16)
            plt.axis('off')
            
        except Exception as e:
            self.log(f"Error detecting communities: {e}")
            
            # Fallback to basic visualization
            pos = nx.spring_layout(bot_subgraph, seed=42)
            nx.draw(bot_subgraph, pos, 
                    node_color='red',
                    node_size=30,
                    alpha=0.8,
                    edge_color='gray',
                    width=0.5)
            plt.title(f'Bot Network Structure (Sample of {len(bot_nodes)} bots)', fontsize=16)
            plt.axis('off')
        
        # Save visualization
        network_vis_path = os.path.join(self.output_dir, "bot_network.png")
        plt.savefig(network_vis_path, dpi=300, bbox_inches='tight')
        self.log(f"Bot network visualization saved to {network_vis_path}")
    
    def save_model(self, path=None):
        """Save the trained model for later use"""
        if path is None:
            path = os.path.join(self.output_dir, "gmae2_cgnn_model.pt")
            
        try:
            if not self.feature_encoder or not self.graph_encoder:
                self.log("No trained model to save")
                return False
                
            model_dict = {
                'feature_encoder': self.feature_encoder.state_dict(),
                'graph_encoder': self.graph_encoder.state_dict(),
                'user_to_node': self.user_to_node,
                'node_to_user': self.node_to_user,
                'bot_scores': self.bot_scores,
            }
            
            torch.save(model_dict, path)
            self.log(f"Model saved to {path}")
            return True
        except Exception as e:
            self.log(f"Error saving model: {e}")
            return False
    
    def load_model(self, path):
        """Load a trained model"""
        try:
            if not os.path.exists(path):
                self.log(f"Model file {path} not found")
                return False
            
            model_dict = torch.load(path, map_location=device)
            
            # Load user mappings
            self.user_to_node = model_dict['user_to_node']
            self.node_to_user = model_dict['node_to_user']
            self.bot_scores = model_dict['bot_scores']
            
            # Recreate models and load weights
            if 'feature_encoder' in model_dict:
                # Infer input dim from the first layer
                input_weight = next(iter(model_dict['feature_encoder'].items()))[1]
                input_dim = input_weight.shape[1]
                
                self.feature_encoder = FeatureEncoder(input_dim=input_dim).to(device)
                self.feature_encoder.load_state_dict(model_dict['feature_encoder'])
            
            if 'graph_encoder' in model_dict:
                # Get dimensions from the weights
                first_layer = list(model_dict['graph_encoder'].items())[0][1]
                input_dim = first_layer.shape[1]
                
                self.graph_encoder = GraphEncoder(input_dim=input_dim).to(device)
                self.graph_encoder.load_state_dict(model_dict['graph_encoder'])
            
            self.log(f"Model loaded from {path}")
            return True
        except Exception as e:
            self.log(f"Error loading model: {e}")
            return False
    
    def run_pipeline(self, df, sample_size=None):
        """Run the complete bot detection pipeline"""
        self.log("Running bot detection pipeline...")
        self.run_stats['start_time'] = datetime.now()
        
        # Sample data if needed
        if sample_size and len(df) > sample_size:
            self.log(f"Sampling {sample_size} rows from {len(df)} total rows")
            df_sample = df.sample(n=sample_size, random_state=SEED)
        else:
            df_sample = df
        
        # Process data
        processed_df = self.preprocess_data(df_sample)
        self.extract_features(processed_df)
        self.construct_graph(processed_df)
        self.train_model()
        self.generate_embeddings()
        bot_probs = self.detect_bots()
        
        # Generate research outputs
        self.visualize_results()
        self.analyze_feature_importance()
        self.compare_with_baselines(processed_df)
        self.visualize_bot_network()
        
        # Save model and results
        self.save_model()
        
        # Save results to CSV
        results_df = pd.DataFrame({
            'user': list(bot_probs.keys()),
            'bot_probability': list(bot_probs.values()),
            'is_bot': [1 if p >= 0.5 else 0 for p in bot_probs.values()]
        })
        
        csv_path = os.path.join(self.output_dir, "bot_detection_results.csv")
        results_df.to_csv(csv_path, index=False)
        self.log(f"Results saved to {csv_path}")
        
        # Update statistics
        num_bots = sum(1 for p in bot_probs.values() if p >= 0.5)
        total_users = len(bot_probs)
        bot_percentage = (num_bots / total_users) * 100 if total_users > 0 else 0
        
        self.run_stats['end_time'] = datetime.now()
        self.run_stats['total_users'] = total_users
        self.run_stats['detected_bots'] = num_bots
        self.run_stats['bot_percentage'] = bot_percentage
        
        # Save run statistics
        stats_path = os.path.join(self.output_dir, "run_statistics.json")
        with open(stats_path, 'w') as f:
            import json
            # Convert datetime to string
            stats_dict = self.run_stats.copy()
            stats_dict['start_time'] = str(stats_dict['start_time'])
            stats_dict['end_time'] = str(stats_dict['end_time'])
            json.dump(stats_dict, f, indent=2)
        
        return bot_probs

    def run_full_dataset_pipeline(self, df):
        """Run the bot detection pipeline on the full dataset using batching with proper deduplication"""
        self.log("Running bot detection pipeline on full dataset...")
        self.run_stats['start_time'] = datetime.now()
        
        # First, preprocess all data
        self.log("Preprocessing full dataset...")
        # Ensure datetime format for created_at
        if not pd.api.types.is_datetime64_dtype(df['created_at']):
            df['created_at'] = pd.to_datetime(df['created_at'], format='mixed', utc=True, errors='coerce')
        
        # Add neutral column if not present
        if 'neutral' not in df.columns:
            df['neutral'] = ((df['happy'] == 0) & (df['sad'] == 0)).astype(int)
        
        # Get all users with at least min_posts
        min_posts = 2  # The minimum post threshold from original code
        user_counts = df['author_handle'].value_counts()
        qualified_users = user_counts[user_counts >= min_posts].index.tolist()
        total_qualified_users = len(qualified_users)
        self.log(f"Total qualified users with at least {min_posts} posts: {total_qualified_users}")
        
        # Track processed users to avoid duplicates
        processed_users = set()
        all_bot_probs = {}
        
        # Process in batches
        batch_size = 5000  # As used in the original code
        num_batches = (total_qualified_users + batch_size - 1) // batch_size
        
        for batch_idx in range(num_batches):
            self.log(f"Processing batch {batch_idx+1}/{num_batches}")
            
            # Get users for this batch that haven't been processed yet
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, total_qualified_users)
            batch_candidates = qualified_users[start_idx:end_idx]
            
            # Filter out already processed users
            batch_users = [user for user in batch_candidates if user not in processed_users]
            
            if not batch_users:
                self.log(f"Skipping batch {batch_idx+1} - all users already processed")
                continue
                
            self.log(f"Processing {len(batch_users)} new users in batch {batch_idx+1}")
            
            # Extract data for these users
            batch_df = df[df['author_handle'].isin(batch_users)]
            
            # Reset state for new batch
            self.user_to_node = {}
            self.node_to_user = {}
            self.graph = None
            self.adj_matrix = None
            self.user_embeddings = None
            self.bot_scores = None
            
            # Run the pipeline on this batch
            try:
                self.preprocess_data(batch_df, max_users=len(batch_users), min_posts=min_posts)
                self.extract_features(batch_df)
                self.construct_graph(batch_df)
                
                # Skip if graph couldn't be constructed properly
                if self.graph is None or self.graph.number_of_nodes() < 2:
                    self.log(f"Skipping batch {batch_idx+1} due to insufficient graph structure")
                    continue
                    
                self.train_model()
                self.generate_embeddings()
                batch_bot_probs = self.detect_bots()
                
                # Mark these users as processed
                batch_processed_users = set(batch_bot_probs.keys())
                processed_users.update(batch_processed_users)
                
                # Sanity check - detect any potential duplicates
                duplicate_count = len(set(all_bot_probs.keys()) & batch_processed_users)
                if duplicate_count > 0:
                    self.log(f"Warning: Found {duplicate_count} duplicate users - these should have been filtered!")
                
                # Store results without duplicates
                for user, prob in batch_bot_probs.items():
                    if user not in all_bot_probs:  # Ensure no duplicates
                        all_bot_probs[user] = prob
                
                # Save interim results for this batch
                interim_df = pd.DataFrame({
                    'user': list(batch_bot_probs.keys()),
                    'bot_probability': list(batch_bot_probs.values()),
                    'is_bot': [1 if p >= 0.5 else 0 for p in batch_bot_probs.values()]
                })
                
                interim_path = os.path.join(self.output_dir, f"batch_{batch_idx+1}_results.csv")
                interim_df.to_csv(interim_path, index=False)
                
                # Update batch statistics
                num_batch_bots = sum(1 for p in batch_bot_probs.values() if p >= 0.5)
                batch_stats = {
                    'batch': batch_idx + 1,
                    'users': len(batch_bot_probs),
                    'bots': num_batch_bots,
                    'bot_percentage': (num_batch_bots / len(batch_bot_probs)) * 100 if batch_bot_probs else 0
                }
                self.run_stats['batches'].append(batch_stats)
                
                # Log accurate running count
                self.log(f"Batch {batch_idx+1}: Processed {len(batch_processed_users)} users, unique total so far: {len(all_bot_probs)}")
                
            except Exception as e:
                self.log(f"Error processing batch {batch_idx+1}: {str(e)}")
                import traceback
                self.log(traceback.format_exc())
                continue
            
            # Clear memory
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None
        
        # Combine all results
        final_df = pd.DataFrame({
            'user': list(all_bot_probs.keys()),
            'bot_probability': list(all_bot_probs.values()),
            'is_bot': [1 if p >= 0.5 else 0 for p in all_bot_probs.values()]
        })
        
        # Calculate final statistics
        bot_count = sum(1 for p in all_bot_probs.values() if p >= 0.5)
        bot_percentage = (bot_count / len(all_bot_probs)) * 100 if all_bot_probs else 0
        
        self.log(f"Final results: Detected {bot_count} bots out of {len(all_bot_probs)} unique users ({bot_percentage:.1f}%)")
        
        # Save final results
        final_path = os.path.join(self.output_dir, "full_dataset_results.csv")
        final_df.to_csv(final_path, index=False)
        self.log(f"Final results saved to {final_path}")
        
        # Update statistics
        self.run_stats['end_time'] = datetime.now()
        self.run_stats['total_users'] = total_qualified_users
        self.run_stats['processed_users'] = len(all_bot_probs)
        self.run_stats['detected_bots'] = bot_count
        self.run_stats['bot_percentage'] = bot_percentage
        
        # Save run statistics
        stats_path = os.path.join(self.output_dir, "full_run_statistics.json")
        with open(stats_path, 'w') as f:
            import json
            # Convert datetime to string
            stats_dict = self.run_stats.copy()
            stats_dict['start_time'] = str(stats_dict['start_time'])
            stats_dict['end_time'] = str(stats_dict['end_time'])
            json.dump(stats_dict, f, indent=2)
        
        return all_bot_probs



def main():
    """Main function to run the bot detection pipeline on the full dataset"""
    
    print("Using CPU")
    print("Starting GMAE2-CGNN Bot Detection")
    
    try:
        # Create output directory
        output_dir = "gmae2_cgnn_results"
        os.makedirs(output_dir, exist_ok=True)
        
        # Load data
        print("Loading dataset...")
        df1 = pd.read_csv("Refugee_data_2023_bluesky_sentiment.csv")
        df2 = pd.read_csv("Refugee_data_2024_bluesky_sentiment.csv")
        df = pd.concat([df1, df2])
        
        # Remove unnamed column if exists
        if 'Unnamed: 0' in df.columns:
            df = df.drop('Unnamed: 0', axis=1)
        
        # Load keywords
        keywords_df = pd.read_csv("refugee_immigrant_keyword_list_v3.csv")
        keywords_df = keywords_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
        
        # Merge data
        merged_df = df.merge(keywords_df, left_on="keyword", right_on="term", how="left")
        
        # Convert to datetime and add neutral column
        merged_df['created_at'] = pd.to_datetime(merged_df['created_at'], format='mixed', utc=True)
        merged_df = merged_df[merged_df['created_at'] < "2025-01-01 18:18:04.312000+0000"]
        merged_df['neutral'] = ((merged_df['happy'] == 0) & (merged_df['sad'] == 0)).astype(int)
        
        print(f"Loaded dataset with {len(merged_df)} posts and {len(merged_df['author_handle'].unique())} unique users")
        
        # Initialize detector
        detector = BotDetector(verbose=True, output_dir=output_dir)
        
        # Choose between running on a sample or the full dataset
        sample_mode = False  # Set to False to run on the full dataset
        
        if sample_mode:
            # Run on a sample
            sample_size = 100000  # Adjust based on memory constraints
            bot_probs = detector.run_pipeline(merged_df, sample_size=sample_size)
        else:
            # Run on the full dataset using batching
            bot_probs = detector.run_full_dataset_pipeline(merged_df)
        
        # Results
        bot_users = sum(1 for p in bot_probs.values() if p >= 0.5)
        print(f"Detected {bot_users} bots out of {len(bot_probs)} users ({bot_users/len(bot_probs)*100:.1f}%)")
        print(f"Results saved to {output_dir}/")
        
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Using CPU
Starting GMAE2-CGNN Bot Detection
Loading dataset...
Loaded dataset with 1090909 posts and 304898 unique users
[01:37:40] Running bot detection pipeline on full dataset...
[01:37:40] Preprocessing full dataset...
[01:37:40] Total qualified users with at least 2 posts: 126091
[01:37:40] Processing batch 1/26
[01:37:40] Processing 5000 new users in batch 1
[01:37:40] Preprocessing data...
[01:38:28] Preprocessing complete. 196157 posts from 5000 users
[01:38:28] Extracting user features...


100%|███████████████████████████████████████| 5000/5000 [00:56<00:00, 88.26it/s]


[01:39:25] Extracted 18 features for 5000 users
[01:39:25] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [02:52<00:00,  3.44s/it]


[01:43:09] Graph constructed with 5000 nodes and 858799 edges
[01:43:13] Training model...
[01:43:14] Epoch 1/30, Loss: 0.697518
[01:43:15] Epoch 5/30, Loss: 0.694738
[01:43:16] Epoch 10/30, Loss: 0.693602
[01:43:17] Epoch 15/30, Loss: 0.693301
[01:43:19] Epoch 20/30, Loss: 0.693208
[01:43:20] Epoch 25/30, Loss: 0.693170
[01:43:21] Epoch 30/30, Loss: 0.693146
[01:43:21] Training completed
[01:43:21] Generating embeddings...
[01:43:21] Detecting bots...
[01:43:22] Detected 1907 bots out of 5000 users (38.1%)
[01:43:22] Batch 1: Processed 5000 users, unique total so far: 5000
[01:43:22] Processing batch 2/26
[01:43:22] Processing 5000 new users in batch 2
[01:43:22] Preprocessing data...
[01:43:35] Preprocessing complete. 92569 posts from 5000 users
[01:43:35] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:18<00:00, 272.79it/s]


[01:43:53] Extracted 18 features for 5000 users
[01:43:53] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [02:51<00:00,  3.43s/it]


[01:46:58] Graph constructed with 5000 nodes and 838526 edges
[01:47:03] Training model...
[01:47:03] Epoch 1/30, Loss: 0.697331
[01:47:04] Epoch 5/30, Loss: 0.694993
[01:47:05] Epoch 10/30, Loss: 0.693804
[01:47:07] Epoch 15/30, Loss: 0.693480
[01:47:08] Epoch 20/30, Loss: 0.693307
[01:47:09] Epoch 25/30, Loss: 0.693190
[01:47:11] Epoch 30/30, Loss: 0.693102
[01:47:11] Training completed
[01:47:11] Generating embeddings...
[01:47:11] Detecting bots...
[01:47:11] Detected 2485 bots out of 5000 users (49.7%)
[01:47:11] Batch 2: Processed 5000 users, unique total so far: 10000
[01:47:11] Processing batch 3/26
[01:47:11] Processing 5000 new users in batch 3
[01:47:11] Preprocessing data...
[01:47:20] Preprocessing complete. 61834 posts from 5000 users
[01:47:20] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:12<00:00, 410.26it/s]


[01:47:32] Extracted 18 features for 5000 users
[01:47:32] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [02:51<00:00,  3.44s/it]


[01:50:33] Graph constructed with 5000 nodes and 826132 edges
[01:50:37] Training model...
[01:50:38] Epoch 1/30, Loss: 0.693670
[01:50:39] Epoch 5/30, Loss: 0.693118
[01:50:40] Epoch 10/30, Loss: 0.692322
[01:50:41] Epoch 15/30, Loss: 0.691666
[01:50:43] Epoch 20/30, Loss: 0.691643
[01:50:44] Epoch 25/30, Loss: 0.691646
[01:50:45] Epoch 30/30, Loss: 0.691622
[01:50:45] Training completed
[01:50:45] Generating embeddings...
[01:50:45] Detecting bots...
[01:50:45] Detected 2708 bots out of 5000 users (54.2%)
[01:50:45] Batch 3: Processed 5000 users, unique total so far: 15000
[01:50:46] Processing batch 4/26
[01:50:46] Processing 5000 new users in batch 4
[01:50:46] Preprocessing data...
[01:50:52] Preprocessing complete. 47186 posts from 5000 users
[01:50:52] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 481.41it/s]


[01:51:03] Extracted 18 features for 5000 users
[01:51:03] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [02:25<00:00,  2.90s/it]


[01:53:35] Graph constructed with 5000 nodes and 802948 edges
[01:53:39] Training model...
[01:53:40] Epoch 1/30, Loss: 0.693078
[01:53:41] Epoch 5/30, Loss: 0.692036
[01:53:42] Epoch 10/30, Loss: 0.691096
[01:53:43] Epoch 15/30, Loss: 0.690954
[01:53:45] Epoch 20/30, Loss: 0.690995
[01:53:46] Epoch 25/30, Loss: 0.691014
[01:53:47] Epoch 30/30, Loss: 0.691023
[01:53:47] Training completed
[01:53:47] Generating embeddings...
[01:53:47] Detecting bots...
[01:53:47] Detected 2164 bots out of 5000 users (43.3%)
[01:53:47] Batch 4: Processed 5000 users, unique total so far: 20000
[01:53:47] Processing batch 5/26
[01:53:47] Processing 5000 new users in batch 5
[01:53:47] Preprocessing data...
[01:53:53] Preprocessing complete. 38016 posts from 5000 users
[01:53:53] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 550.14it/s]


[01:54:02] Extracted 18 features for 5000 users
[01:54:02] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [01:35<00:00,  1.90s/it]


[01:55:43] Graph constructed with 5000 nodes and 770855 edges
[01:55:47] Training model...
[01:55:48] Epoch 1/30, Loss: 0.693405
[01:55:49] Epoch 5/30, Loss: 0.693002
[01:55:50] Epoch 10/30, Loss: 0.692333
[01:55:51] Epoch 15/30, Loss: 0.691308
[01:55:52] Epoch 20/30, Loss: 0.690354
[01:55:53] Epoch 25/30, Loss: 0.689868
[01:55:55] Epoch 30/30, Loss: 0.689850
[01:55:55] Training completed
[01:55:55] Generating embeddings...
[01:55:55] Detecting bots...
[01:55:55] Detected 1923 bots out of 5000 users (38.5%)
[01:55:55] Batch 5: Processed 5000 users, unique total so far: 25000
[01:55:55] Processing batch 6/26
[01:55:55] Processing 5000 new users in batch 6
[01:55:55] Preprocessing data...
[01:56:00] Preprocessing complete. 32281 posts from 5000 users
[01:56:00] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:08<00:00, 620.37it/s]


[01:56:08] Extracted 18 features for 5000 users
[01:56:08] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [01:09<00:00,  1.38s/it]


[01:57:22] Graph constructed with 5000 nodes and 731348 edges
[01:57:26] Training model...
[01:57:27] Epoch 1/30, Loss: 0.700591
[01:57:28] Epoch 5/30, Loss: 0.694534
[01:57:29] Epoch 10/30, Loss: 0.692867
[01:57:30] Epoch 15/30, Loss: 0.691929
[01:57:31] Epoch 20/30, Loss: 0.690633
[01:57:32] Epoch 25/30, Loss: 0.689908
[01:57:33] Epoch 30/30, Loss: 0.689212
[01:57:33] Training completed
[01:57:33] Generating embeddings...
[01:57:33] Detecting bots...
[01:57:33] Detected 1799 bots out of 5000 users (36.0%)
[01:57:33] Batch 6: Processed 5000 users, unique total so far: 30000
[01:57:34] Processing batch 7/26
[01:57:34] Processing 5000 new users in batch 7
[01:57:34] Preprocessing data...
[01:57:38] Preprocessing complete. 28007 posts from 5000 users
[01:57:38] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:07<00:00, 671.02it/s]


[01:57:45] Extracted 18 features for 5000 users
[01:57:45] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:52<00:00,  1.04s/it]


[01:58:42] Graph constructed with 5000 nodes and 706007 edges
[01:58:46] Training model...
[01:58:46] Epoch 1/30, Loss: 0.694730
[01:58:47] Epoch 5/30, Loss: 0.693023
[01:58:48] Epoch 10/30, Loss: 0.691279
[01:58:49] Epoch 15/30, Loss: 0.689756
[01:58:51] Epoch 20/30, Loss: 0.688326
[01:58:52] Epoch 25/30, Loss: 0.687435
[01:58:53] Epoch 30/30, Loss: 0.687151
[01:58:53] Training completed
[01:58:53] Generating embeddings...
[01:58:53] Detecting bots...
[01:58:53] Detected 1662 bots out of 5000 users (33.2%)
[01:58:53] Batch 7: Processed 5000 users, unique total so far: 35000
[01:58:53] Processing batch 8/26
[01:58:53] Processing 5000 new users in batch 8
[01:58:53] Preprocessing data...
[01:58:57] Preprocessing complete. 25000 posts from 5000 users
[01:58:57] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:07<00:00, 671.24it/s]


[01:59:05] Extracted 18 features for 5000 users
[01:59:05] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:41<00:00,  1.20it/s]


[01:59:50] Graph constructed with 5000 nodes and 647610 edges
[01:59:54] Training model...
[01:59:54] Epoch 1/30, Loss: 0.703104
[01:59:55] Epoch 5/30, Loss: 0.696099
[01:59:56] Epoch 10/30, Loss: 0.693721
[01:59:57] Epoch 15/30, Loss: 0.692886
[01:59:58] Epoch 20/30, Loss: 0.692045
[01:59:59] Epoch 25/30, Loss: 0.690762
[02:00:00] Epoch 30/30, Loss: 0.689105
[02:00:00] Training completed
[02:00:00] Generating embeddings...
[02:00:00] Detecting bots...
[02:00:01] Detected 1787 bots out of 5000 users (35.7%)
[02:00:01] Batch 8: Processed 5000 users, unique total so far: 40000
[02:00:01] Processing batch 9/26
[02:00:01] Processing 5000 new users in batch 9
[02:00:01] Preprocessing data...
[02:00:04] Preprocessing complete. 21329 posts from 5000 users
[02:00:04] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 755.01it/s]


[02:00:11] Extracted 18 features for 5000 users
[02:00:11] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:30<00:00,  1.64it/s]


[02:00:45] Graph constructed with 5000 nodes and 756714 edges
[02:00:49] Training model...
[02:00:50] Epoch 1/30, Loss: 0.694798
[02:00:51] Epoch 5/30, Loss: 0.693202
[02:00:52] Epoch 10/30, Loss: 0.692305
[02:00:53] Epoch 15/30, Loss: 0.691356
[02:00:54] Epoch 20/30, Loss: 0.690673
[02:00:55] Epoch 25/30, Loss: 0.690419
[02:00:57] Epoch 30/30, Loss: 0.690188
[02:00:57] Training completed
[02:00:57] Generating embeddings...
[02:00:57] Detecting bots...
[02:00:57] Detected 973 bots out of 5000 users (19.5%)
[02:00:57] Batch 9: Processed 5000 users, unique total so far: 45000
[02:00:57] Processing batch 10/26
[02:00:57] Processing 5000 new users in batch 10
[02:00:57] Preprocessing data...
[02:01:00] Preprocessing complete. 20000 posts from 5000 users
[02:01:00] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 779.45it/s]


[02:01:07] Extracted 18 features for 5000 users
[02:01:07] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:27<00:00,  1.85it/s]


[02:01:37] Graph constructed with 5000 nodes and 652418 edges
[02:01:41] Training model...
[02:01:41] Epoch 1/30, Loss: 0.693912
[02:01:42] Epoch 5/30, Loss: 0.692223
[02:01:43] Epoch 10/30, Loss: 0.689576
[02:01:44] Epoch 15/30, Loss: 0.686939
[02:01:45] Epoch 20/30, Loss: 0.685400
[02:01:46] Epoch 25/30, Loss: 0.684724
[02:01:47] Epoch 30/30, Loss: 0.684529
[02:01:47] Training completed
[02:01:47] Generating embeddings...
[02:01:47] Detecting bots...
[02:01:47] Detected 1472 bots out of 5000 users (29.4%)
[02:01:47] Batch 10: Processed 5000 users, unique total so far: 50000
[02:01:48] Processing batch 11/26
[02:01:48] Processing 5000 new users in batch 11
[02:01:48] Preprocessing data...
[02:01:51] Preprocessing complete. 19296 posts from 5000 users
[02:01:51] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 784.12it/s]


[02:01:57] Extracted 18 features for 5000 users
[02:01:57] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:25<00:00,  1.97it/s]


[02:02:26] Graph constructed with 5000 nodes and 475154 edges
[02:02:29] Training model...
[02:02:29] Epoch 1/30, Loss: 0.693485
[02:02:30] Epoch 5/30, Loss: 0.690705
[02:02:30] Epoch 10/30, Loss: 0.686078
[02:02:31] Epoch 15/30, Loss: 0.681849
[02:02:32] Epoch 20/30, Loss: 0.678362
[02:02:33] Epoch 25/30, Loss: 0.677495
[02:02:34] Epoch 30/30, Loss: 0.677236
[02:02:34] Training completed
[02:02:34] Generating embeddings...
[02:02:34] Detecting bots...
[02:02:34] Detected 2396 bots out of 5000 users (47.9%)
[02:02:34] Batch 11: Processed 5000 users, unique total so far: 55000
[02:02:34] Processing batch 12/26
[02:02:34] Processing 5000 new users in batch 12
[02:02:34] Preprocessing data...
[02:02:37] Preprocessing complete. 15000 posts from 5000 users
[02:02:37] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 829.20it/s]


[02:02:43] Extracted 18 features for 5000 users
[02:02:43] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:15<00:00,  3.21it/s]


[02:03:01] Graph constructed with 5000 nodes and 556813 edges
[02:03:04] Training model...
[02:03:04] Epoch 1/30, Loss: 0.701391
[02:03:05] Epoch 5/30, Loss: 0.694707
[02:03:06] Epoch 10/30, Loss: 0.692489
[02:03:07] Epoch 15/30, Loss: 0.691225
[02:03:08] Epoch 20/30, Loss: 0.689853
[02:03:09] Epoch 25/30, Loss: 0.688052
[02:03:10] Epoch 30/30, Loss: 0.686204
[02:03:10] Training completed
[02:03:10] Generating embeddings...
[02:03:10] Detecting bots...
[02:03:10] Detected 1150 bots out of 5000 users (23.0%)
[02:03:10] Batch 12: Processed 5000 users, unique total so far: 60000
[02:03:10] Processing batch 13/26
[02:03:10] Processing 5000 new users in batch 13
[02:03:10] Preprocessing data...
[02:03:13] Preprocessing complete. 15000 posts from 5000 users
[02:03:13] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 830.59it/s]


[02:03:19] Extracted 18 features for 5000 users
[02:03:19] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:15<00:00,  3.24it/s]


[02:03:37] Graph constructed with 5000 nodes and 488142 edges
[02:03:40] Training model...
[02:03:40] Epoch 1/30, Loss: 0.691004
[02:03:41] Epoch 5/30, Loss: 0.686256
[02:03:42] Epoch 10/30, Loss: 0.680266
[02:03:43] Epoch 15/30, Loss: 0.676246
[02:03:44] Epoch 20/30, Loss: 0.673660
[02:03:44] Epoch 25/30, Loss: 0.672517
[02:03:45] Epoch 30/30, Loss: 0.671655
[02:03:45] Training completed
[02:03:45] Generating embeddings...
[02:03:45] Detecting bots...
[02:03:45] Detected 1759 bots out of 5000 users (35.2%)
[02:03:45] Batch 13: Processed 5000 users, unique total so far: 65000
[02:03:46] Processing batch 14/26
[02:03:46] Processing 5000 new users in batch 14
[02:03:46] Preprocessing data...
[02:03:48] Preprocessing complete. 15000 posts from 5000 users
[02:03:48] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 831.56it/s]


[02:03:54] Extracted 18 features for 5000 users
[02:03:54] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:15<00:00,  3.18it/s]


[02:04:13] Graph constructed with 5000 nodes and 520920 edges
[02:04:16] Training model...
[02:04:16] Epoch 1/30, Loss: 0.695158
[02:04:17] Epoch 5/30, Loss: 0.693054
[02:04:18] Epoch 10/30, Loss: 0.690936
[02:04:19] Epoch 15/30, Loss: 0.687687
[02:04:20] Epoch 20/30, Loss: 0.683646
[02:04:20] Epoch 25/30, Loss: 0.679115
[02:04:21] Epoch 30/30, Loss: 0.676221
[02:04:21] Training completed
[02:04:21] Generating embeddings...
[02:04:21] Detecting bots...
[02:04:21] Detected 1383 bots out of 5000 users (27.7%)
[02:04:21] Batch 14: Processed 5000 users, unique total so far: 70000
[02:04:22] Processing batch 15/26
[02:04:22] Processing 5000 new users in batch 15
[02:04:22] Preprocessing data...
[02:04:24] Preprocessing complete. 15000 posts from 5000 users
[02:04:24] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:06<00:00, 829.03it/s]


[02:04:30] Extracted 18 features for 5000 users
[02:04:30] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [11:14<00:00, 13.49s/it]


[02:15:48] Graph constructed with 5000 nodes and 562108 edges
[02:15:51] Training model...
[02:15:51] Epoch 1/30, Loss: 0.693530
[02:15:52] Epoch 5/30, Loss: 0.692174
[02:15:53] Epoch 10/30, Loss: 0.690168
[02:15:54] Epoch 15/30, Loss: 0.687780
[02:15:55] Epoch 20/30, Loss: 0.685294
[02:15:56] Epoch 25/30, Loss: 0.683066
[02:15:57] Epoch 30/30, Loss: 0.681695
[02:15:57] Training completed
[02:15:57] Generating embeddings...
[02:15:57] Detecting bots...
[02:15:57] Detected 1031 bots out of 5000 users (20.6%)
[02:15:57] Batch 15: Processed 5000 users, unique total so far: 75000
[02:15:58] Processing batch 16/26
[02:15:58] Processing 5000 new users in batch 16
[02:15:58] Preprocessing data...
[02:16:00] Preprocessing complete. 11706 posts from 5000 users
[02:16:00] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 948.06it/s]


[02:16:05] Extracted 18 features for 5000 users
[02:16:05] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:09<00:00,  5.21it/s]


[02:16:18] Graph constructed with 5000 nodes and 608153 edges
[02:16:21] Training model...
[02:16:21] Epoch 1/30, Loss: 0.693189
[02:16:22] Epoch 5/30, Loss: 0.692580
[02:16:23] Epoch 10/30, Loss: 0.690958
[02:16:24] Epoch 15/30, Loss: 0.689513
[02:16:25] Epoch 20/30, Loss: 0.688434
[02:16:26] Epoch 25/30, Loss: 0.687555
[02:16:27] Epoch 30/30, Loss: 0.686878
[02:16:27] Training completed
[02:16:27] Generating embeddings...
[02:16:27] Detecting bots...
[02:16:27] Detected 940 bots out of 5000 users (18.8%)
[02:16:27] Batch 16: Processed 5000 users, unique total so far: 80000
[02:16:27] Processing batch 17/26
[02:16:27] Processing 5000 new users in batch 17
[02:16:27] Preprocessing data...
[02:16:30] Preprocessing complete. 10000 posts from 5000 users
[02:16:30] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 967.61it/s]


[02:16:35] Extracted 18 features for 5000 users
[02:16:35] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.89it/s]


[02:16:44] Graph constructed with 5000 nodes and 510165 edges
[02:16:47] Training model...
[02:16:47] Epoch 1/30, Loss: 0.692846
[02:16:48] Epoch 5/30, Loss: 0.691405
[02:16:49] Epoch 10/30, Loss: 0.689136
[02:16:50] Epoch 15/30, Loss: 0.687204
[02:16:51] Epoch 20/30, Loss: 0.685279
[02:16:52] Epoch 25/30, Loss: 0.683934
[02:16:52] Epoch 30/30, Loss: 0.682944
[02:16:52] Training completed
[02:16:52] Generating embeddings...
[02:16:52] Detecting bots...
[02:16:53] Detected 962 bots out of 5000 users (19.2%)
[02:16:53] Batch 17: Processed 5000 users, unique total so far: 85000
[02:16:53] Processing batch 18/26
[02:16:53] Processing 5000 new users in batch 18
[02:16:53] Preprocessing data...
[02:16:55] Preprocessing complete. 10000 posts from 5000 users
[02:16:55] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 943.29it/s]


[02:17:00] Extracted 18 features for 5000 users
[02:17:00] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.86it/s]


[02:17:10] Graph constructed with 5000 nodes and 458034 edges
[02:17:12] Training model...
[02:17:12] Epoch 1/30, Loss: 0.694057
[02:17:13] Epoch 5/30, Loss: 0.692598
[02:17:14] Epoch 10/30, Loss: 0.691868
[02:17:15] Epoch 15/30, Loss: 0.690435
[02:17:16] Epoch 20/30, Loss: 0.688658
[02:17:16] Epoch 25/30, Loss: 0.686661
[02:17:17] Epoch 30/30, Loss: 0.684841
[02:17:17] Training completed
[02:17:17] Generating embeddings...
[02:17:17] Detecting bots...
[02:17:17] Detected 1002 bots out of 5000 users (20.0%)
[02:17:17] Batch 18: Processed 5000 users, unique total so far: 90000
[02:17:18] Processing batch 19/26
[02:17:18] Processing 5000 new users in batch 19
[02:17:18] Preprocessing data...
[02:17:20] Preprocessing complete. 10000 posts from 5000 users
[02:17:20] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 956.87it/s]


[02:17:25] Extracted 18 features for 5000 users
[02:17:25] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.86it/s]


[02:17:35] Graph constructed with 5000 nodes and 419209 edges
[02:17:37] Training model...
[02:17:37] Epoch 1/30, Loss: 0.700030
[02:17:38] Epoch 5/30, Loss: 0.693391
[02:17:38] Epoch 10/30, Loss: 0.690883
[02:17:39] Epoch 15/30, Loss: 0.689312
[02:17:40] Epoch 20/30, Loss: 0.687646
[02:17:40] Epoch 25/30, Loss: 0.685414
[02:17:41] Epoch 30/30, Loss: 0.683680
[02:17:41] Training completed
[02:17:41] Generating embeddings...
[02:17:41] Detecting bots...
[02:17:41] Detected 1184 bots out of 5000 users (23.7%)
[02:17:41] Batch 19: Processed 5000 users, unique total so far: 95000
[02:17:41] Processing batch 20/26
[02:17:41] Processing 5000 new users in batch 20
[02:17:41] Preprocessing data...
[02:17:43] Preprocessing complete. 10000 posts from 5000 users
[02:17:43] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 959.61it/s]


[02:17:49] Extracted 18 features for 5000 users
[02:17:49] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.72it/s]


[02:17:58] Graph constructed with 5000 nodes and 426944 edges
[02:18:01] Training model...
[02:18:01] Epoch 1/30, Loss: 0.694317
[02:18:02] Epoch 5/30, Loss: 0.691050
[02:18:03] Epoch 10/30, Loss: 0.684236
[02:18:03] Epoch 15/30, Loss: 0.679889
[02:18:04] Epoch 20/30, Loss: 0.675410
[02:18:05] Epoch 25/30, Loss: 0.672793
[02:18:06] Epoch 30/30, Loss: 0.671762
[02:18:06] Training completed
[02:18:06] Generating embeddings...
[02:18:06] Detecting bots...
[02:18:06] Detected 2319 bots out of 5000 users (46.4%)
[02:18:06] Batch 20: Processed 5000 users, unique total so far: 100000
[02:18:06] Processing batch 21/26
[02:18:06] Processing 5000 new users in batch 21
[02:18:06] Preprocessing data...
[02:18:08] Preprocessing complete. 10000 posts from 5000 users
[02:18:08] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 934.09it/s]


[02:18:14] Extracted 18 features for 5000 users
[02:18:14] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.77it/s]


[02:18:23] Graph constructed with 5000 nodes and 543676 edges
[02:18:26] Training model...
[02:18:27] Epoch 1/30, Loss: 0.693387
[02:18:27] Epoch 5/30, Loss: 0.692203
[02:18:28] Epoch 10/30, Loss: 0.690452
[02:18:29] Epoch 15/30, Loss: 0.688615
[02:18:30] Epoch 20/30, Loss: 0.686639
[02:18:31] Epoch 25/30, Loss: 0.685339
[02:18:32] Epoch 30/30, Loss: 0.684431
[02:18:32] Training completed
[02:18:32] Generating embeddings...
[02:18:32] Detecting bots...
[02:18:32] Detected 1730 bots out of 5000 users (34.6%)
[02:18:32] Batch 21: Processed 5000 users, unique total so far: 105000
[02:18:32] Processing batch 22/26
[02:18:32] Processing 5000 new users in batch 22
[02:18:32] Preprocessing data...
[02:18:35] Preprocessing complete. 10000 posts from 5000 users
[02:18:35] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 934.27it/s]


[02:18:40] Extracted 18 features for 5000 users
[02:18:40] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.74it/s]


[02:18:50] Graph constructed with 5000 nodes and 319994 edges
[02:18:51] Training model...
[02:18:51] Epoch 1/30, Loss: 0.695150
[02:18:52] Epoch 5/30, Loss: 0.690077
[02:18:52] Epoch 10/30, Loss: 0.686268
[02:18:53] Epoch 15/30, Loss: 0.681608
[02:18:54] Epoch 20/30, Loss: 0.676513
[02:18:54] Epoch 25/30, Loss: 0.670770
[02:18:55] Epoch 30/30, Loss: 0.665677
[02:18:55] Training completed
[02:18:55] Generating embeddings...
[02:18:55] Detecting bots...
[02:18:55] Detected 1832 bots out of 5000 users (36.6%)
[02:18:55] Batch 22: Processed 5000 users, unique total so far: 110000
[02:18:55] Processing batch 23/26
[02:18:55] Processing 5000 new users in batch 23
[02:18:55] Preprocessing data...
[02:18:57] Preprocessing complete. 10000 posts from 5000 users
[02:18:57] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 915.86it/s]


[02:19:03] Extracted 18 features for 5000 users
[02:19:03] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.78it/s]


[02:19:12] Graph constructed with 5000 nodes and 273552 edges
[02:19:14] Training model...
[02:19:14] Epoch 1/30, Loss: 0.693134
[02:19:14] Epoch 5/30, Loss: 0.686800
[02:19:15] Epoch 10/30, Loss: 0.674761
[02:19:15] Epoch 15/30, Loss: 0.660888
[02:19:16] Epoch 20/30, Loss: 0.651011
[02:19:16] Epoch 25/30, Loss: 0.645604
[02:19:17] Epoch 30/30, Loss: 0.642327
[02:19:17] Training completed
[02:19:17] Generating embeddings...
[02:19:17] Detecting bots...
[02:19:17] Detected 2589 bots out of 5000 users (51.8%)
[02:19:17] Batch 23: Processed 5000 users, unique total so far: 115000
[02:19:17] Processing batch 24/26
[02:19:17] Processing 5000 new users in batch 24
[02:19:17] Preprocessing data...
[02:19:19] Preprocessing complete. 10000 posts from 5000 users
[02:19:19] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 943.26it/s]


[02:19:24] Extracted 18 features for 5000 users
[02:19:24] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.61it/s]


[02:19:34] Graph constructed with 5000 nodes and 398192 edges
[02:19:36] Training model...
[02:19:37] Epoch 1/30, Loss: 0.693136
[02:19:37] Epoch 5/30, Loss: 0.691197
[02:19:38] Epoch 10/30, Loss: 0.687516
[02:19:38] Epoch 15/30, Loss: 0.682672
[02:19:39] Epoch 20/30, Loss: 0.677693
[02:19:40] Epoch 25/30, Loss: 0.672743
[02:19:40] Epoch 30/30, Loss: 0.668990
[02:19:40] Training completed
[02:19:40] Generating embeddings...
[02:19:40] Detecting bots...
[02:19:41] Detected 1460 bots out of 5000 users (29.2%)
[02:19:41] Batch 24: Processed 5000 users, unique total so far: 120000
[02:19:41] Processing batch 25/26
[02:19:41] Processing 5000 new users in batch 25
[02:19:41] Preprocessing data...
[02:19:43] Preprocessing complete. 10000 posts from 5000 users
[02:19:43] Extracting user features...


100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 943.01it/s]


[02:19:48] Extracted 18 features for 5000 users
[02:19:48] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 50/50 [00:07<00:00,  6.67it/s]


[02:19:58] Graph constructed with 5000 nodes and 446477 edges
[02:20:00] Training model...
[02:20:01] Epoch 1/30, Loss: 0.694775
[02:20:01] Epoch 5/30, Loss: 0.691670
[02:20:02] Epoch 10/30, Loss: 0.688658
[02:20:03] Epoch 15/30, Loss: 0.683930
[02:20:04] Epoch 20/30, Loss: 0.678539
[02:20:05] Epoch 25/30, Loss: 0.674032
[02:20:05] Epoch 30/30, Loss: 0.670983
[02:20:05] Training completed
[02:20:05] Generating embeddings...
[02:20:05] Detecting bots...
[02:20:06] Detected 1315 bots out of 5000 users (26.3%)
[02:20:06] Batch 25: Processed 5000 users, unique total so far: 125000
[02:20:06] Processing batch 26/26
[02:20:06] Processing 1091 new users in batch 26
[02:20:06] Preprocessing data...
[02:20:06] Preprocessing complete. 2182 posts from 1091 users
[02:20:06] Extracting user features...


100%|█████████████████████████████████████| 1091/1091 [00:00<00:00, 1141.45it/s]


[02:20:07] Extracted 18 features for 1091 users
[02:20:07] Constructing temporal interaction graph...


100%|███████████████████████████████████████████| 11/11 [00:01<00:00,  6.81it/s]


[02:20:09] Graph constructed with 1091 nodes and 98823 edges
[02:20:09] Training model...
[02:20:09] Epoch 1/30, Loss: 0.694547
[02:20:09] Epoch 5/30, Loss: 0.692142
[02:20:10] Epoch 10/30, Loss: 0.687809
[02:20:10] Epoch 15/30, Loss: 0.683881
[02:20:10] Epoch 20/30, Loss: 0.678636
[02:20:10] Epoch 25/30, Loss: 0.674679
[02:20:10] Epoch 30/30, Loss: 0.672915
[02:20:10] Training completed
[02:20:10] Generating embeddings...
[02:20:10] Detecting bots...
[02:20:10] Detected 270 bots out of 1091 users (24.7%)
[02:20:10] Batch 26: Processed 1091 users, unique total so far: 126091
[02:20:11] Final results: Detected 42202 bots out of 126091 unique users (33.5%)
[02:20:11] Final results saved to gmae2_cgnn_results/full_dataset_results.csv
Detected 42202 bots out of 126091 users (33.5%)
Results saved to gmae2_cgnn_results/


In [3]:
import pandas as pd

# Load data
df1 = pd.read_csv("Refugee_data_2023_bluesky_sentiment.csv")
df2 = pd.read_csv("Refugee_data_2024_bluesky_sentiment.csv")
data_df = pd.concat([df1,df2])
data_df = data_df.drop('Unnamed: 0', axis=1)

keywords_data_path = "refugee_immigrant_keyword_list_v3.csv"

keywords_df = pd.read_csv(keywords_data_path)
keywords_df = keywords_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Merge posts data with keyword-subcategory mapping
merged_df = data_df.merge(keywords_df, left_on="keyword", right_on="term", how="left")

# Convert 'created_at' column to datetime format
merged_df['created_at'] = pd.to_datetime(merged_df['created_at'], format='mixed', utc=True)
merged_df = merged_df[merged_df['created_at']<"2025-01-01 18:18:04.312000+0000"]
# Add 'neutral' column: 1 if both happy and sad are 0, else 0
merged_df['neutral'] = ((merged_df['happy'] == 0) & (merged_df['sad'] == 0)).astype(int)

# Count posts per user
user_counts = merged_df['author_handle'].value_counts()

# Analyze
total_users = len(user_counts)
users_with_at_least_2_posts = sum(user_counts >= 2)
users_with_only_1_post = sum(user_counts == 1)

# Calculate percentages
percent_with_at_least_2 = (users_with_at_least_2_posts / total_users) * 100
percent_with_only_1 = (users_with_only_1_post / total_users) * 100

# Print results
print(f"Total unique users: {total_users}")
print(f"Users with at least 2 posts: {users_with_at_least_2_posts} ({percent_with_at_least_2:.2f}%)")
print(f"Users with only 1 post: {users_with_only_1_post} ({percent_with_only_1:.2f}%)")
print(f"Difference between total and processed users: {total_users - users_with_at_least_2_posts}")

# Check how filtering by date affects user counts
if 'created_at' in merged_df.columns:
    # Convert to datetime
    merged_df['created_at'] = pd.to_datetime(merged_df['created_at'], format='mixed', utc=True, errors='coerce')
    
    # Count users before date filtering
    before_filter_counts = merged_df['author_handle'].value_counts()
    users_before_filter = sum(before_filter_counts >= 2)
    
    # Apply date filter
    filtered_df = merged_df[merged_df['created_at'] < "2025-01-01 18:18:04.312000+0000"]
    
    # Count users after date filtering
    after_filter_counts = filtered_df['author_handle'].value_counts()
    users_after_filter = sum(after_filter_counts >= 2)
    
    print(f"\nUsers with at least 2 posts before date filtering: {users_before_filter}")
    print(f"Users with at least 2 posts after date filtering: {users_after_filter}")
    print(f"Users lost due to date filtering: {users_before_filter - users_after_filter}")

Total unique users: 304898
Users with at least 2 posts: 126091 (41.36%)
Users with only 1 post: 178807 (58.64%)
Difference between total and processed users: 178807

Users with at least 2 posts before date filtering: 126091
Users with at least 2 posts after date filtering: 126091
Users lost due to date filtering: 0
