In [7]:
# Sentiment Analysis Model Development for Task Management System
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Create the 'models' output directory up front: the trained model pickles and
# the confusion-matrix image are written into it later in this file.
# exist_ok=True makes this a no-op when the directory already exists, so
# re-running the cell does not raise.
os.makedirs('models', exist_ok=True)

# Download necessary NLTK resources with proper error handling
def download_nltk_resources():
    """
    Downloads the NLTK resources needed by the text preprocessing step.

    Every resource is attempted even if an earlier one fails; a failure is
    reported on stdout and never raised, so the caller can continue with the
    fallbacks built into the preprocessing code.

    Returns:
    list: Names of the resources that could not be downloaded
          (empty when everything succeeded)
    """
    resources = ['punkt', 'stopwords', 'wordnet']
    failed = []
    for resource in resources:
        try:
            # nltk.download() reports most failures (no network, unknown
            # package id, ...) by returning False instead of raising, so the
            # return value has to be checked explicitly.
            succeeded = nltk.download(resource, quiet=True)
            if not succeeded:
                print(f"Error downloading NLTK resource {resource}: download reported failure")
        except Exception as e:
            succeeded = False
            print(f"Error downloading NLTK resource {resource}: {e}")

        if succeeded:
            print(f"Successfully downloaded NLTK resource: {resource}")
        else:
            failed.append(resource)
            print("Attempting to continue without this resource...")
    return failed

download_nltk_resources()

# Define robust text preprocessing function
# Patterns used by preprocess_text, compiled once instead of on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_MENTION_OR_HASH_PATTERN = re.compile(r'\@\w+|\#')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_DIGITS_PATTERN = re.compile(r'\d+')

# Lazily filled cache for the NLTK-backed resources. preprocess_text is applied
# to every row of the dataset, so re-reading the stop-word corpus and building
# a new lemmatizer per row is wasted work.
_NLTK_CACHE = {}


def _get_stop_words():
    """Return the cached English stop-word set (empty if the corpus is missing)."""
    if 'stop_words' not in _NLTK_CACHE:
        try:
            _NLTK_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        except LookupError:
            # Warn once rather than once per processed row.
            print("Stopwords not available, skipping stopword removal")
            _NLTK_CACHE['stop_words'] = frozenset()
    return _NLTK_CACHE['stop_words']


def _get_lemmatizer():
    """Return a cached, ready-to-use WordNet lemmatizer, or None if WordNet is missing."""
    if 'lemmatizer' not in _NLTK_CACHE:
        lemmatizer = WordNetLemmatizer()
        try:
            # The WordNet corpus is loaded lazily on first use; probe it here
            # so a missing corpus is detected (and reported) exactly once.
            lemmatizer.lemmatize('tasks')
        except LookupError:
            print("WordNet lemmatizer not available, skipping lemmatization")
            lemmatizer = None
        _NLTK_CACHE['lemmatizer'] = lemmatizer
    return _NLTK_CACHE['lemmatizer']


def preprocess_text(text):
    """
    Preprocesses the input text for sentiment analysis.

    Steps: lowercase, strip URLs, strip @mentions and '#' signs, strip
    punctuation and digits, tokenize, drop English stop words, lemmatize.
    Tokenization, stop-word removal and lemmatization each degrade gracefully
    when the corresponding NLTK resource is not installed.

    Parameters:
    text (str): The input text to preprocess

    Returns:
    str: The preprocessed text; '' for None, NaN or any other non-string
         input. If an unexpected error occurs, the text as cleaned up to that
         point is returned.
    """
    # None and NaN are not str instances, so this single check covers them.
    if not isinstance(text, str):
        return ''

    try:
        text = text.lower()
        text = _URL_PATTERN.sub('', text)
        text = _MENTION_OR_HASH_PATTERN.sub('', text)
        text = _PUNCTUATION_PATTERN.sub('', text)
        text = _DIGITS_PATTERN.sub('', text)

        try:
            tokens = word_tokenize(text)
        except LookupError:
            # Fallback if the tokenizer data is not available
            tokens = text.split()

        stop_words = _get_stop_words()
        if stop_words:
            tokens = [word for word in tokens if word not in stop_words]

        lemmatizer = _get_lemmatizer()
        if lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return ' '.join(tokens)
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        # `text` holds whatever cleaning succeeded before the failure
        # (not the raw input), which is still usable downstream.
        return text

def load_and_prepare_data(file_path):
    """
    Reads a labelled text CSV and turns it into model-ready inputs.

    The text column defaults to 'clean_text' and the label column to
    'category'. When either is absent, the first column whose name contains a
    related keyword is used instead. Labels equal to 1.0 become class 1; every
    other label becomes class 0. Rows with a missing text or label, and rows
    whose text is empty after preprocessing, are dropped.

    Parameters:
    file_path (str): Path to the CSV file

    Returns:
    tuple: (X, y) preprocessed text Series and binary sentiment Series

    Raises:
    ValueError: If no usable text or sentiment column can be found
    Exception: Any other failure is printed and then re-raised unchanged
    """

    def _resolve_column(frame, preferred, keywords, role):
        # Use the preferred column when present; otherwise fall back to the
        # first column whose lowercased name contains one of the keywords.
        if preferred in frame.columns:
            return preferred
        candidates = [
            name for name in frame.columns
            if any(keyword in name.lower() for keyword in keywords)
        ]
        if not candidates:
            raise ValueError(f"Could not find a {role} column in the dataset")
        print(f"'{preferred}' column not found. Using '{candidates[0]}' as {role} column.")
        return candidates[0]

    def _to_binary(label):
        # Only an explicit positive label (1.0) maps to class 1.
        if label == 1.0:
            return 1
        return 0

    try:
        print(f"Loading dataset from {file_path}...")
        df = pd.read_csv(file_path)

        print("Dataset shape:", df.shape)
        print("\nSample data:")
        print(df.head())

        # Copy the source columns to fixed names so the rest of the pipeline
        # does not depend on the dataset's own column naming.
        text_source = _resolve_column(
            df, 'clean_text', ['text', 'content', 'message', 'tweet'], 'text'
        )
        df['text'] = df[text_source]

        sentiment_source = _resolve_column(
            df,
            'category',
            ['sentiment', 'label', 'class', 'target', 'polarity', 'category'],
            'sentiment',
        )
        df['sentiment'] = df[sentiment_source]

        # Rows without text or without a label cannot be used for training.
        df = df.dropna(subset=['text', 'sentiment'])

        # Collapse the labels to two classes: positive (1) vs. everything else (0).
        print("\nOriginal sentiment values:", df['sentiment'].unique())
        df['sentiment_binary'] = df['sentiment'].apply(_to_binary)

        print("Mapped sentiment values:", df['sentiment_binary'].unique())
        print("Sentiment value counts after mapping:")
        print(df['sentiment_binary'].value_counts())

        print("\nPreprocessing text data...")
        df['processed_text'] = df['text'].apply(preprocess_text)

        # Discard rows whose text became empty (or whitespace-only) after cleaning.
        has_content = df['processed_text'].str.strip() != ''
        df = df[has_content]

        print("\nFinal sentiment distribution:")
        print(df['sentiment_binary'].value_counts())

        return df['processed_text'], df['sentiment_binary']

    except Exception as e:
        print(f"Error loading and preparing data: {e}")
        raise

def train_sentiment_model(X, y):
    """
    Fits a TF-IDF + logistic regression sentiment classifier and evaluates it.

    The data is split 80/20 (stratified on y, fixed random seed), the
    vectorizer is fitted on the training part only, and a confusion-matrix
    heatmap for the held-out part is written to 'models/confusion_matrix.png'.

    Parameters:
    X: Features (processed text)
    y: Target variable (sentiment)

    Returns:
    tuple: (model, vectorizer, evaluation_metrics) where evaluation_metrics is
           a dict with keys 'accuracy', 'classification_report' and
           'confusion_matrix'

    Raises:
    Exception: Any failure is printed and then re-raised unchanged
    """
    try:
        # Hold out 20% of the samples for evaluation, keeping class proportions.
        texts_train, texts_test, labels_train, labels_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print("Extracting features using TF-IDF...")
        vectorizer = TfidfVectorizer(max_features=5000)
        # Fit the vocabulary on the training texts only, then reuse it for the test texts.
        train_matrix = vectorizer.fit_transform(texts_train)
        test_matrix = vectorizer.transform(texts_test)

        print("Training logistic regression model...")
        classifier = LogisticRegression(max_iter=1000, random_state=42)
        classifier.fit(train_matrix, labels_train)

        # Score the classifier on the held-out split.
        predicted = classifier.predict(test_matrix)
        evaluation = {
            'accuracy': accuracy_score(labels_test, predicted),
            'classification_report': classification_report(labels_test, predicted),
            'confusion_matrix': confusion_matrix(labels_test, predicted),
        }

        print("\nAccuracy:", evaluation['accuracy'])
        print("\nClassification Report:\n", evaluation['classification_report'])

        # Render the confusion matrix to disk; close the figure afterwards to free memory.
        plt.figure(figsize=(8, 6))
        sns.heatmap(evaluation['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.savefig('models/confusion_matrix.png')
        plt.close()

        return classifier, vectorizer, evaluation

    except Exception as e:
        print(f"Error training sentiment model: {e}")
        raise

def optimize_tasks(tasks_df):
    """
    Optimize and prioritize tasks based on multiple factors.

    Each task gets three scores in the range 0..1:
    - deadline_score: from the optional 'deadline' column; closer (or overdue)
      deadlines score higher. Unparseable/missing deadlines count as 30 days
      away. Defaults to 0.5 when the column is absent or cannot be processed.
    - priority_score: from the optional 'priority' column
      (urgent/high/medium/low, case-insensitive); anything else scores 0.5.
    - sentiment_score: taken from the optional 'sentiment_score' column;
      missing or non-numeric values score 0.5.

    The final 'optimization_score' is
    0.4 * deadline_score + 0.4 * priority_score + 0.2 * sentiment_score.

    Parameters:
    tasks_df (DataFrame): DataFrame containing task information

    Returns:
    DataFrame: Copy of the tasks with the score columns added, sorted by
               optimization score (highest first)
    """
    # Work on a copy to avoid modifying the caller's DataFrame
    df = tasks_df.copy()

    # Calculate days to deadline
    if 'deadline' in df.columns:
        try:
            df['days_to_deadline'] = pd.to_datetime(df['deadline'], errors='coerce') - pd.Timestamp.now()
            df['days_to_deadline'] = df['days_to_deadline'].dt.total_seconds() / (24 * 3600)

            # Missing or invalid dates are treated as 30 days away
            df['days_to_deadline'] = df['days_to_deadline'].fillna(30)

            # Normalize (closer deadline = higher score). The horizon is at
            # least 30 days, which also rules out a division by zero; overdue
            # tasks (negative days) are clipped to the maximum score of 1.
            max_days = max(df['days_to_deadline'].max(), 30)
            df['deadline_score'] = 1 - (df['days_to_deadline'] / max_days).clip(0, 1)
        except Exception as e:
            print(f"Error calculating deadline score: {e}")
            df['deadline_score'] = 0.5  # Default score
    else:
        df['deadline_score'] = 0.5  # Default score

    # Convert priority to numeric score
    priority_map = {'urgent': 1.0, 'high': 0.75, 'medium': 0.5, 'low': 0.25}
    if 'priority' in df.columns:
        # astype(str) keeps the .str accessor usable when the column holds
        # numbers or only missing values; such entries fall through to 0.5.
        priority_labels = df['priority'].astype(str).str.strip().str.lower()
        df['priority_score'] = priority_labels.map(priority_map).fillna(0.5)
    else:
        df['priority_score'] = 0.5  # Default score

    # Use the provided sentiment score, filling gaps with the neutral default
    # so a single missing value does not turn the final score into NaN.
    if 'sentiment_score' in df.columns:
        df['sentiment_score'] = pd.to_numeric(df['sentiment_score'], errors='coerce').fillna(0.5)
    else:
        df['sentiment_score'] = 0.5  # Default score

    # Calculate final optimization score
    df['optimization_score'] = (
        df['deadline_score'] * 0.4 +  # Weight for deadline
        df['priority_score'] * 0.4 +  # Weight for priority
        df['sentiment_score'] * 0.2    # Weight for sentiment
    )

    # Sort tasks by optimization score
    return df.sort_values('optimization_score', ascending=False)


def create_task_optimization_model():
    """
    Creates a task optimization model that scores tasks
    based on sentiment, deadline proximity, and assigned priority.

    The returned function is defined at module level (not as a nested
    closure) because pickle stores functions by qualified name and cannot
    serialize functions local to another function.

    Returns:
    function: Task optimization function
    """
    return optimize_tasks

def predict_completion_time(task_history_df, new_task):
    """
    Predicts task completion time based on historical data

    Without history, the estimate is a per-task-type base time scaled by a
    per-priority multiplier. With history, it is the mean
    'actual_completion_time' of past tasks matching the new task's type and
    priority (case-insensitive) and assignee; if nothing matches, the mean
    over the whole history is used. Attributes that are absent from new_task
    or set to None are not used for filtering.

    Parameters:
    task_history_df: DataFrame with columns 'task_type', 'priority', 'assigned_to', 'actual_completion_time'
    new_task: dict with 'task_type', 'priority', 'assigned_to'

    Returns:
    float: Estimated hours to complete the task (4.0 when the history cannot
           be used, e.g. missing or entirely non-numeric completion times)
    """
    if task_history_df is None or task_history_df.empty:
        # Default predictions if no history
        base_times = {'bug_fix': 4, 'feature': 8, 'documentation': 2, 'other': 6}
        priority_multipliers = {'urgent': 0.8, 'high': 0.9, 'medium': 1.0, 'low': 1.2}

        # `or` also covers keys that are present but set to None.
        task_type = str(new_task.get('task_type') or 'other').lower()
        priority = str(new_task.get('priority') or 'medium').lower()

        base_time = base_times.get(task_type, 6)
        multiplier = priority_multipliers.get(priority, 1.0)

        return base_time * multiplier

    try:
        history = task_history_df.copy()

        # Handle potential missing columns
        required_columns = ['task_type', 'priority', 'assigned_to', 'actual_completion_time']
        for column in required_columns:
            if column not in history.columns:
                print(f"Warning: '{column}' column not found in task history")
                if column == 'actual_completion_time':
                    # This is critical - we can't continue without it
                    raise ValueError("'actual_completion_time' column is required for prediction")

        # Non-numeric durations become NaN and are ignored by mean().
        history['actual_completion_time'] = pd.to_numeric(
            history['actual_completion_time'], errors='coerce'
        )

        matching = history
        # Case-insensitive filters; astype(str) keeps them working when the
        # history column is not string-typed.
        for column in ('task_type', 'priority'):
            wanted = new_task.get(column)
            if column in matching.columns and wanted is not None:
                matching = matching[matching[column].astype(str).str.lower() == str(wanted).lower()]

        assignee = new_task.get('assigned_to')
        if 'assigned_to' in matching.columns and assignee is not None:
            matching = matching[matching['assigned_to'] == assignee]

        # Prefer the matching tasks; fall back to the overall average when
        # there are no matches (or none with a usable duration).
        for candidates in (matching, history):
            estimate = candidates['actual_completion_time'].mean()
            if pd.notna(estimate):
                return float(estimate)

        raise ValueError("task history contains no usable completion times")

    except Exception as e:
        print(f"Error predicting completion time: {e}")
        # Return a reasonable default
        return 4.0  # Default 4 hours


def create_predictive_analytics_model():
    """
    Creates a predictive analytics model for estimating task completion time.

    The returned function is defined at module level (not as a nested
    closure) because pickle stores functions by qualified name and cannot
    serialize functions local to another function.

    Returns:
    function: Task completion time prediction function
    """
    return predict_completion_time

def predict_sentiment(text, model, vectorizer):
    """
    Classifies a piece of text as positive or negative.

    The text is cleaned with preprocess_text, vectorized, and passed to the
    model. The confidence is the model's probability for the predicted class
    (index 1 for a prediction of 1, index 0 otherwise).

    Parameters:
    text (str): The text to analyze
    model: Trained sentiment model
    vectorizer: TF-IDF vectorizer

    Returns:
    tuple: (sentiment, confidence); ("Neutral", 0.5) when the cleaned text is
           empty or when any error occurs
    """
    neutral_result = ("Neutral", 0.5)

    try:
        cleaned = preprocess_text(text)
        # Nothing left to classify after cleaning
        if not cleaned:
            return neutral_result

        features = vectorizer.transform([cleaned])
        predicted_label = model.predict(features)[0]
        class_probabilities = model.predict_proba(features)[0]

        if predicted_label == 1:
            return "Positive", class_probabilities[1]
        return "Negative", class_probabilities[0]

    except Exception as e:
        print(f"Error predicting sentiment: {e}")
        return neutral_result

def save_models(model, vectorizer, task_optimizer, predictive_model):
    """
    Saves all models to disk as pickle files under 'models/'.

    Each artifact is saved independently: a failure is reported on stdout and
    the remaining artifacts are still written. Objects are serialized in
    memory before the target file is opened, so a failed save never leaves a
    truncated file behind or overwrites a previously saved good one.

    Note: pickle stores functions by qualified name only. Nested (local)
    functions and lambdas cannot be pickled, and loading a pickled function
    requires its defining module to be importable.

    Parameters:
    model: Trained sentiment model
    vectorizer: TF-IDF vectorizer
    task_optimizer: Task optimization function
    predictive_model: Predictive analytics function
    """
    # Do not rely on the directory having been created elsewhere.
    os.makedirs('models', exist_ok=True)

    artifacts = (
        ('sentiment model', 'models/sentiment_model.pkl', model),
        ('vectorizer', 'models/tfidf_vectorizer.pkl', vectorizer),
        ('task optimizer', 'models/task_optimizer.pkl', task_optimizer),
        ('predictive model', 'models/predictive_model.pkl', predictive_model),
    )

    failed = []
    for label, path, artifact in artifacts:
        try:
            # Serialize first: if this raises, the file on disk is untouched.
            payload = pickle.dumps(artifact)
            with open(path, 'wb') as f:
                f.write(payload)
        except (pickle.PicklingError, AttributeError, TypeError, OSError) as e:
            print(f"Error saving {label} to {path}: {e}")
            failed.append(label)

    if failed:
        print(f"\nCould not save: {', '.join(failed)}")
    else:
        print("\nAll models saved successfully!")

def test_models(model, vectorizer):
    """
    Runs the sentiment model on a few hard-coded task descriptions and prints
    the predicted sentiment and confidence for each one.

    Parameters:
    model: Trained sentiment model
    vectorizer: TF-IDF vectorizer
    """
    sample_descriptions = (
        "This task is urgent and needs immediate attention",
        "I'm looking forward to working on this interesting project",
        "This is a low priority task that can be done anytime",
        "This task is frustrating and difficult to complete",
    )

    print("\nTesting sentiment analysis on sample task descriptions:")
    for description in sample_descriptions:
        label, score = predict_sentiment(description, model, vectorizer)
        print(f"Text: '{description}'")
        print(f"Predicted sentiment: {label} (confidence: {score:.2f})\n")

def main(file_path='/Users/krishnashetty/Desktop/task_manger/Twitter_Data.csv'):
    """
    Main function to execute the sentiment analysis model development pipeline.

    Loads and preprocesses the dataset, trains the sentiment model, creates
    the task optimization and predictive models, saves everything, and runs a
    quick sanity check of the sentiment model on sample texts. Any error is
    reported with a traceback instead of being raised.

    Parameters:
    file_path (str): Path to the labelled CSV dataset. Defaults to the
                     original author's local copy, so calling main() without
                     arguments behaves as before.
    """
    try:
        print(f"Using dataset from: {file_path}")

        # Load and prepare data
        X, y = load_and_prepare_data(file_path)

        # Train the sentiment model
        model, vectorizer, metrics = train_sentiment_model(X, y)

        # Create task optimization and predictive models
        task_optimizer = create_task_optimization_model()
        predictive_model = create_predictive_analytics_model()

        # Save all models
        save_models(model, vectorizer, task_optimizer, predictive_model)

        # Test the sentiment model
        test_models(model, vectorizer)

        print("\nModel development completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure with full context rather
        # than letting the script/notebook cell die with a bare exception.
        print(f"Error in main execution: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Successfully downloaded NLTK resource: punkt
Successfully downloaded NLTK resource: stopwords
Successfully downloaded NLTK resource: wordnet
Using dataset from: /Users/krishnashetty/Desktop/task_manger/Twitter_Data.csv
Loading dataset from /Users/krishnashetty/Desktop/task_manger/Twitter_Data.csv...
Dataset shape: (162980, 2)

Sample data:
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0

Original sentiment values: [-1.  0.  1.]
Mapped sentiment values: [0 1]
Sentiment value counts after mapping:
sentiment_binary
0    90720
1    72249
Name: count, dtype: int64

Preprocessing text data...

Final sentiment distribution:
sentiment_binary
0    90651
1    72244
Name: 