In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
class SentimentAnalyser:
    """TF-IDF + logistic-regression sentiment classifier.

    Sentiment labels are encoded as 0 = negative, 1 = neutral, 2 = positive.
    Typical use is ``prepare_data`` -> ``train_model`` -> ``evaluate_model``
    -> ``predict_sentiment``, optionally persisting the fitted objects with
    ``save_model`` / ``load_model``.

    Every public method reports failures by printing a message and returning
    a sentinel (``None`` / ``False``) instead of raising.
    """

    # Encoding applied to textual sentiment labels, and its inverse.
    _LABEL_TO_ID = {'negative': 0, 'neutral': 1, 'positive': 2}
    _ID_TO_LABEL = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # (lookup path, download id) for each NLTK resource the class relies on.
    # 'punkt_tab' is what recent NLTK releases load inside word_tokenize;
    # 'punkt' is kept for older installations.
    _NLTK_RESOURCES = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    )

    # Patterns are compiled once because preprocess_text runs per row.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        """Ensure the NLTK resources exist and set up an untrained analyser.

        Missing resources are fetched with ``nltk.download``, which may
        access the network.
        """
        for resource_path, package in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # Both stay None until train_model() or load_model() succeeds.
        self.vectorizer = None
        self.model = None

    def preprocess_text(self, text):
        """Clean and preprocess text data.

        Lower-cases the text, strips HTML tags and URLs, removes every
        character that is not an ASCII letter or whitespace, tokenizes,
        drops stop words and tokens of one or two characters, and
        lemmatizes what is left.

        Args:
            text: Raw text. ``None`` and NaN (what pandas yields for an empty
                CSV cell) are treated as empty text; any other non-string
                value is converted with ``str``.

        Returns:
            The processed tokens joined by single spaces (possibly ``''``).
        """
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).lower()

        # Tags go first: the URL pattern is greedy up to the next whitespace,
        # so running it first on '<a href="http://x">link</a>' would swallow
        # the link text and leave a dangling '<a href="'.
        # Tags and URLs become a space so the words around them do not fuse.
        text = self._HTML_TAG_RE.sub(' ', text)
        text = self._URL_RE.sub(' ', text)

        # Remove special characters and numbers
        text = self._NON_LETTER_RE.sub('', text)

        tokens = nltk.word_tokenize(text)

        # The stop-word test runs on the raw token, before lemmatization.
        processed_tokens = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words and len(word) > 2
        ]

        return ' '.join(processed_tokens)

    def prepare_data(self, file_path):
        """Load a CSV file and split it into train and test sets.

        The file must have ``text`` and ``sentiment`` columns. Textual
        sentiment labels are matched case-insensitively (ignoring
        surrounding whitespace) against negative / neutral / positive and
        encoded as 0 / 1 / 2; numeric labels are used as they are. Rows
        whose label is missing or unrecognised are dropped with a message.

        Args:
            file_path: Path (or anything ``pandas.read_csv`` accepts).

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with
            ``random_state=42``, or ``(None, None, None, None)`` if anything
            goes wrong (the error is printed).
        """
        print("Loading dataset...")
        try:
            data = pd.read_csv(file_path)

            # Check if data has the right columns
            required_columns = ['text', 'sentiment']
            if not all(col in data.columns for col in required_columns):
                raise ValueError(f"Dataset must contain these columns: {required_columns}")

            print("Preprocessing text data...")
            data['processed_text'] = data['text'].apply(self.preprocess_text)

            labels = data['sentiment']
            # Test for "numeric" rather than "dtype == object": text columns
            # are not guaranteed to use the object dtype.
            labels_are_numeric = pd.api.types.is_numeric_dtype(labels)
            if labels_are_numeric:
                encoded = labels
            else:
                print("Encoding sentiment labels...")
                normalised = labels.astype(str).str.strip().str.lower()
                encoded = normalised.map(self._LABEL_TO_ID)

            # A NaN label would make the classifier's fit() fail later on.
            usable = encoded.notna()
            dropped = int((~usable).sum())
            if dropped:
                print(f"Dropping {dropped} rows with missing or unrecognised sentiment labels")
                data = data.loc[usable].copy()
                encoded = encoded.loc[usable]
            if data.empty:
                raise ValueError("Dataset contains no rows with a usable sentiment label")
            if not labels_are_numeric:
                # map() yields floats whenever some label was unmapped.
                encoded = encoded.astype(int)
            data['sentiment_encoded'] = encoded

            print("Splitting data into train and test sets...")
            X_train, X_test, y_train, y_test = train_test_split(
                data['processed_text'],
                data['sentiment_encoded'],
                test_size=0.2,
                random_state=42
            )

            return X_train, X_test, y_train, y_test

        except Exception as e:
            print(f"Error preparing data: {e}")
            return None, None, None, None

    def train_model(self, X_train, y_train):
        """Fit the TF-IDF vectorizer and the logistic-regression model.

        Args:
            X_train: Iterable of preprocessed text documents.
            y_train: Matching sentiment labels.

        Returns:
            True on success. On failure the error is printed, False is
            returned and any previously trained model/vectorizer pair is
            left untouched.
        """
        try:
            print("Creating TF-IDF features...")
            vectorizer = TfidfVectorizer(max_features=5000)
            X_train_tfidf = vectorizer.fit_transform(X_train)

            print("Training logistic regression model...")
            model = LogisticRegression(C=1, max_iter=1000)
            model.fit(X_train_tfidf, y_train)

        except Exception as e:
            print(f"Error training model: {e}")
            return False

        # Publish both objects together so a failed (re)train can never pair
        # a new vectorizer with an old model.
        self.vectorizer = vectorizer
        self.model = model
        print("Model training complete!")
        return True

    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance.

        Prints the accuracy and the classification report, and shows a
        confusion-matrix plot via ``plt.show()``.

        Args:
            X_test: Iterable of preprocessed text documents.
            y_test: Matching true sentiment labels.

        Returns:
            ``(accuracy, report)``, or ``(None, None)`` if the model is not
            trained or evaluation fails (a message is printed).
        """
        if self.model is None or self.vectorizer is None:
            print("Model not trained. Please train the model first.")
            return None, None

        try:
            X_test_tfidf = self.vectorizer.transform(X_test)
            y_pred = self.model.predict(X_test_tfidf)

            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred)

            # Fix the label order explicitly so the tick labels below always
            # describe the matrix rows/columns, even when fewer (or other)
            # than three classes occur.
            label_ids = np.unique(
                np.concatenate((np.asarray(y_test), np.asarray(y_pred)))
            )
            cm = confusion_matrix(y_test, y_pred, labels=label_ids)

            print(f"Model Accuracy: {accuracy:.4f}")
            print("\nClassification Report:")
            print(report)

            plt.figure(figsize=(8, 6))
            plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
            plt.title('Confusion Matrix')
            plt.colorbar()

            classes = [
                str(self._ID_TO_LABEL.get(label, label)).capitalize()
                for label in label_ids
            ]
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            # White text on dark cells, black on light ones.
            thresh = cm.max() / 2
            for i in range(cm.shape[0]):
                for j in range(cm.shape[1]):
                    plt.text(j, i, format(cm[i, j], 'd'),
                             horizontalalignment="center",
                             color="white" if cm[i, j] > thresh else "black")

            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.tight_layout()
            plt.show()

            return accuracy, report

        except Exception as e:
            print(f"Error evaluating model: {e}")
            return None, None

    def predict_sentiment(self, text):
        """Predict sentiment for new text.

        Args:
            text: Raw (unprocessed) text.

        Returns:
            A dict with keys ``'text'`` (the input), ``'sentiment'``
            (``'negative'``, ``'neutral'`` or ``'positive'``) and
            ``'confidence'`` (probability per sentiment, rounded to four
            decimals; 0.0 for a sentiment the model was not trained on).
            Returns None if the model is not trained or prediction fails
            (a message is printed).
        """
        if self.model is None or self.vectorizer is None:
            print("Model not trained. Please train the model first.")
            return None

        try:
            processed_text = self.preprocess_text(text)
            text_tfidf = self.vectorizer.transform([processed_text])

            prediction = self.model.predict(text_tfidf)[0]
            probabilities = self.model.predict_proba(text_tfidf)[0]

            sentiment = self._ID_TO_LABEL[prediction]

            # predict_proba columns follow model.classes_, not 0/1/2: a model
            # trained on only some of the classes has fewer columns, so pair
            # each probability with its class id instead of its position.
            confidence = {'negative': 0.0, 'neutral': 0.0, 'positive': 0.0}
            for class_id, probability in zip(self.model.classes_, probabilities):
                label = self._ID_TO_LABEL.get(class_id)
                if label is not None:
                    confidence[label] = round(float(probability), 4)

            return {
                'text': text,
                'sentiment': sentiment,
                'confidence': confidence
            }

        except Exception as e:
            print(f"Error predicting sentiment: {e}")
            return None

    def save_model(self, model_path='sentiment_model.pkl', vectorizer_path='vectorizer.pkl'):
        """Save trained model and vectorizer to files.

        Args:
            model_path: Destination of the pickled model.
            vectorizer_path: Destination of the pickled vectorizer.

        Returns:
            True on success; False if nothing is trained or writing fails
            (a message is printed).
        """
        if self.model is None or self.vectorizer is None:
            print("Model not trained. Nothing to save.")
            return False

        try:
            with open(model_path, 'wb') as f:
                pickle.dump(self.model, f)

            with open(vectorizer_path, 'wb') as f:
                pickle.dump(self.vectorizer, f)

            print(f"Model saved to {model_path} and vectorizer saved to {vectorizer_path}")
            return True

        except Exception as e:
            print(f"Error saving model: {e}")
            return False

    def load_model(self, model_path='sentiment_model.pkl', vectorizer_path='vectorizer.pkl'):
        """Load trained model and vectorizer from files.

        SECURITY: unpickling can execute arbitrary code. Only load files
        that were written by ``save_model`` and come from a trusted source.

        Args:
            model_path: Path of the pickled model.
            vectorizer_path: Path of the pickled vectorizer.

        Returns:
            True on success. On failure the error is printed, False is
            returned and the current model/vectorizer are left unchanged.
        """
        try:
            with open(model_path, 'rb') as f:
                model = pickle.load(f)

            with open(vectorizer_path, 'rb') as f:
                vectorizer = pickle.load(f)

        except Exception as e:
            print(f"Error loading model: {e}")
            return False

        # Assign only after both files were read, so a missing or corrupt
        # second file cannot leave a half-loaded instance.
        self.model = model
        self.vectorizer = vectorizer
        print("Model and vectorizer loaded successfully!")
        return True