In [1]:
# Install dependencies
!pip install textblob
!pip install vaderSentiment nltk nrclex
!pip install -U scikit-learn
!pip install tensorflow
!pip install nrclex
from google.colab import drive

# Import required libraries
import sys
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from textblob import TextBlob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Flatten
from nltk.tokenize import word_tokenize
import nltk
from nrclex import NRCLex
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer # For VADER

# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive', force_remount=True)
drive_libs_path = '/content/drive/MyDrive/libs'
# Put the Drive-hosted library folder on the module search path; this has to
# happen before the project-local imports below.
sys.path.append(drive_libs_path)

# Project-local helper modules, presumably stored in drive_libs_path — TODO confirm.
# NOTE(review): none of these three names is referenced in the visible part of
# this notebook cell.
from usage_examples import get_acceptance_indexes, CombinedAcceptanceIndexer
from noise_filter import NoiseFilter

# Download the NLTK resources: Punkt tokenizer data (needed by word_tokenize),
# the averaged-perceptron POS tagger and NLTK's VADER lexicon.
# NOTE(review): the VADER analyzer used in this file is imported from the
# separate vaderSentiment package, so the 'vader_lexicon' download may be
# unnecessary — confirm.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')



def load_and_filter_data(file_path):
    """Load the dataset CSV and normalise its text columns.

    No sampling or row filtering is performed; every row of the file is kept.

    Args:
        file_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file containing the columns 'clean_title', 'comments'
            and 'comments_orig'.

    Returns:
        pd.DataFrame: The loaded frame in which
            * 'clean_title' is a lower-cased string column,
            * 'comments' and 'comments_orig' are string columns,
            * 'separated_comment' is initialised to the empty string.
        Missing values in the three text columns become '' instead of the
        literal string 'nan'.

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    df = pd.read_csv(file_path, low_memory=False)

    # fillna('') must come before the cast: astype(str) on its own turns a
    # missing cell into the text 'nan', which would later be tokenized and
    # sentiment-scored as if it were a real word.
    for column in ('clean_title', 'comments', 'comments_orig'):
        df[column] = df[column].fillna('').astype(str)

    df['clean_title'] = df['clean_title'].str.lower()
    # Placeholder column; it is filled with the split comments later on.
    df['separated_comment'] = ''
    return df

def preprocess_data(df, method='textblob'):
    """Add token, sentiment and label columns to ``df`` (modified in place).

    Args:
        df (pd.DataFrame): Frame with the columns 'clean_title', 'comments',
            '2_way_label' and whatever ``process_comments`` reads per row.
        method (str, optional): Sentiment backend forwarded to
            ``process_comments`` and ``get_sentiment``. Defaults to 'textblob'.

    Returns:
        tuple: ``(df, label_encoder)`` where ``df`` is the same object that
        was passed in and ``label_encoder`` is the fitted ``LabelEncoder``.
    """
    def split_comments(raw):
        # Individual comments are delimited by the '|__|' marker.
        return raw.split('|__|')

    def score_row(row):
        return process_comments(row, method=method)

    def score_title(title):
        return get_sentiment(title, method=method)

    # Tokenized titles and per-row comment lists.
    df['tokenized_title'] = df['clean_title'].apply(word_tokenize)
    df['separated_comment'] = df['comments'].apply(split_comments)

    # Each scorer yields one (polarity, subjectivity) pair per row; zip(*...)
    # transposes those pairs into two column-sized tuples.
    comment_polarity, comment_subjectivity = zip(*df.apply(score_row, axis=1))
    df['comments_polarity'] = comment_polarity
    df['comments_subjectivity'] = comment_subjectivity

    title_polarity, title_subjectivity = zip(*df['clean_title'].apply(score_title))
    df['clean_title_polarity'] = title_polarity
    df['clean_title_subjectivity'] = title_subjectivity

    # Integer-encode the labels after casting them to strings.
    label_encoder = LabelEncoder()
    raw_labels = df['2_way_label'].astype(str)
    df['encoded_labels'] = label_encoder.fit_transform(raw_labels)
    return df, label_encoder

def process_comments(row, method='textblob'):
    """Score the sentiment of a row's original comment text.

    Args:
        row: Mapping-like record (for example a DataFrame row) expected to
            hold the comment text under the key 'comments_orig'.
        method (str, optional): Sentiment backend forwarded to
            ``get_sentiment``. Defaults to 'textblob'.

    Returns:
        tuple: ``(polarity, subjectivity)``. A neutral ``(0, 0)`` is returned
        when the row has no 'comments_orig' entry.

    Raises:
        ValueError: Propagated from ``get_sentiment`` for an unknown method.
    """
    try:
        comment = row['comments_orig']
    except (KeyError, AttributeError):
        # A missing key raises KeyError for both dicts and pandas Series;
        # catching only AttributeError left this fallback unreachable.
        return 0, 0
    polarity, subjectivity = get_sentiment(str(comment), method=method)
    return polarity, subjectivity

def get_sentiment(text, method='textblob'):
    """Return a ``(polarity, subjectivity)`` pair for ``text``.

    Args:
        text (str): Text to score.
        method (str, optional): One of 'textblob', 'vader' or 'nrc'.
            Defaults to 'textblob'.

    Returns:
        tuple: ``(polarity, subjectivity)`` computed as follows:
            * 'textblob': TextBlob's polarity and subjectivity.
            * 'vader': the compound score, returned in both positions.
            * 'nrc': polarity is the 'positive' minus the 'negative' affect
              frequency; subjectivity is the sum of all affect frequencies
              minus the 'anticipation' and 'trust' frequencies.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'textblob':
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity

    if method == 'vader':
        # Build the analyzer once and reuse it: this function is applied to
        # every row, and constructing a new analyzer per call is wasted work.
        analyzer = getattr(get_sentiment, '_vader_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            get_sentiment._vader_analyzer = analyzer
        compound = analyzer.polarity_scores(text)['compound']
        # NOTE(review): the compound score doubles as the subjectivity value
        # here — confirm that this stand-in is intended.
        return compound, compound

    if method == 'nrc':
        frequencies = NRCLex(text).affect_frequencies
        polarity = frequencies.get('positive', 0) - frequencies.get('negative', 0)
        subjectivity = (
            sum(frequencies.values())
            - frequencies.get('anticipation', 0)
            - frequencies.get('trust', 0)
        )
        return polarity, subjectivity

    raise ValueError(f"Invalid method: {method}. Choose from 'textblob', 'vader', or 'nrc'.")


def build_cnn_bilstm_model(vocab_size, max_length):
    """Create and compile the CNN + bidirectional-LSTM binary classifier.

    Args:
        vocab_size (int): Size of the embedding vocabulary (``input_dim``).
        max_length (int): Value passed to the embedding as ``input_length``.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    embedding = Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length)
    convolution = Conv1D(filters=64, kernel_size=3, activation='relu')
    pooling = MaxPooling1D(pool_size=2)
    # return_sequences=True keeps the per-step outputs, which Flatten then
    # collapses into a single vector per sample.
    recurrent = Bidirectional(LSTM(64, return_sequences=True))
    flatten = Flatten()
    dropout = Dropout(0.5)
    hidden = Dense(64, activation='relu')
    # One sigmoid unit: the output is the probability of the positive class.
    output = Dense(1, activation='sigmoid')

    model = Sequential([
        embedding,
        convolution,
        pooling,
        recurrent,
        flatten,
        dropout,
        hidden,
        output,
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

def tokenize_and_pad(df, max_length, base_feature=False, method='textblob', tokenizer=None, pre_tokenized_titles=None):
    """
    Build the model input matrix from titles, comments and numeric features.

    The columns of the returned matrix are laid out, left to right, as:
    ``max_length`` title token ids, ``max_length`` comment token ids, the four
    sentiment columns, and optionally one acceptance-indexer column.

    Args:
        df (pd.DataFrame): Frame holding 'clean_title', 'separated_comment'
            (an iterable of strings per row), the four sentiment columns and,
            when needed, the acceptance-indexer column for ``method``.
        max_length (int): Length every title and comment sequence is padded
            (or truncated) to.
        base_feature (bool, optional): When true, no acceptance-indexer column
            is appended. Defaults to False.
        method (str, optional): Selects the acceptance-indexer column
            ('textblob', 'nrc' or 'vader'); any other value appends nothing.
            Defaults to 'textblob'.
        tokenizer (tf.keras.preprocessing.text.Tokenizer, optional): An already
            fitted tokenizer to reuse. When None a new one is created and
            fitted on the titles and comments. Defaults to None.
        pre_tokenized_titles (list, optional): One list of tokens per title,
            used instead of tokenizing 'clean_title'. Defaults to None.

    Returns:
        tuple: ``(X, tokenizer)`` — the feature matrix and the tokenizer used.
    """
    def joined_comments():
        # Each row's comments are joined into one space-separated string.
        return df['separated_comment'].apply(lambda parts: ' '.join(parts))

    has_token_lists = pre_tokenized_titles is not None

    if tokenizer is not None:
        print("Using existing Tokenizer")  # Signal that no new tokenizer is fitted
    else:
        tokenizer = tf.keras.preprocessing.text.Tokenizer()
        if has_token_lists:
            # Every individual title token is handed to the tokenizer as its
            # own text.
            title_texts = [token
                           for title_tokens in pre_tokenized_titles
                           for token in title_tokens]
        else:
            title_texts = df['clean_title'].tolist()
        tokenizer.fit_on_texts(title_texts + joined_comments().tolist())

    if has_token_lists:
        # Tokens missing from the vocabulary are mapped to id 0.
        vocabulary = tokenizer.word_index
        title_sequences = [[vocabulary.get(token, 0) for token in title_tokens]
                           for title_tokens in pre_tokenized_titles]
    else:
        title_sequences = tokenizer.texts_to_sequences(df['clean_title'])

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    padded_title_sequences = pad_sequences(
        title_sequences, maxlen=max_length, padding='post')
    comment_sequences = tokenizer.texts_to_sequences(joined_comments())
    padded_comment_sequences = pad_sequences(
        comment_sequences, maxlen=max_length, padding='post')

    # The sentiment columns are always part of the feature matrix.
    sentiment_columns = [
        'comments_polarity',
        'comments_subjectivity',
        'clean_title_polarity',
        'clean_title_subjectivity',
    ]
    sentiment_features = df[sentiment_columns].values

    # Acceptance-indexer column for each method; the column is assumed to
    # exist in df already.
    indexer_columns = {
        'textblob': 'TextBlobAcceptanceIndexer',
        'nrc': 'NRCAcceptanceIndexer',
        'vader': 'VADERAcceptanceIndexer',
    }
    indexer_column = None if base_feature else indexer_columns.get(method)
    if indexer_column is None:
        # Base features only, or an unrecognized method: contribute a
        # zero-width block so the concatenation below still works.
        acceptance_indexer_features = np.empty((len(df), 0))
    else:
        acceptance_indexer_features = df[[indexer_column]].values

    # Titles | comments | sentiment | acceptance indexer, side by side.
    X = np.concatenate(
        [
            padded_title_sequences,
            padded_comment_sequences,
            sentiment_features,
            acceptance_indexer_features,
        ],
        axis=1,
    )
    return X, tokenizer

def main():
    """Run the full pipeline: load, featurize, train, evaluate and report."""
    dataset_path = "/content/drive/My Drive/datasets/data_with_indexers_applied_to_original_data_all_acceptance_indexers_v36.csv"

    # Load the CSV and derive the token, sentiment and label columns.
    # NOTE(review): sentiment is scored with 'vader' here while the
    # acceptance-indexer column below is selected with 'nrc' — confirm that
    # this mismatch is intended.
    frame, _ = preprocess_data(load_and_filter_data(dataset_path), method='vader')

    # Feature matrix and targets.
    max_length = 100  # Padding length for titles and for comments
    features, tokenizer = tokenize_and_pad(
        frame,
        max_length,
        base_feature=False,
        method='nrc',
        pre_tokenized_titles=frame['tokenized_title'].tolist(),
    )
    targets = frame['encoded_labels']
    print("X shape:", features.shape)

    # Hold out 20% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42)

    # +1 because tokenizer ids start at 1 and id 0 is used for padding.
    model = build_cnn_bilstm_model(len(tokenizer.word_index) + 1, max_length)

    # NOTE(review): the held-out test split is also used as validation data.
    model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_test, y_test),
    )

    loss, accuracy = model.evaluate(X_test, y_test)

    # Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
    predicted_labels = (model.predict(X_test) > 0.5).astype(int)

    print(f"Test Accuracy: {accuracy:.2f}")
    print(f"Test Loss: {loss:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_labels))


if __name__ == "__main__":
    main()


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting nrclex
  Downloading NRCLex-4.0-py3-none-any.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of nrclex to determine which version is compatible with other requirements. This could take a while.
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: nrclex
  Building wheel for nrclex (setup.py) ... [?25l[?25hdone
  Created wheel for nrclex: filename=NRCLex-3.0.0-py3-none-any.whl size=43309 sha256=a554a1a412fbe705f7302d80c9b905e6900730e778391ba82c53fe0590830889
  Stor

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


X shape: (100, 205)




Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 694ms/step - accuracy: 0.6961 - loss: 0.6469 - val_accuracy: 0.9000 - val_loss: 0.3493
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 288ms/step - accuracy: 0.7688 - loss: 0.5361 - val_accuracy: 0.9000 - val_loss: 0.4237
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 316ms/step - accuracy: 0.7805 - loss: 0.4865 - val_accuracy: 0.9000 - val_loss: 0.3566
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 211ms/step - accuracy: 0.8039 - loss: 0.4387 - val_accuracy: 0.9000 - val_loss: 0.3723
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step - accuracy: 0.7688 - loss: 0.4053 - val_accuracy: 0.9000 - val_loss: 0.3893
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 176ms/step - accuracy: 0.8180 - loss: 0.3064 - val_accuracy: 0.9000 - val_loss: 0.3276
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━