In [None]:
# Install dependencies
!pip install textblob
!pip install -U scikit-learn
!pip install tensorflow
!pip install nrclex
from google.colab import drive

# Import required libraries
import sys
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from textblob import TextBlob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Flatten
from nltk.tokenize import word_tokenize
import nltk

# Mount Google Drive to access datasets
drive.mount('/content/drive', force_remount=True)
# Add the project's helper-module directory on Drive to the import search path.
drive_libs_path = '/content/drive/My Drive/TUS/Engineering_Project/data/Release/libs'
sys.path.append(drive_libs_path)

# Project-local helpers; presumably resolved from drive_libs_path, so these
# imports must stay after the sys.path update above.
from usage_examples import get_acceptance_indexes, CombinedAcceptanceIndexer
from noise_filter import NoiseFilter

# Download necessary NLTK data
# NOTE(review): this file only calls word_tokenize directly; the tagger and
# VADER resources are presumably required by the project-local helpers — confirm.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')


# Feature flag read by preprocess_data(): when True an 'acceptance_index'
# column is computed; when False any existing 'acceptance_index' column is dropped.
withAcceptanceIndexFlag = True

def load_and_filter_data(file_path):
    """
    Load a CSV/TSV dataset, apply the project noise filter, and normalise the text columns.

    Args:
        file_path: Path to a ``.csv`` or ``.tsv`` file. The extension is
            matched case-insensitively.

    Returns:
        A DataFrame with the rows returned by ``NoiseFilter.apply()``, in which
        ``clean_title`` is a lower-cased string column and ``comments`` is a
        string column. Missing values in either column become ``''``.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.tsv``.
    """
    # Pick the delimiter from the file extension.
    normalised_path = file_path.lower()
    if normalised_path.endswith('.tsv'):
        df = pd.read_csv(file_path, sep='\t', low_memory=False)
    elif normalised_path.endswith('.csv'):
        df = pd.read_csv(file_path, low_memory=False)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or TSV file.")

    # Apply noise filters
    filter_config = {
        'MinLengths': {'clean_title': 10, 'comments': 10},
        'ExcludeImages': True
    }
    noise_filter = NoiseFilter(df, filter_config)
    # Work on an explicit copy: the filter may hand back a slice of ``df``, and
    # assigning columns on a slice triggers pandas' SettingWithCopyWarning.
    filtered_df = noise_filter.apply().copy()

    # Replace missing values *before* casting to str; otherwise NaN becomes the
    # literal text 'nan', which is then scored for sentiment and bypasses the
    # empty/NaN guard in calculate_acceptance_index.
    filtered_df['clean_title'] = filtered_df['clean_title'].fillna('').astype(str).str.lower()
    filtered_df['comments'] = filtered_df['comments'].fillna('').astype(str)
    return filtered_df

def preprocess_data(df):
    """
    Add the derived feature and label columns to ``df`` (modified in place).

    Columns written: ``tokenized_title``, the polarity/subjectivity pairs for
    ``comments`` and ``clean_title``, ``acceptance_index`` (only when
    ``withAcceptanceIndexFlag`` is true; otherwise that column is removed if
    present), and ``encoded_labels`` derived from ``2_way_label``.

    Returns:
        A ``(df, label_encoder)`` tuple: the same DataFrame object and the
        fitted ``LabelEncoder``.
    """
    # Word-level tokens of every title.
    df['tokenized_title'] = df['clean_title'].apply(word_tokenize)

    # Each get_sentiment call yields (polarity, subjectivity); zip(*) turns the
    # per-row pairs into two column-sized tuples.
    comment_scores = df['comments'].apply(get_sentiment)
    df['comments_polarity'], df['comments_subjectivity'] = zip(*comment_scores)
    title_scores = df['clean_title'].apply(get_sentiment)
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*title_scores)

    if withAcceptanceIndexFlag:
        # Row-wise acceptance index computed from the title and its comments.
        def row_acceptance_index(row):
            return calculate_acceptance_index(row['clean_title'], row['comments'])

        df['acceptance_index'] = df.apply(row_acceptance_index, axis=1)
    elif 'acceptance_index' in df.columns:
        # Feature disabled: make sure a stale column is not carried along.
        df.drop(columns=['acceptance_index'], inplace=True)

    # Map the (stringified) 2-way labels onto integer class ids.
    encoder = LabelEncoder()
    labels_as_text = df['2_way_label'].astype(str)
    df['encoded_labels'] = encoder.fit_transform(labels_as_text)
    return df, encoder

def get_sentiment(text):
    """
    Return the TextBlob ``(polarity, subjectivity)`` pair for ``text``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

def calculate_acceptance_index(title, comments):
    """
    Return the acceptance index of ``title`` given its ``comments`` string.

    ``comments`` holds the individual comments joined by ``'|__|'``. A missing
    or empty value yields ``0`` without invoking the indexer; otherwise the
    result of ``CombinedAcceptanceIndexer.calculate_acceptance_index()`` is
    returned.
    """
    has_comments = not pd.isna(comments) and comments != ''
    if not has_comments:
        return 0
    return CombinedAcceptanceIndexer(
        title, comments.split('|__|')
    ).calculate_acceptance_index()

def build_cnn_bilstm_model(vocab_size, max_length):
    """
    Build and compile a CNN-BiLSTM binary classifier using TensorFlow/Keras.

    Args:
        vocab_size: Size of the embedding vocabulary (number of token ids,
            including the padding id 0).
        max_length: Length of each padded input sequence.

    Returns:
        A compiled ``Sequential`` model: Embedding -> Conv1D -> MaxPooling1D ->
        BiLSTM -> Flatten -> Dropout -> Dense -> single sigmoid unit, trained
        with Adam on binary cross-entropy and reporting accuracy.
    """
    model = Sequential([
        # Declare the input shape explicitly. Embedding's ``input_length``
        # argument is deprecated (and ignored) in Keras 3; an Input layer keeps
        # the sequence length known so Flatten/Dense can be built up front.
        tf.keras.Input(shape=(max_length,)),
        Embedding(input_dim=vocab_size, output_dim=128),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Bidirectional(LSTM(64, return_sequences=True)),
        Flatten(),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')  # Use 'sigmoid' for binary classification
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def tokenize_and_pad(df, max_length):
    """
    Turn the ``clean_title`` column into fixed-length integer sequences.

    A fresh Keras ``Tokenizer`` is fitted on the titles of ``df``; the titles
    are then converted to id sequences and post-padded to ``max_length``.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple.
    """
    titles = df['clean_title']

    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(titles)

    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(titles),
        maxlen=max_length,
        padding='post',
    )
    return padded_sequences, tokenizer
def main():
    """
    Train and evaluate the CNN-BiLSTM title classifier end to end.

    Steps: load and filter the dataset, derive features/labels, split into
    stratified train/test sets, fit the tokenizer on the training titles only,
    train with early stopping on a validation slice of the training data,
    report test accuracy/loss and the confusion matrix, then score one
    example title.
    """
    # Specify dataset path
    file_path = "/content/drive/MyDrive/TUS/Engineering_Project/data/Release/data_with_indexers_applied_to_original_data_all_acceptance_indexers_v36.tsv"

    # Load and preprocess data
    df = load_and_filter_data(file_path)
    df, label_encoder = preprocess_data(df)
    # Show which original label each encoded class id stands for, so the
    # confusion matrix and the sigmoid output below can be interpreted.
    print(f"Label encoding: {dict(enumerate(label_encoder.classes_))}")

    max_length = 100  # Max length for padding

    # Split the rows first; stratify because the two classes are imbalanced.
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['encoded_labels']
    )

    # Fit the tokenizer on training titles only so that no test-set vocabulary
    # leaks into the model.
    X_train, tokenizer = tokenize_and_pad(train_df, max_length)

    def encode_titles(titles):
        # Same encoding/padding settings as tokenize_and_pad, reusing the fitted tokenizer.
        sequences = tokenizer.texts_to_sequences(titles)
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=max_length, padding='post'
        )

    X_test = encode_titles(test_df['clean_title'])
    # Plain NumPy label arrays (Keras' validation_split expects arrays/tensors).
    y_train = train_df['encoded_labels'].to_numpy()
    y_test = test_df['encoded_labels'].to_numpy()

    # Build CNN-BiLSTM model
    vocab_size = len(tokenizer.word_index) + 1
    model = build_cnn_bilstm_model(vocab_size, max_length)

    # Train the model. Validation uses a slice of the *training* data so the
    # test set stays untouched until the final evaluation. Early stopping
    # restores the best weights: the model memorises the training set within a
    # few epochs while the validation loss keeps rising.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2, restore_best_weights=True
    )
    model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping],
    )

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test)

    # Predict on the test data to get predictions for the confusion matrix
    y_pred = model.predict(X_test)
    # Threshold the probabilities and flatten (n, 1) -> (n,) to match y_test.
    y_pred = (y_pred > 0.5).astype(int).ravel()
    print(f"Test Accuracy: {accuracy:.2f}")
    print(f"Test Loss: {loss:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Make predictions on new data
    new_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    predictions = model.predict(encode_titles([new_title]))

    print("\nPredictions:")
    # Use the prediction for the new title
    print(f"Title: {new_title}")
    # The sigmoid output is the probability of encoded label 1, not a
    # confidence score: a value near 0 is a confident prediction of label 0.
    print(f"Prediction: {predictions[0][0]:.2f} (probability of encoded label 1)")

if __name__ == "__main__":
    main()


Mounted at /content/drive


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Epoch 1/10




[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 130ms/step - accuracy: 0.6469 - loss: 0.6182 - val_accuracy: 0.8000 - val_loss: 0.5324
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 85ms/step - accuracy: 0.7558 - loss: 0.5521 - val_accuracy: 0.8000 - val_loss: 0.4657
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step - accuracy: 0.8241 - loss: 0.3099 - val_accuracy: 0.7677 - val_loss: 0.7037
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 99ms/step - accuracy: 0.9730 - loss: 0.0722 - val_accuracy: 0.6774 - val_loss: 0.9947
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 146ms/step - accuracy: 1.0000 - loss: 0.0040 - val_accuracy: 0.7871 - val_loss: 0.8977
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 0.7742 - val_loss: 1.0097
Epoch 7/10
[1m20/20[0m [32m━━━━━━━━━━━━



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 182ms/step
Test Accuracy: 0.73
Test Loss: 1.18
Confusion Matrix:
[[14 17]
 [25 99]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step

Predictions:
Title: Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?
Prediction: 0.01 (Confidence Level)
