In [None]:
!pip install numpy pandas scikit-learn torch tensorflow


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

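BiLSTM+RF: BiLSTM text features (frozen GloVe embeddings) combined with tabular features, classified by a Random Forest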

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D
from google.colab import drive
import random
import os

In [None]:
# Single seed shared by every random number generator used in the notebook.
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here probably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed Python's `random`, NumPy's legacy global RNG and TensorFlow in one pass.
for seeder in (np.random.seed, tf.random.set_seed, random.seed):
    seeder(seed)

In [None]:
# Mount Google Drive: the CSV, the GloVe vectors and the saved model are all
# read from / written to /content/drive/MyDrive.
# `drive` is already imported in the imports cell at the top of the notebook.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Only the first 100,000 rows are used, so let the parser stop there (`nrows`)
# instead of loading the whole file and slicing afterwards.
# Missing titles become empty strings so the tokenizer always receives text.
df = (
    pd.read_csv('/content/drive/MyDrive/polarity_results (1).csv', nrows=100_000)
    .assign(title=lambda frame: frame['title'].fillna(''))
)

In [None]:
# Column roles used by every later cell.
target_col = '2_way_label'   # label to predict
text_col = 'title'           # free text, fed to the tokenizer / BiLSTM
tabular_cols = [             # numeric columns, scaled and appended to the text features
    'num_comments',
    'score',
    'upvote_ratio',
    'polarity',
    'emotion_score',
]

In [None]:
# Features are the title text plus the numeric columns; the target is the 0/1 label.
X = df[[text_col] + tabular_cols]
y = df[target_col]

# Two chained 80/20 splits -> 64% train, 16% validation, 20% test.
# Both are stratified on the label and driven by the notebook-wide `seed`
# (instead of a repeated literal) so there is a single place to change it.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=seed, stratify=y_temp
)

In [None]:
# Tokenize text.
# Every title is padded / truncated to `max_len` token ids (see preprocess_text).
max_len = 40

# The vocabulary is fitted on the training titles only; words not kept in the
# vocabulary are encoded as the "<OOV>" token.
tokenizer = Tokenizer(
    num_words=20000,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(X_train[text_col])

def preprocess_text(text_series):
    """Encode raw texts as fixed-length sequences of token ids.

    Uses the notebook-level `tokenizer` and `max_len`.

    Args:
        text_series: Iterable of strings (here a DataFrame column of titles).

    Returns:
        The output of `pad_sequences`: one row per text, padded and truncated
        at the end ('post') to `max_len` entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(text_series),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

# Encode the title column of each split with the tokenizer fitted on train.
X_train_seq, X_val_seq, X_test_seq = (
    preprocess_text(split[text_col]) for split in (X_train, X_val, X_test)
)


In [None]:
def load_glove(path, dim):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Each line is expected to be ``<token> <v1> ... <v_dim>`` separated by
    single spaces.

    Args:
        path: Path to the UTF-8 encoded GloVe text file.
        dim: Number of vector components per token.

    Returns:
        dict mapping each token (str) to a float32 NumPy array of length `dim`.
        Lines that cannot be parsed into a token plus `dim` numbers are skipped.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) != dim + 1:
                # The token itself may contain whitespace (e.g. ". . ." or a
                # non-breaking space), which the generic split() above breaks
                # apart. Re-split from the right on the single-space separator
                # so the last `dim` fields are the vector and the rest is the token.
                values = line.rstrip('\n').rsplit(' ', dim)
                if len(values) != dim + 1 or not values[0]:
                    continue  # Skip malformed lines
            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue  # Skip lines whose vector fields are not numeric
            embeddings_index[values[0]] = vector
    return embeddings_index


# Load the pre-trained vectors from Drive; the file name indicates the
# 300-dimensional GloVe 840B release, so `embedding_dim` must stay at 300.
embedding_dim = 300
glove_path = '/content/drive/MyDrive/glove.840B.300d.txt'
glove = load_glove(path=glove_path, dim=embedding_dim)


In [None]:
# Build the (num_words, embedding_dim) matrix that initialises the Embedding layer.
word_index = tokenizer.word_index

# The row count follows the tokenizer's own vocabulary cap rather than a repeated
# literal, so the two cannot drift apart. The +1 accounts for index 0, which the
# Keras tokenizer never assigns to a word (it is the padding id).
num_words = min(tokenizer.num_words, len(word_index) + 1)

# `embedding_dim` is the value set when the GloVe file was loaded; it is reused
# here (not redefined) so the matrix width always matches the loaded vectors.
embedding_matrix = np.zeros((num_words, embedding_dim))

# Copy the GloVe vector of every in-vocabulary word into its row; words without
# a GloVe entry (and row 0) stay all-zero.
for word, idx in word_index.items():
    vector = glove.get(word) if idx < num_words else None
    if vector is not None:
        embedding_matrix[idx] = vector


In [None]:
# BiLSTM text feature extractor:
# token ids -> frozen GloVe embeddings -> BiLSTM (64 units per direction) -> max over time.
# Model / layer classes come from the imports cell at the top of the notebook.
input_seq = Input(shape=(max_len,))
x = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False,  # freeze GloVe weights
)(input_seq)  # the sequence length is taken from `Input`; deprecated `input_length` is not passed
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)

bilstm_model = Model(inputs=input_seq, outputs=x)

# NOTE(review): the model is never compiled or fitted in the cells shown here, so
# the LSTM weights that get saved (and later used for feature extraction) are the
# seeded random initial weights - confirm this is intended.
bilstm_model.save('/content/drive/MyDrive/bilstm_feature_extractor.h5')
# To reuse the saved extractor instead of rebuilding it:
# bilstm_model = load_model('/content/drive/MyDrive/bilstm_feature_extractor.h5')




In [None]:
# Run the BiLSTM extractor over each split, in train / val / test order, to get
# one fixed-size text feature vector per title.
train_text_embed, val_text_embed, test_text_embed = (
    bilstm_model.predict(seqs, batch_size=256, verbose=1)
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 232ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 241ms/step
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 225ms/step


In [None]:
# Standardise the numeric columns. The scaler statistics come from the training
# split only and are then applied unchanged to validation and test.
scaler = StandardScaler().fit(X_train[tabular_cols])

train_tabular, val_tabular, test_tabular = (
    scaler.transform(split[tabular_cols]) for split in (X_train, X_val, X_test)
)


In [None]:
# Final feature matrix per split: BiLSTM text features followed by the scaled
# tabular columns.
X_train_all = np.hstack([train_text_embed, train_tabular])
X_val_all = np.hstack([val_text_embed, val_tabular])
X_test_all = np.hstack([test_text_embed, test_tabular])

# RandomForestClassifier and the metric helpers are imported in the imports cell
# at the top of the notebook; the forest reuses the notebook-wide `seed`.
clf = RandomForestClassifier(n_estimators=100, random_state=seed, n_jobs=-1)
clf.fit(X_train_all, y_train)

# Validation Results
y_val_pred = clf.predict(X_val_all)
val_probs = clf.predict_proba(X_val_all)[:, 1]  # probability of the second class (label 1 here)
val_auc = roc_auc_score(y_val, val_probs)

print("Validation Results:")
print(classification_report(y_val, y_val_pred))
print(f"Validation AUC: {val_auc:.4f}")


Validation Results:
              precision    recall  f1-score   support

           0       0.91      0.79      0.85      8008
           1       0.82      0.93      0.87      7992

    accuracy                           0.86     16000
   macro avg       0.87      0.86      0.86     16000
weighted avg       0.87      0.86      0.86     16000

Validation AUC: 0.9419


In [None]:
# Held-out test evaluation with the forest fitted on the training split.
y_test_pred = clf.predict(X_test_all)
test_probs = clf.predict_proba(X_test_all)[:, 1]  # probability of the second class (label 1 here)
test_auc = roc_auc_score(y_test, test_probs)

print("Test Results:")
print(classification_report(y_test, y_test_pred))
print(f"Test AUC: {test_auc:.4f}")


Test Results:
              precision    recall  f1-score   support

           0       0.91      0.79      0.85     10009
           1       0.82      0.92      0.87      9991

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

Test AUC: 0.9402
