In [None]:
!pip install contractions
!pip install spacy

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K

In [None]:
import pandas as pd
import numpy as np
import re
import time
from datetime import timedelta
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import contractions
import spacy
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Embedding, Conv1D, GlobalMaxPooling1D,
                                     Dense, Dropout, Input, Concatenate)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
import en_core_web_sm
import nltk
import warnings

# Silence every warning category for the rest of the session (this is global,
# so deprecation notices from the libraries imported above are hidden too).
warnings.filterwarnings('ignore')

# Fetch the NLTK stopword corpus; `stopwords.words('english')` is read by the
# Preprocessor class defined later in this file.
nltk.download('stopwords')

# Load the small English spaCy pipeline once at module level; the module-level
# advanced_preprocess_text() reads this `nlp` global.
nlp = en_core_web_sm.load()

In [None]:
def expand_contractions(text):
    """
    Rewrite contracted forms in `text` into their expanded spelling.

    The work is delegated entirely to `contractions.fix`.

    Args:
        text (str): Raw input text that may contain contractions.

    Returns:
        str: The same text with contractions expanded.
    """
    expanded = contractions.fix(text)
    return expanded

def advanced_preprocess_text(text):
    """
    Clean a single piece of text for the downstream tokenizer.

    Steps, in order: expand contractions, strip user mentions, strip URLs
    (including the literal placeholder ``URL``), drop ``#`` characters,
    lower-case, then run the module-level spaCy pipeline and keep the lemma
    of every alphabetic, non-stopword token.

    Args:
        text (str): The text to clean.

    Returns:
        str: Space-joined lemmas of the surviving tokens.
    """
    cleaned = expand_contractions(text)

    # Patterns are removed one after another: mentions, URLs, hash signs.
    for pattern in (r'@\w+', r'http\S+|www\S+|URL', r'#'):
        cleaned = re.sub(pattern, '', cleaned)

    lemmas = []
    for token in nlp(cleaned.lower()):
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def load_and_preprocess_data(file_path):
    """
    Read a CSV file and attach a cleaned version of its ``text`` column.

    Args:
        file_path (str): Location of the CSV file; it must contain a
            ``text`` column.

    Returns:
        pd.DataFrame: The loaded frame with an extra ``clean_text`` column
        produced by `advanced_preprocess_text`.
    """
    frame = pd.read_csv(file_path)
    cleaned = frame['text'].apply(advanced_preprocess_text)
    frame['clean_text'] = cleaned
    return frame

def prepare_data_for_cnn(data, tokenizer, label_encoder):
    """
    Turn a preprocessed frame into model-ready arrays.

    The ``clean_text`` column is converted to integer sequences and padded
    to the module-level ``max_len``; the ``labels`` column is passed through
    the supplied (already fitted) label encoder.

    Args:
        data (pd.DataFrame): Frame with ``clean_text`` and ``labels`` columns.
        tokenizer (Tokenizer): Fitted tokenizer used to index the text.
        label_encoder (LabelEncoder): Fitted encoder applied to the labels.

    Returns:
        tuple: ``(X, y)`` — padded sequences and encoded labels.
    """
    token_id_lists = tokenizer.texts_to_sequences(data['clean_text'])
    padded = pad_sequences(token_id_lists, maxlen=max_len)
    encoded_labels = label_encoder.transform(data['labels'])
    return padded, encoded_labels

def load_glove_embeddings(tokenizer, embedding_dim=100, glove_dir='.', max_words=None):
    """
    Loads pre-trained GloVe (6B) embeddings and creates an embedding matrix.

    The vectors are read from ``glove.6B.<embedding_dim>d.txt`` inside
    ``glove_dir``. If that file is missing, ``glove.6B.zip`` is downloaded
    (only when not already present) and extracted into ``glove_dir``; when the
    file already exists nothing is downloaded.

    Args:
        tokenizer (Tokenizer): Tokenizer with a ``word_index`` mapping.
        embedding_dim (int): Dimensionality of the embedding vectors; selects
            both the GloVe file and the number of coefficients parsed per line.
        glove_dir (str): Directory holding (or receiving) the GloVe files.
        max_words (int | None): Rows with index >= max_words are left as zeros.
            When None, ``tokenizer.num_words`` is used if available; if that is
            also None, every word in the index is considered.

    Returns:
        np.ndarray: Matrix of shape ``(len(tokenizer.word_index) + 1,
        embedding_dim)``; row ``i`` holds the GloVe vector of the word with
        index ``i``, or zeros when the word is unknown or beyond the cap.

    Raises:
        FileNotFoundError: If the requested GloVe file is still absent after
            extraction (e.g. a dimension the archive does not ship).
    """
    # Function-scope imports keep this block self-contained.
    import os
    import urllib.request
    import zipfile

    glove_file = os.path.join(glove_dir, f'glove.6B.{embedding_dim}d.txt')
    if not os.path.exists(glove_file):
        os.makedirs(glove_dir, exist_ok=True)
        archive = os.path.join(glove_dir, 'glove.6B.zip')
        # Reuse an existing archive instead of re-downloading on every call.
        if not os.path.exists(archive):
            urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', archive)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(glove_dir)

    if max_words is None:
        max_words = getattr(tokenizer, 'num_words', None)

    embeddings_index = {}
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            # Split from the right so that a word containing spaces stays intact
            # and exactly `embedding_dim` coefficients are taken.
            parts = line.rstrip().rsplit(' ', embedding_dim)
            if len(parts) != embedding_dim + 1:
                continue  # malformed or blank line
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if max_words is not None and i >= max_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_multi_channel_cnn_model(vocab_size, embedding_matrix, embedding_dim=100):
    """
    Build and compile the multi-channel text CNN.

    A frozen embedding layer (initialised from `embedding_matrix`) feeds three
    parallel Conv1D branches with kernel sizes 3, 4 and 5; each branch is
    global-max-pooled, the results are concatenated, passed through dropout
    and a single sigmoid unit. The input length is the module-level
    ``max_len``.

    Args:
        vocab_size (int): Number of rows in the embedding layer.
        embedding_matrix (np.ndarray): Pre-trained weights for the embedding.
        embedding_dim (int): Width of each embedding vector.

    Returns:
        Model: The model compiled with Adam, binary cross-entropy and accuracy.
    """
    token_ids = Input(shape=(max_len,))
    embed_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                            input_length=max_len, trainable=False)
    embedded = embed_layer(token_ids)

    def _branch(width):
        # One convolutional channel: n-gram filter followed by max-over-time.
        features = Conv1D(128, kernel_size=width, activation='relu')(embedded)
        return GlobalMaxPooling1D()(features)

    pooled_branches = [_branch(width) for width in (3, 4, 5)]

    merged = Concatenate()(pooled_branches)
    regularized = Dropout(0.5)(merged)
    prediction = Dense(1, activation='sigmoid')(regularized)

    classifier = Model(inputs=token_ids, outputs=prediction)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

def train_and_evaluate_cnn(X_train, y_train, X_test, y_test, vocab_size, experiment_name):
    """
    Trains and evaluates a multi-channel CNN model for text classification.

    The model is built with `create_multi_channel_cnn_model` using the
    module-level ``embedding_matrix``, trained with balanced class weights,
    early stopping and learning-rate reduction (both monitoring ``val_loss``
    on a 20% validation split), then scored on the test set. Accuracy,
    AUC-ROC and a classification report are printed; nothing is returned.

    Args:
        X_train (np.ndarray): Training data.
        y_train (np.ndarray): Training labels (integer class ids).
        X_test (np.ndarray): Test data.
        y_test (np.ndarray): Test labels.
        vocab_size (int): Vocabulary size for embedding.
        experiment_name (str): Name of the experiment for logging purposes.
    """
    print(f"\nRunning {experiment_name}")

    # Calculate class weights to handle class imbalance.
    # compute_class_weight returns one weight per entry of `classes`, so the
    # dict is keyed by the actual class label rather than by position; the two
    # only coincide when the labels present are exactly 0..n-1.
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

    model = create_multi_channel_cnn_model(vocab_size, embedding_matrix)

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)

    start_time = time.time()
    model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2,
              class_weight=class_weights, callbacks=[early_stopping, reduce_lr], verbose=1)

    total_elapsed_time = time.time() - start_time
    print(f"Training completed in {timedelta(seconds=total_elapsed_time)}")

    # Evaluate on test set: sigmoid outputs are thresholded at 0.5 for the
    # hard predictions, while AUC-ROC uses the raw probabilities.
    y_pred_probs = model.predict(X_test).flatten()
    y_pred = (y_pred_probs > 0.5).astype(int)

    print("\nEvaluating model on test data:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_probs):.4f}")
    print(f"Classification report:\n{classification_report(y_test, y_pred)}")



In [None]:
# Load and preprocess datasets (each CSV must provide a 'text' column, which
# load_and_preprocess_data cleans into 'clean_text').
olid_train = load_and_preprocess_data('olid-train-small.csv')
olid_test = load_and_preprocess_data('olid-test.csv')
hasoc_train = load_and_preprocess_data('hasoc-train.csv')
# The combined frame is only used to fit the tokenizer and label encoder below.
combined_train = pd.concat([olid_train, hasoc_train], ignore_index=True)

# Module-level hyperparameters. `max_len` is read as a global by
# prepare_data_for_cnn and create_multi_channel_cnn_model, so it must be
# defined before those functions are called.
max_words = 20000
max_len = 150

# Fit tokenizer on combined training data so both experiments share one vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(combined_train['clean_text'])

# Use a single LabelEncoder fitted on combined labels so class ids agree
# across OLID and HASOC
le = LabelEncoder()
le.fit(combined_train['labels'])

# Prepare training and test data (padded sequences + encoded labels)
X_olid_train, y_olid_train = prepare_data_for_cnn(olid_train, tokenizer, le)
X_hasoc_train, y_hasoc_train = prepare_data_for_cnn(hasoc_train, tokenizer, le)
X_test_olid, y_test_olid = prepare_data_for_cnn(olid_test, tokenizer, le)

# `embedding_matrix` is read as a global by train_and_evaluate_cnn, so it has
# to exist before the experiments below run.
embedding_dim = 100
embedding_matrix = load_glove_embeddings(tokenizer, embedding_dim)

# In-Domain Experiment (Train on OLIDv1 and Test on OLIDv1)
# vocab_size is the full word index plus one for the reserved padding index 0,
# matching the number of rows in embedding_matrix.
train_and_evaluate_cnn(X_olid_train, y_olid_train, X_test_olid, y_test_olid,
                       len(tokenizer.word_index) + 1, "In-Domain Experiment (Train on OLIDv1 and Test on OLIDv1)")

# Cross-Domain Experiment (Train on HASOC and Test on OLIDv1)
train_and_evaluate_cnn(X_hasoc_train, y_hasoc_train, X_test_olid, y_test_olid,
                       len(tokenizer.word_index) + 1, "Cross-Domain Experiment (Train on HASOC and Test on OLIDv1)")


In [None]:
import re
import numpy as np
import pandas as pd
import contractions
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

class Preprocessor:
    """
    Text preprocessing pipeline for the CNN classifier.

    Cleans raw text with spaCy, derives a handful of surface-level features
    from the raw text, and converts the cleaned text into padded integer
    sequences with a Keras ``Tokenizer`` owned by the instance.
    """

    def __init__(self, max_words=20000, max_len=150):
        """
        Args:
            max_words (int): ``num_words`` cap passed to the Keras Tokenizer.
            max_len (int): Length every sequence is padded/truncated to.
        """
        # stop_words, lemmatizer and tfidf_vectorizer are created here but are
        # not used by any method of this class; they are kept as public
        # attributes for callers that may rely on them.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf_vectorizer = TfidfVectorizer(max_features=5000)
        self.nlp = spacy.load('en_core_web_sm')
        self.max_words = max_words
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=max_words)

    def expand_contractions(self, text):
        """Return `text` with contractions expanded via ``contractions.fix``."""
        return contractions.fix(text)

    def advanced_preprocess_text(self, text):
        """
        Clean one text: expand contractions, remove mentions, URLs (and the
        literal ``URL`` placeholder) and ``#`` characters, lower-case, then
        keep the lemma of each alphabetic non-stopword spaCy token.

        Args:
            text (str): Raw text.

        Returns:
            str: Space-joined lemmas.
        """
        text = self.expand_contractions(text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'http\S+|www\S+|URL', '', text)
        text = re.sub(r'#', '', text)
        doc = self.nlp(text.lower())
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return ' '.join(tokens)

    def add_custom_features(self, data, text_column='text'):
        """
        Add surface-level features computed from the raw text column.

        The columns ``text_length``, ``special_char_count``, ``caps_count``,
        ``avg_word_length`` and ``unique_words_ratio`` are written onto `data`
        in place, and the same frame is returned.

        Args:
            data (pd.DataFrame): Frame containing the raw text.
            text_column (str): Column holding the raw text. Defaults to
                ``'text'``; callers whose text lives in another column
                (e.g. ``'tweet'``) must pass it so the features are computed
                from the right column.

        Returns:
            pd.DataFrame: `data`, with the feature columns added.
        """
        texts = data[text_column]

        def avg_word_length(x):
            words = x.split()
            return np.mean([len(word) for word in words]) if words else 0

        def unique_words_ratio(x):
            words = x.split()
            return len(set(words)) / len(words) if words else 0

        data['text_length'] = texts.apply(len)
        data['special_char_count'] = texts.apply(lambda x: sum(1 for char in x if char in "!?."))
        data['caps_count'] = texts.apply(lambda x: sum(1 for char in x if char.isupper()))
        data['avg_word_length'] = texts.apply(avg_word_length)
        data['unique_words_ratio'] = texts.apply(unique_words_ratio)
        return data

    def preprocess_dataset(self, data, text_column='text', label_column='label', fit_tokenizer=True):
        """
        Preprocesses the entire dataset, including text preprocessing and feature engineering.

        Note that `data` is modified in place: a ``clean_text`` column and the
        custom feature columns are added to it (these are what
        `get_custom_features` later reads).

        Args:
            data (pd.DataFrame): Input DataFrame containing text and label columns.
            text_column (str): Name of the column containing the text data.
            label_column (str): Name of the column containing the labels.
            fit_tokenizer (bool): Whether to fit the tokenizer on the data.
                Pass False for validation/test data so the vocabulary comes
                from the training set only.

        Returns:
            tuple: (X, y) where X is the padded sequence matrix and y is the label vector.

        Raises:
            AssertionError: If `text_column` or `label_column` is missing.
        """
        # Ensure the required columns exist
        assert text_column in data.columns, f"'{text_column}' column not found in the dataset"
        assert label_column in data.columns, f"'{label_column}' column not found in the dataset"

        # Preprocess text
        data['clean_text'] = data[text_column].apply(self.advanced_preprocess_text)

        # Add custom features from the same raw-text column that was cleaned
        data = self.add_custom_features(data, text_column)

        # Tokenize and pad sequences
        if fit_tokenizer:
            self.tokenizer.fit_on_texts(data['clean_text'])

        sequences = self.tokenizer.texts_to_sequences(data['clean_text'])
        X = pad_sequences(sequences, maxlen=self.max_len)

        # Extract labels
        y = data[label_column].values

        return X, y

    def get_custom_features(self, X):
        """
        Return the custom feature columns of `X` as a 2-D array.

        Args:
            X (pd.DataFrame): Frame that already carries the columns written
                by `add_custom_features`.

        Returns:
            np.ndarray: Values of the five feature columns, in a fixed order.
        """
        return X[['text_length', 'special_char_count', 'caps_count', 'avg_word_length', 'unique_words_ratio']].values

    def transform_new_data(self, new_data, text_column='text'):
        """
        Transforms new data using the fitted preprocessor.

        `new_data` is modified in place (``clean_text`` and the custom
        feature columns are added); the tokenizer is not refitted.

        Args:
            new_data (pd.DataFrame): New data to transform, containing a text column.
            text_column (str): Name of the column containing the text data.

        Returns:
            np.array: Padded sequence matrix for the new data.

        Raises:
            AssertionError: If `text_column` is missing.
        """
        assert text_column in new_data.columns, f"'{text_column}' column not found in the dataset"

        # Preprocess text
        new_data['clean_text'] = new_data[text_column].apply(self.advanced_preprocess_text)

        # Add custom features from the same raw-text column that was cleaned
        new_data = self.add_custom_features(new_data, text_column)

        # Tokenize and pad sequences
        sequences = self.tokenizer.texts_to_sequences(new_data['clean_text'])
        X = pad_sequences(sequences, maxlen=self.max_len)

        return X

    def get_vocab_size(self):
        """Return the size of the tokenizer's word index plus one (index 0 is never assigned to a word)."""
        return len(self.tokenizer.word_index) + 1

# Example usage: run the Preprocessor over the three datasets and report shapes.
if __name__ == "__main__":
    # Load your datasets
    # NOTE(review): an earlier cell reads these same CSV files through the
    # columns 'text' and 'labels', whereas the calls below use 'tweet' /
    # 'subtask_a' / 'task_1' — confirm which schema the files actually have.
    olid_train = pd.read_csv('olid-train-small.csv')
    olid_test = pd.read_csv('olid-test.csv')
    hasoc_train = pd.read_csv('hasoc-train.csv')

    # Initialize the preprocessor (defaults: max_words=20000, max_len=150)
    preprocessor = Preprocessor()

    # Preprocess OLID-train-small dataset; this is the only call that fits the
    # tokenizer (fit_tokenizer defaults to True), so the vocabulary comes from
    # OLID-train alone.
    X_olid_train, y_olid_train = preprocessor.preprocess_dataset(olid_train, text_column='tweet', label_column='subtask_a')

    # Preprocess OLID-test dataset with the already-fitted tokenizer
    X_olid_test, y_olid_test = preprocessor.preprocess_dataset(olid_test, text_column='tweet', label_column='subtask_a', fit_tokenizer=False)

    # Preprocess HASOC-train dataset, also reusing the OLID-fitted tokenizer
    X_hasoc_train, y_hasoc_train = preprocessor.preprocess_dataset(hasoc_train, text_column='text', label_column='task_1', fit_tokenizer=False)

    print("OLID-train preprocessed shape:", X_olid_train.shape)
    print("OLID-test preprocessed shape:", X_olid_test.shape)
    print("HASOC-train preprocessed shape:", X_hasoc_train.shape)
    print("Vocabulary size:", preprocessor.get_vocab_size())