In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import gensim.downloader as api
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

from sklearn.pipeline import Pipeline

# 1. Load and Preprocess Data
def load_and_preprocess_data(filepath='your_dataset.csv'):
    """Read a CSV dataset and normalise its ``text`` column.

    Rows whose ``text`` or ``label`` is missing are dropped, then every
    remaining ``text`` value is lower-cased; values that are not strings are
    replaced by the empty string.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The filtered DataFrame with the lower-cased ``text`` column.
    """
    def _lowercase(value):
        # Anything that is not a string becomes an empty document.
        if isinstance(value, str):
            return value.lower()
        return ''

    frame = pd.read_csv(filepath)
    frame = frame.dropna(subset=['text', 'label'])
    frame['text'] = frame['text'].apply(_lowercase)
    return frame

# 2. Word2Vec Embeddings
# Models already loaded in this session, keyed by name, so that embedding the
# train split and then the test split does not load the same vectors twice.
_WORD2VEC_MODEL_CACHE = {}


def get_word2vec_embeddings(texts, model_name='word2vec-google-news-300', vector_size=300, word_model=None):
    """Turn each text into the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of documents. Entries that are not strings, and
            documents with no in-vocabulary word, get an all-zero vector.
        model_name: Name passed to ``api.load`` when ``word_model`` is not
            supplied and the model is not cached yet.
        vector_size: Fallback dimensionality, used only when the model does
            not expose a ``vector_size`` attribute.
        word_model: Optional already-loaded model supporting ``word in model``
            and ``model[list_of_words]``. When given, nothing is downloaded.

    Returns:
        ``np.ndarray`` of shape ``(number of texts, dim)``, or ``None`` when
        loading the model raises ``ValueError``.
    """
    if word_model is None:
        word_model = _WORD2VEC_MODEL_CACHE.get(model_name)
    if word_model is None:
        try:
            word_model = api.load(model_name)
        except ValueError as e:
            print(f"Error loading model: {e}. Please check the model name.")
            return None
        _WORD2VEC_MODEL_CACHE[model_name] = word_model

    # Prefer the model's own dimensionality so zero vectors always line up
    # with real embeddings, even if the caller's vector_size is wrong.
    dim = getattr(word_model, 'vector_size', vector_size)

    def embed_text(text):
        if not isinstance(text, str):
            return np.zeros(dim)
        words = [word for word in text.split() if word in word_model]
        return np.mean(word_model[words], axis=0) if words else np.zeros(dim)

    rows = [embed_text(text) for text in texts]
    if not rows:
        # Keep the result 2-D so it can still be concatenated along axis=1.
        return np.zeros((0, dim))
    return np.array(rows)

# 3. LIWC Features (Placeholder)
def extract_liwc_features(texts):
    """Stand-in for real LIWC feature extraction.

    Produces three crude surface counts per document; swap in a genuine LIWC
    implementation when one is available.

    Args:
        texts: Iterable of documents.

    Returns:
        ``np.ndarray`` with one row per document holding
        ``[character count, occurrences of 'i', occurrences of 'sad']``.
        Entries that are not strings yield ``[0, 0, 0]``.
    """
    rows = []
    for document in texts:
        if not isinstance(document, str):
            rows.append([0, 0, 0])
            continue
        # str.count matches substrings, so 'i' counts every letter "i".
        rows.append([len(document), document.count('i'), document.count('sad')])
    return np.array(rows)

# 4. Feature Selection
def select_features_by_correlation(X, y, threshold=0.4):
    """Pick feature columns whose Pearson correlation with ``y`` is strong enough.

    Args:
        X: 2-D array-like, one row per sample and one column per feature.
        y: 1-D array-like of targets, one per row of ``X`` (must be numeric
            for ``pearsonr``).
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        List of selected column indices in ascending order. When no column
        reaches the threshold, every NaN-free column is returned instead.
        Columns containing NaN are never returned, so the list is empty if
        every column contains NaN.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    usable_indices = []    # columns without NaN, i.e. the ones actually scored
    selected_indices = []  # usable columns that also pass the threshold
    for i in range(X.shape[1]):
        column = X[:, i]
        if np.isnan(column).any():
            print(f"Warning: Feature {i} contains NaN values. Skipping.")
            continue
        usable_indices.append(i)
        corr, _ = pearsonr(column, y)
        # A NaN correlation (e.g. from a constant column) never qualifies.
        if not np.isnan(corr) and abs(corr) >= threshold:
            selected_indices.append(i)

    # Fall back to the scored columns only: the columns reported as skipped
    # above must not re-enter through the fallback.
    return selected_indices if selected_indices else usable_indices

# 5. Model Training and Evaluation
def train_and_evaluate_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a standardised logistic-regression classifier and report test metrics.

    Prints the classification report, accuracy and weighted F1 score computed
    on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        The fitted ``Pipeline`` (scaler followed by the classifier).
    """
    classifier = LogisticRegression(
        solver='liblinear',
        random_state=42,
        class_weight='balanced',
    )
    model = Pipeline(steps=[('scaler', StandardScaler()), ('logreg', classifier)])
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions)
    print(report)
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print("F1-Score:", weighted_f1)
    return model

# Main Execution
if __name__ == '__main__':
    # Load and preprocess data.
    # NOTE: the recorded run of this cell stopped with FileNotFoundError for
    # this path; point it at a CSV that exists before running.
    data = load_and_preprocess_data('dreaddit.csv')  # Replace with your dataset path
    X = data['text']
    y = data['label']  # Replace with your label column name

    # Hold out 10% of the rows for testing, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

    # Feature extraction
    X_train_word2vec = get_word2vec_embeddings(X_train)
    X_test_word2vec = get_word2vec_embeddings(X_test)
    # get_word2vec_embeddings returns None when the model cannot be loaded;
    # stop with a clear message instead of failing inside np.concatenate.
    if X_train_word2vec is None or X_test_word2vec is None:
        raise RuntimeError("Word2Vec embeddings are unavailable: the embedding model failed to load.")
    X_train_liwc = extract_liwc_features(X_train)
    X_test_liwc = extract_liwc_features(X_test)

    # Combine features: embedding columns first, then the LIWC placeholder columns.
    X_train_combined = np.concatenate((X_train_word2vec, X_train_liwc), axis=1)
    X_test_combined = np.concatenate((X_test_word2vec, X_test_liwc), axis=1)

    # Feature selection is decided on the training split only and the same
    # column indices are then applied to the test split.
    selected_features = select_features_by_correlation(X_train_combined, y_train)
    X_train_selected = X_train_combined[:, selected_features]
    X_test_selected = X_test_combined[:, selected_features]

    # Train and evaluate Logistic Regression
    print("Evaluation with feature selection:")
    train_and_evaluate_logistic_regression(X_train_selected, y_train, X_test_selected, y_test)

FileNotFoundError: [Errno 2] No such file or directory: 'dreaddit.csv'

In [7]:
import pandas as pd

# Probe: can the training CSV be read when decoded as latin1?
train_path, train_encoding = 'dreaddit-train.csv', 'latin1'
try:
    train_data = pd.read_csv(train_path, encoding=train_encoding)
    print("File loaded successfully with latin1 encoding.")
except Exception as load_error:
    # Report the failure instead of raising; train_data stays undefined then.
    print(f"Error: {load_error}")


File loaded successfully with latin1 encoding.


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import gensim.downloader as api
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
from sklearn.pipeline import Pipeline

# 1. Load Data from Separate Files
def load_data(train_filepath='dreaddit-train.csv', test_filepath='dreaddit-test.csv'):
    """Load the train and test CSV files and apply basic text preprocessing.

    Both files are decoded as latin1. Rows with a missing ``text`` or
    ``label`` are dropped and the ``text`` column is lower-cased; values that
    are not strings become the empty string.

    Args:
        train_filepath: Path of the training CSV.
        test_filepath: Path of the test CSV.

    Returns:
        Tuple ``(train_data, test_data)`` of preprocessed DataFrames.
    """
    def _prepare(frame):
        # Shared preprocessing so both splits are treated identically.
        frame = frame.dropna(subset=['text', 'label'])
        lowered = frame['text'].apply(lambda x: x.lower() if isinstance(x, str) else '')
        # assign() builds a new frame instead of writing into the dropna result.
        return frame.assign(text=lowered)

    train_data = pd.read_csv(train_filepath, encoding='latin1')
    test_data = pd.read_csv(test_filepath, encoding='latin1')
    return _prepare(train_data), _prepare(test_data)

# 2. Word2Vec Embeddings
# Models already loaded in this session, keyed by name, so that embedding the
# train split and then the test split does not load the same vectors twice.
_WORD2VEC_MODEL_CACHE = {}


def get_word2vec_embeddings(texts, model_name='word2vec-google-news-300', vector_size=300, word_model=None):
    """Turn each text into the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of documents. Entries that are not strings, and
            documents with no in-vocabulary word, get an all-zero vector.
        model_name: Name passed to ``api.load`` when ``word_model`` is not
            supplied and the model is not cached yet.
        vector_size: Fallback dimensionality, used only when the model does
            not expose a ``vector_size`` attribute.
        word_model: Optional already-loaded model supporting ``word in model``
            and ``model[list_of_words]``. When given, nothing is downloaded.

    Returns:
        ``np.ndarray`` of shape ``(number of texts, dim)``, or ``None`` when
        loading the model raises ``ValueError``.
    """
    if word_model is None:
        word_model = _WORD2VEC_MODEL_CACHE.get(model_name)
    if word_model is None:
        try:
            word_model = api.load(model_name)
        except ValueError as e:
            print(f"Error loading model: {e}. Please check the model name.")
            return None
        _WORD2VEC_MODEL_CACHE[model_name] = word_model

    # Prefer the model's own dimensionality so zero vectors always line up
    # with real embeddings, even if the caller's vector_size is wrong.
    dim = getattr(word_model, 'vector_size', vector_size)

    def embed_text(text):
        if not isinstance(text, str):
            return np.zeros(dim)
        words = [word for word in text.split() if word in word_model]
        return np.mean(word_model[words], axis=0) if words else np.zeros(dim)

    rows = [embed_text(text) for text in texts]
    if not rows:
        # Keep the result 2-D so it can still be concatenated along axis=1.
        return np.zeros((0, dim))
    return np.array(rows)

# 3. LIWC Features (Placeholder)
def extract_liwc_features(texts):
    """Stand-in for real LIWC feature extraction.

    Produces three crude surface counts per document; swap in a genuine LIWC
    implementation when one is available.

    Args:
        texts: Iterable of documents.

    Returns:
        ``np.ndarray`` with one row per document holding
        ``[character count, occurrences of 'i', occurrences of 'sad']``.
        Entries that are not strings yield ``[0, 0, 0]``.
    """
    rows = []
    for document in texts:
        if not isinstance(document, str):
            rows.append([0, 0, 0])
            continue
        # str.count matches substrings, so 'i' counts every letter "i".
        rows.append([len(document), document.count('i'), document.count('sad')])
    return np.array(rows)

# 4. Feature Selection
def select_features_by_correlation(X, y, threshold=0.4):
    """Pick feature columns whose Pearson correlation with ``y`` is strong enough.

    Args:
        X: 2-D array-like, one row per sample and one column per feature.
        y: 1-D array-like of targets, one per row of ``X`` (must be numeric
            for ``pearsonr``).
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        List of selected column indices in ascending order. When no column
        reaches the threshold, every NaN-free column is returned instead.
        Columns containing NaN are never returned, so the list is empty if
        every column contains NaN.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    usable_indices = []    # columns without NaN, i.e. the ones actually scored
    selected_indices = []  # usable columns that also pass the threshold
    for i in range(X.shape[1]):
        column = X[:, i]
        if np.isnan(column).any():
            print(f"Warning: Feature {i} contains NaN values. Skipping.")
            continue
        usable_indices.append(i)
        corr, _ = pearsonr(column, y)
        # A NaN correlation (e.g. from a constant column) never qualifies.
        if not np.isnan(corr) and abs(corr) >= threshold:
            selected_indices.append(i)

    # Fall back to the scored columns only: the columns reported as skipped
    # above must not re-enter through the fallback.
    return selected_indices if selected_indices else usable_indices

# 5. Model Training and Evaluation
def train_and_evaluate_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a standardised logistic-regression classifier and report test metrics.

    Prints the classification report, accuracy and weighted F1 score computed
    on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        The fitted ``Pipeline`` (scaler followed by the classifier).
    """
    classifier = LogisticRegression(
        solver='liblinear',
        random_state=42,
        class_weight='balanced',
    )
    model = Pipeline(steps=[('scaler', StandardScaler()), ('logreg', classifier)])
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions)
    print(report)
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print("F1-Score:", weighted_f1)
    return model

# Main Execution
if __name__ == '__main__':
    # Load the predefined train/test splits.
    train_data, test_data = load_data('dreaddit-train.csv', 'dreaddit-test.csv')  # Specify filepaths
    X_train = train_data['text']
    y_train = train_data['label']
    X_test = test_data['text']
    y_test = test_data['label']

    # Feature extraction
    X_train_word2vec = get_word2vec_embeddings(X_train)
    X_test_word2vec = get_word2vec_embeddings(X_test)
    # get_word2vec_embeddings returns None when the model cannot be loaded;
    # stop with a clear message instead of failing inside np.concatenate.
    if X_train_word2vec is None or X_test_word2vec is None:
        raise RuntimeError("Word2Vec embeddings are unavailable: the embedding model failed to load.")
    X_train_liwc = extract_liwc_features(X_train)
    X_test_liwc = extract_liwc_features(X_test)

    # Combine features: embedding columns first, then the LIWC placeholder columns.
    X_train_combined = np.concatenate((X_train_word2vec, X_train_liwc), axis=1)
    X_test_combined = np.concatenate((X_test_word2vec, X_test_liwc), axis=1)

    # Feature selection is decided on the training split only and the same
    # column indices are then applied to the test split.
    selected_features = select_features_by_correlation(X_train_combined, y_train)
    X_train_selected = X_train_combined[:, selected_features]
    X_test_selected = X_test_combined[:, selected_features]

    # Train and evaluate Logistic Regression
    print("Evaluation with feature selection:")
    train_and_evaluate_logistic_regression(X_train_selected, y_train, X_test_selected, y_test)

[=====---------------------------------------------] 11.7% 193.8/1662.8MB downloaded

KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import gensim.downloader as api
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif # Using f_classif for feature selection
from scipy.stats import pearsonr
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this cell relies on.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# 1. Load Data from Separate Files
def load_data(train_filepath='dreaddit-train.csv', test_filepath='dreaddit-test.csv'):
    """Read the train and test CSV files and apply basic text preprocessing.

    Both files are decoded as latin1. Rows with a missing ``text`` or
    ``label`` are dropped and the ``text`` column is lower-cased; values that
    are not strings become the empty string.

    Args:
        train_filepath: Path of the training CSV.
        test_filepath: Path of the test CSV.

    Returns:
        Tuple ``(train_data, test_data)`` of preprocessed DataFrames.
    """
    def _lowercase(value):
        if isinstance(value, str):
            return value.lower()
        return ''

    # Read both files up front, then preprocess each split the same way.
    raw_frames = (
        pd.read_csv(train_filepath, encoding='latin1'),
        pd.read_csv(test_filepath, encoding='latin1'),
    )

    prepared = []
    for frame in raw_frames:
        frame = frame.dropna(subset=['text', 'label'])
        frame['text'] = frame['text'].apply(_lowercase)
        prepared.append(frame)

    train_frame, test_frame = prepared
    return train_frame, test_frame

# 2. Text Cleaning and Preprocessing
# Default stop-word set and lemmatizer, built on first use and then reused so
# that applying clean_text row by row does not rebuild them for every document.
_CLEAN_TEXT_DEFAULTS = {}


def clean_text(text, stop_words=None, lemmatizer=None):
    """Strip noise from a document, drop stop words and lemmatize the rest.

    Steps, in order: remove ``http...`` URLs, remove every character that is
    not an ASCII letter or whitespace (this also removes digits), collapse
    runs of whitespace, drop stop words, lemmatize each remaining token.

    Args:
        text: Document to clean. Anything that is not a string yields ``""``.
        stop_words: Optional container of words to drop. Defaults to NLTK's
            English stop-word list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method.
            Defaults to a ``WordNetLemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    if stop_words is None:
        if 'stop_words' not in _CLEAN_TEXT_DEFAULTS:
            _CLEAN_TEXT_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _CLEAN_TEXT_DEFAULTS['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _CLEAN_TEXT_DEFAULTS:
            _CLEAN_TEXT_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _CLEAN_TEXT_DEFAULTS['lemmatizer']

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only ASCII letters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace

    return ' '.join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

# 3. Word2Vec Embeddings
# Models already loaded in this session, keyed by name, so that embedding the
# train split and then the test split does not load the same vectors twice.
_WORD2VEC_MODEL_CACHE = {}


def get_word2vec_embeddings(texts, model_name='word2vec-google-news-300', vector_size=300, word_model=None):
    """Turn each text into the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of documents. Entries that are not strings, and
            documents with no in-vocabulary word, get an all-zero vector.
        model_name: Name passed to ``api.load`` when ``word_model`` is not
            supplied and the model is not cached yet.
        vector_size: Fallback dimensionality, used only when the model does
            not expose a ``vector_size`` attribute.
        word_model: Optional already-loaded model supporting ``word in model``
            and ``model[list_of_words]``. When given, nothing is downloaded.

    Returns:
        ``np.ndarray`` of shape ``(number of texts, dim)``, or ``None`` when
        loading the model raises ``ValueError``.
    """
    if word_model is None:
        word_model = _WORD2VEC_MODEL_CACHE.get(model_name)
    if word_model is None:
        try:
            word_model = api.load(model_name)
        except ValueError as e:
            print(f"Error loading model: {e}. Please check the model name.")
            return None
        _WORD2VEC_MODEL_CACHE[model_name] = word_model

    # Prefer the model's own dimensionality so zero vectors always line up
    # with real embeddings, even if the caller's vector_size is wrong.
    dim = getattr(word_model, 'vector_size', vector_size)

    def embed_text(text):
        if not isinstance(text, str):
            return np.zeros(dim)
        words = [word for word in text.split() if word in word_model]
        return np.mean(word_model[words], axis=0) if words else np.zeros(dim)

    rows = [embed_text(text) for text in texts]
    if not rows:
        # Keep the result 2-D so it can still be concatenated along axis=1.
        return np.zeros((0, dim))
    return np.array(rows)

# 4. LIWC Features (Placeholder)
def extract_liwc_features(texts):
    """Stand-in for real LIWC feature extraction.

    Produces three crude surface counts per document; swap in a genuine LIWC
    implementation when one is available.

    Args:
        texts: Iterable of documents.

    Returns:
        ``np.ndarray`` with one row per document holding
        ``[character count, occurrences of 'i', occurrences of 'sad']``.
        Entries that are not strings yield ``[0, 0, 0]``.
    """
    rows = []
    for document in texts:
        if not isinstance(document, str):
            rows.append([0, 0, 0])
            continue
        # str.count matches substrings, so 'i' counts every letter "i".
        rows.append([len(document), document.count('i'), document.count('sad')])
    return np.array(rows)

# 5. Feature Selection
def select_features_by_correlation(X, y, threshold=0.4):
    """Pick feature columns whose Pearson correlation with ``y`` is strong enough.

    Args:
        X: 2-D array-like, one row per sample and one column per feature.
        y: 1-D array-like of targets, one per row of ``X`` (must be numeric
            for ``pearsonr``).
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        List of selected column indices in ascending order. When no column
        reaches the threshold, every NaN-free column is returned instead.
        Columns containing NaN are never returned, so the list is empty if
        every column contains NaN.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    usable_indices = []    # columns without NaN, i.e. the ones actually scored
    selected_indices = []  # usable columns that also pass the threshold
    for i in range(X.shape[1]):
        column = X[:, i]
        if np.isnan(column).any():
            print(f"Warning: Feature {i} contains NaN values. Skipping.")
            continue
        usable_indices.append(i)
        corr, _ = pearsonr(column, y)
        # A NaN correlation (e.g. from a constant column) never qualifies.
        if not np.isnan(corr) and abs(corr) >= threshold:
            selected_indices.append(i)

    # Fall back to the scored columns only: the columns reported as skipped
    # above must not re-enter through the fallback.
    return selected_indices if selected_indices else usable_indices

# 6. Model Training and Evaluation
def train_and_evaluate_logistic_regression(X_train, y_train, X_test, y_test, feature_selection='correlation'):
    """Tune, fit and evaluate a logistic-regression classifier.

    A scaler + logistic-regression pipeline is tuned with a 5-fold grid search
    scored by weighted F1, then the best model is evaluated on the test split.
    The best hyperparameters, classification report, accuracy and weighted F1
    are printed.

    Args:
        X_train: Training feature matrix (2-D NumPy array).
        y_train: Training labels.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_test: Test labels.
        feature_selection: ``'correlation'`` filters columns with
            ``select_features_by_correlation`` before the search;
            ``'f_classif'`` adds a ``SelectKBest`` step whose ``k`` is tuned.
            Any other value trains on all columns without selection.

    Returns:
        The best estimator found by the grid search.
    """
    pipeline_steps = [
        ('scaler', StandardScaler())
    ]

    if feature_selection == 'correlation':
        # NOTE(review): this selection uses the whole training set before the
        # grid search splits it, so the cross-validation folds are scored on
        # columns that were chosen with their labels already seen.
        selected_features = select_features_by_correlation(X_train, y_train)
        X_train = X_train[:, selected_features]
        X_test = X_test[:, selected_features]
    elif feature_selection == 'f_classif':
        pipeline_steps.append(('select_k_best', SelectKBest(score_func=f_classif, k='all')))  # k is tuned below
    pipeline_steps.append(('logreg', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')))

    pipeline = Pipeline(pipeline_steps)

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'logreg__C': [0.001, 0.01, 0.1, 1, 10],
        'logreg__penalty': ['l1', 'l2'],
        'logreg__solver': ['liblinear']
    }
    if feature_selection == 'f_classif':
        # Only offer k values the data can satisfy; a k larger than the number
        # of columns is not a meaningful candidate.
        n_features = X_train.shape[1]
        param_grid['select_k_best__k'] = ['all'] + [k for k in (10, 20, 30) if k <= n_features]

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters:", grid_search.best_params_)
    y_pred = grid_search.predict(X_test)

    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))
    return grid_search.best_estimator_

# Main Execution
if __name__ == '__main__':
    # Load the predefined train/test splits.
    train_data, test_data = load_data('dreaddit-train.csv', 'dreaddit-test.csv')  # Specify filepaths
    X_train_raw = train_data['text']
    y_train = train_data['label']
    X_test_raw = test_data['text']
    y_test = test_data['label']

    # Clean text
    X_train_cleaned = X_train_raw.apply(clean_text)
    X_test_cleaned = X_test_raw.apply(clean_text)

    # Feature extraction
    X_train_word2vec = get_word2vec_embeddings(X_train_cleaned)
    X_test_word2vec = get_word2vec_embeddings(X_test_cleaned)
    # get_word2vec_embeddings returns None when the model cannot be loaded;
    # stop with a clear message instead of failing inside np.concatenate.
    if X_train_word2vec is None or X_test_word2vec is None:
        raise RuntimeError("Word2Vec embeddings are unavailable: the embedding model failed to load.")
    X_train_liwc = extract_liwc_features(X_train_cleaned)
    X_test_liwc = extract_liwc_features(X_test_cleaned)

    # Combine features: embedding columns first, then the LIWC placeholder columns.
    X_train_combined = np.concatenate((X_train_word2vec, X_train_liwc), axis=1)
    X_test_combined = np.concatenate((X_test_word2vec, X_test_liwc), axis=1)

    # Each evaluation gets its own copies of the feature matrices and labels.

    # Train and evaluate Logistic Regression with Correlation-based Feature Selection
    print("Evaluation with correlation-based feature selection:")
    train_and_evaluate_logistic_regression(X_train_combined.copy(), y_train.copy(), X_test_combined.copy(), y_test.copy(), feature_selection='correlation')

    # Train and evaluate Logistic Regression with f_classif Feature Selection
    print("\nEvaluation with f_classif feature selection:")
    train_and_evaluate_logistic_regression(X_train_combined.copy(), y_train.copy(), X_test_combined.copy(), y_test.copy(), feature_selection='f_classif')