In [8]:
!pip install textblob
!pip install -U scikit-learn
!pip install nrclex
from google.colab import drive

import sys
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer # Not used in this version
from nrclex import NRCLex
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon') # Not needed for TextBlob sentiment

drive.mount('/content/drive',force_remount=True)
# Add the 'libs' directory to the system path
drive_libs_path = '/content/drive/MyDrive/libs'
sys.path.append(drive_libs_path)

from usage_examples import get_acceptance_indexes, CombinedAcceptanceIndexer
from noise_filter import NoiseFilter

def mount_drive():
  drive.mount('/content/drive', force_remount=True)

def load_and_filter_data(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    df.rename(columns={'comments_orig': 'comments'}, inplace=True)

    # Standard preprocessing\n",
    df['clean_title'] = df['clean_title'].astype(str).str.lower()
    df['comments'] = df['comments'].astype(str)
    df['separated_comment'] = ''

    return df

def get_sentiment(text,method='textblob'):
    if method == 'textblob':
        blob = TextBlob(text)
        return blob.sentiment.polarity, blob.sentiment.subjectivity
    elif method == 'vader':
        analyzer = SentimentIntensityAnalyzer()
        scores = analyzer.polarity_scores(text)
        return scores['compound'], scores['compound']
    elif method == 'nrc':
        emotion_analyzer = NRCLex(text)
        sentiment_scores = emotion_analyzer.affect_frequencies
        polarity = sentiment_scores.get('positive', 0) - sentiment_scores.get('negative', 0)
        subjectivity = sum(sentiment_scores.values()) - sentiment_scores.get('anticipation', 0) - sentiment_scores.get('trust', 0)
        return polarity, subjectivity
    else:
        raise ValueError(f"Invalid method: {method}. Choose from 'textblob', 'vader', or 'nrc'.")
def process_comments(row, method='textblob'):
    try:
        comment = row['comments']  # Get the 'comments' string directly
    except AttributeError:
        return 0, 0  # Handle cases where 'comments' column is missing or invalid

    # Call get_sentiment with the specified method
    polarity, subjectivity = get_sentiment(str(comment), method=method)

    return polarity, subjectivity

def enhanced_preprocess_data(df,method='textblob'):
    df['separated_comment'] = df['comments'].apply(lambda x: x.split('|__|'))
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df.apply(lambda row: process_comments(row, method=method), axis=1))
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(lambda title: get_sentiment(title, method=method)))
    return df

def prepare_features_and_labels(df,method='textblob',base_feature=False):
    base_features = ['clean_title', 'score', 'separated_comment', 'num_comments', 'upvote_ratio','comments_polarity', 'comments_subjectivity', 'clean_title_polarity',
            'clean_title_subjectivity']
    base_numerical_features = ['score', 'num_comments', 'upvote_ratio','comments_polarity', 'comments_subjectivity', 'clean_title_polarity',
            'clean_title_subjectivity']
    if base_feature:  # If base_feature is True, select only base features
        features = base_features
        numerical_features = base_numerical_features
    else:  # If base_feature is False, select base features + method-specific feature
        if method == 'textblob':
            features = base_features + ['TextBlobAcceptanceIndexer']
            numerical_features = base_numerical_features + ['TextBlobAcceptanceIndexer']
        elif method == 'nrc':
            features = base_features + ['NRCAcceptanceIndexer']
            numerical_features = base_numerical_features + ['NRCAcceptanceIndexer']
        elif method == 'vader':
            features = base_features + ['VADERAcceptanceIndexer']
            numerical_features = base_numerical_features + ['VADERAcceptanceIndexer']
        else:
            raise ValueError(f"Invalid method: {method}. Choose from 'textblob', 'vader', or 'nrc'.")

    X = df[features]

    X[numerical_features] = X[numerical_features].fillna(0)

    y = df['2_way_label'].astype(str)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Ensure numerical features are of numeric type before clipping:
    X_train[numerical_features] = X_train[numerical_features].apply(pd.to_numeric, errors='coerce').fillna(0)
    X_test[numerical_features] = X_test[numerical_features].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Now that X_train and X_test are defined, apply the clipping:
    X_train[numerical_features] = X_train[numerical_features].clip(lower=0)  # Clip values to be >= 0
    X_test[numerical_features] = X_test[numerical_features].clip(lower=0)  # Clip values to be >= 0


    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)

    # Combine text data from 'clean_title' and 'separated_comment'
    X_train['combined_text'] = X_train['clean_title'] + ' ' + X_train['separated_comment'].astype(str)
    X_test['combined_text'] = X_test['clean_title'] + ' ' + X_test['separated_comment'].astype(str)

    #Apply fit_transform to the combined text data
    X_train_text = tfidf_vectorizer.fit_transform(X_train['combined_text'])
    X_test_text = tfidf_vectorizer.transform(X_test['combined_text'])

    # Combine features

    X_train = np.concatenate([X_train_text.toarray(), X_train[numerical_features].values], axis=1)
    X_test = np.concatenate([X_test_text.toarray(), X_test[numerical_features].values], axis=1)


    return (X_train, X_test, y_train, y_test), label_encoder, tfidf_vectorizer

def enhanced_predict_new_comments(model, label_encoder, tfidf_vectorizer, comments,title,method='textblob', base_feature=False):
    # Define base features
    base_features_df = pd.DataFrame({'clean_title': title * len(comments),
                                       'score': [0] * len(comments),
                                       'num_comments': [0] * len(comments),
                                       'upvote_ratio': [0] * len(comments),
                                       'separated_comment': [''] * len(comments),
                                       'comments_polarity': [0] * len(comments),  # Add comments_polarity
                                       'comments_subjectivity': [0] * len(comments),  # Add comments_subjectivity
                                       'clean_title_polarity': [0] * len(comments),  # Add clean_title_polarity
                                       'clean_title_subjectivity': [0] * len(comments)})

    # Add method-specific features if base_feature is False
    if not base_feature:
        if method == 'textblob':
            base_features_df['TextBlobAcceptanceIndexer'] = [0] * len(comments)
        elif method == 'vader':
            base_features_df['VADERAcceptanceIndexer'] = [0] * len(comments)
        elif method == 'nrc':
            base_features_df['NRCAcceptanceIndexer'] = [0] * len(comments)
        else:
            raise ValueError(f"Invalid method: {method}. Choose from 'textblob', 'vader', or 'nrc'.")


    # Apply TF-IDF and One-Hot Encoding
    comments_text_transformed = tfidf_vectorizer.transform(base_features_df['clean_title'])

    # Select the appropriate indexer column based on method
    if method == 'textblob':
        indexer_column = 'TextBlobAcceptanceIndexer'
    elif method == 'vader':
        indexer_column = 'VADERAcceptanceIndexer'
    elif method == 'nrc':
        indexer_column = 'NRCAcceptanceIndexer'
    else:
        raise ValueError(f"Invalid method: {method}. Choose from 'textblob', 'vader', or 'nrc'.")

    # Combine features, using the selected indexer column
    if not base_feature:
        comments_transformed = np.concatenate([comments_text_transformed.toarray(),
                                              base_features_df[['score', 'num_comments', 'upvote_ratio','comments_polarity', 'comments_subjectivity', 'clean_title_polarity',
                                                                'clean_title_subjectivity',indexer_column]].values], axis=1)
    else:  # If base_feature is True, exclude the indexer column
        comments_transformed = np.concatenate([comments_text_transformed.toarray(),
                                              base_features_df[['score', 'num_comments', 'upvote_ratio','comments_polarity', 'comments_subjectivity', 'clean_title_polarity',
                                                                'clean_title_subjectivity']].values], axis=1)

    # Make predictions
    predictions = model.predict(comments_transformed)

    results = []
    for comment, pred in zip(comments, predictions):
        blob = TextBlob(comment)
        results.append({
            'comment': comment,
            'prediction': label_encoder.inverse_transform([pred])[0]
        })

    return results

def build_pipeline():
  model = Pipeline([
      ('classifier', MultinomialNB())
  ])
  return model

def evaluate_model(model, X_test, y_test, label_encoder):
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
  print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
  print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")



def main():
    # Mount drive and load data
    mount_drive()
    file_path = "/content/drive/My Drive/datasets/data_with_indexers_applied_to_original_data_31_03_2025.csv"
    df = load_and_filter_data(file_path)

    # Enhanced preprocessing
    processed_df = enhanced_preprocess_data(df,method='nrc')

    # Prepare features and train model
    (X_train, X_test, y_train, y_test), label_encoder, tfidf_vectorizer = prepare_features_and_labels(processed_df,method='nrc',base_feature=False)
    model = build_pipeline()
    model.fit(X_train, y_train)

    # Evaluate
    evaluate_model(model, X_test, y_test, label_encoder)

    # Predict on new comments with sentiment analysis
    new_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    new_comments = [
        "This is outrageous! There's no way this can be true!",
        "I can't believe people are falling for this. It's just a hoax.",
        "Great news! This is exactly what we needed.",
        "Interesting perspective, I hadn't considered that before."
    ]

    predictions = enhanced_predict_new_comments(model, label_encoder, tfidf_vectorizer, new_comments,new_title,method='nrc', base_feature=False)

    print("\nEnhanced Predictions with Sentiment Analysis:")
    for result in predictions:
        print(f"\nComment: {result['comment']}")
        print(f"Prediction: {result['prediction']}")

if __name__ == "__main__":
    main()



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Mounted at /content/drive
Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = X[numerical_features].fillna(0)


              precision    recall  f1-score   support

           0       0.28      0.94      0.43      3317
           1       0.93      0.26      0.40     10842

    accuracy                           0.42     14159
   macro avg       0.60      0.60      0.42     14159
weighted avg       0.78      0.42      0.41     14159

Accuracy: 0.42
Confusion Matrix: [[3105  212]
 [8045 2797]]

Enhanced Predictions with Sentiment Analysis:

Comment: This is outrageous! There's no way this can be true!
Prediction: 1

Comment: I can't believe people are falling for this. It's just a hoax.
Prediction: 1

Comment: Great news! This is exactly what we needed.
Prediction: 1

Comment: Interesting perspective, I hadn't considered that before.
Prediction: 1
