In [1]:
!pip install textblob
!pip install -U scikit-learn
!pip install nrclex
from google.colab import drive

import sys
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer # Not used in this version
from nrclex import NRCLex
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon') # Not needed for TextBlob sentiment

drive.mount('/content/drive',force_remount=True)
# Add the 'libs' directory to the system path
drive_libs_path = '/content/drive/MyDrive/libs'
sys.path.append(drive_libs_path)

from usage_examples import get_acceptance_indexes, CombinedAcceptanceIndexer
from noise_filter import NoiseFilter

def mount_drive():
  drive.mount('/content/drive', force_remount=True)

def load_and_filter_data(file_path):
    df = pd.read_csv(file_path, low_memory=False,nrows=1000)

    # Apply noise filters (using example)
    filter_config = {
        'MinLengths': {
            'clean_title': 5,
            'comments': 5
        },
        'ExcludeImages': True
    }

    noise_filter = NoiseFilter(df, filter_config)
    filtered_df = noise_filter.apply() # Assuming 'apply' method exists in NoiseFilter

    # Standard preprocessing
    filtered_df['clean_title'] = filtered_df['clean_title'].astype(str).str.lower()
    filtered_df['comments'] = filtered_df['comments'].astype(str)
    filtered_df['separated_comment'] = ''

    return filtered_df

def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

def enhanced_preprocess_data(df):
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df['comments'].apply(get_sentiment))
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(get_sentiment))
    df['separated_comment'] = df['comments'].apply(lambda x: x.split('|__|'))

    # Add acceptance index features (placeholder)
    df['acceptance_index'] = df.apply(
        lambda row: calculate_acceptance_index(row['clean_title'], row['comments']),
        axis=1
    )

    return df

def calculate_acceptance_index(title, comments):
    if pd.isna(comments) or comments == '':
        return 0

    comment_list = comments.split('|__|')
    indexer = CombinedAcceptanceIndexer(title, comment_list) # Using placeholder
    return indexer.calculate_acceptance_index() # Assuming this method exists


def prepare_features_and_labels(df):
    X = df[['clean_title', 'score', 'separated_comment', 'subreddit',
           'num_comments', 'upvote_ratio', 'comments_polarity',
           'comments_subjectivity', 'clean_title_polarity',
           'clean_title_subjectivity', 'acceptance_index']]

    numerical_features = ['score', 'num_comments', 'upvote_ratio',
                         'comments_polarity', 'comments_subjectivity',
                         'clean_title_polarity', 'clean_title_subjectivity',
                         'acceptance_index']

   # Impute missing numerical values with 0 and ensure numeric type
    for feature in numerical_features:
        if feature not in df.columns:
            df[feature] = 0
        else:
            # Convert to numeric, coerce errors to NaN, then fill NaN with 0
            df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)

    X[numerical_features] = X[numerical_features].fillna(0)

    y = df['2_way_label'].astype(str)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Ensure numerical features are of numeric type before clipping:
    X_train[numerical_features] = X_train[numerical_features].apply(pd.to_numeric, errors='coerce').fillna(0)
    X_test[numerical_features] = X_test[numerical_features].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Now that X_train and X_test are defined, apply the clipping:
    X_train[numerical_features] = X_train[numerical_features].clip(lower=0)  # Clip values to be >= 0
    X_test[numerical_features] = X_test[numerical_features].clip(lower=0)  # Clip values to be >= 0


    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_text = tfidf_vectorizer.fit_transform(X_train['clean_title'])
    X_test_text = tfidf_vectorizer.transform(X_test['clean_title'])

    # One-Hot Encoding for subreddit
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_train_cat = encoder.fit_transform(X_train[['subreddit']])
    X_test_cat = encoder.transform(X_test[['subreddit']])

    # Combine features
    X_train = np.concatenate([X_train_text.toarray(), X_train_cat, X_train[numerical_features].values], axis=1)
    X_test = np.concatenate([X_test_text.toarray(), X_test_cat, X_test[numerical_features].values], axis=1)


    return (X_train, X_test, y_train, y_test), label_encoder, tfidf_vectorizer, encoder
def enhanced_predict_new_comments(model, label_encoder, tfidf_vectorizer, encoder, comments,title):
    title_polarity, title_subjectivity = get_sentiment(title)
    df_comments = pd.DataFrame({
        'clean_title': [title] * len(comments),
        'score': [0] * len(comments),
        'subreddit': ['unknown'] * len(comments),
        'num_comments': [0] * len(comments),
        'upvote_ratio': [0] * len(comments),
        'separated_comment': [''] * len(comments),
        'comments_polarity': [TextBlob(c).sentiment.polarity for c in comments],
        'comments_subjectivity': [TextBlob(c).sentiment.subjectivity for c in comments],
        'clean_title_polarity': [title_polarity] * len(comments),
        'clean_title_subjectivity': [title_subjectivity] * len(comments),
        'acceptance_index': [0] * len(comments)
    })

    # Apply TF-IDF and One-Hot Encoding
    comments_text_transformed = tfidf_vectorizer.transform(df_comments['clean_title'])
    comments_cat_transformed = encoder.transform(df_comments[['subreddit']])

    # Combine features
    comments_transformed = np.concatenate([comments_text_transformed.toarray(),
                                          comments_cat_transformed,
                                          df_comments[['score', 'num_comments', 'upvote_ratio',
                                                      'comments_polarity', 'comments_subjectivity',
                                                      'clean_title_polarity', 'clean_title_subjectivity',
                                                      'acceptance_index']].values], axis=1)

    predictions = model.predict(comments_transformed)

    results = []
    for comment, pred in zip(comments, predictions):
        blob = TextBlob(comment)
        results.append({
            'comment': comment,
            'prediction': label_encoder.inverse_transform([pred])[0],
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        })

    return results

def build_pipeline():
  model = Pipeline([
      ('classifier', MultinomialNB())
  ])
  return model

def evaluate_model(model, X_test, y_test, label_encoder):
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
  print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
  print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")



def main():
    # Mount drive and load data
    mount_drive()
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v31_news.csv"
    df = load_and_filter_data(file_path)

    # Enhanced preprocessing
    processed_df = enhanced_preprocess_data(df)

    # Prepare features and train model
    (X_train, X_test, y_train, y_test), label_encoder, tfidf_vectorizer, encoder = prepare_features_and_labels(processed_df)
    model = build_pipeline()
    model.fit(X_train, y_train)

    # Evaluate
    evaluate_model(model, X_test, y_test, label_encoder)

    # Predict on new comments with sentiment analysis
    new_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    new_comments = [
        "This is outrageous! There's no way this can be true!",
        "I can't believe people are falling for this. It's just a hoax.",
        "Great news! This is exactly what we needed.",
        "Interesting perspective, I hadn't considered that before."
    ]

    predictions = enhanced_predict_new_comments(model, label_encoder, tfidf_vectorizer, encoder, new_comments,new_title)

    print("\nEnhanced Predictions with Sentiment Analysis:")
    for result in predictions:
        print(f"\nComment: {result['comment']}")
        print(f"Prediction: {result['prediction']}")
        print(f"Polarity: {result['polarity']:.2f} (Negative to Positive)")
        print(f"Subjectivity: {result['subjectivity']:.2f} (Objective to Subjective)")

if __name__ == "__main__":
    main()

Collecting nrclex
  Downloading NRCLex-4.0-py3-none-any.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of nrclex to determine which version is compatible with other requirements. This could take a while.
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nrclex
  Building wheel for nrclex (setup.py) ... [?25l[?25hdone
  Created wheel for nrclex: filename=NRCLex-3.0.0-py3-none-any.whl size=43308 sha256=90ee2675b0815b65c1912ca05935407e39c23d83be709827b46fbcd3a05a230a
  Stored in directory: /root/.cache/pip/wheels/ed/ac/fa/7afddefd14f51c4a963ed291b9052746ed3929473e5a33118d
Successfully built nrclex
Installing collected packages: nrclex
Successfully installed nrclex-3.0.0


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Mounted at /content/drive
Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = X[numerical_features].fillna(0)


ValueError: Negative values in data passed to MultinomialNB (input X).