In [5]:
!pip install textblob
!pip install -U scikit-learn
!pip install nrclex
from google.colab import drive

import sys
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nrclex import NRCLex
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

drive.mount('/content/drive',force_remount=True)
# Add the 'libs' directory to the system path
drive_libs_path = '/content/drive/MyDrive/libs'
sys.path.append(drive_libs_path)

from usage_examples import get_acceptance_indexes, CombinedAcceptanceIndexer
from noise_filter import NoiseFilter

def mount_drive():
  drive.mount('/content/drive', force_remount=True)

def load_and_filter_data(file_path):
    df = pd.read_csv(file_path, low_memory=False)

    # Apply noise filters (using example)
    filter_config = {
        'MinLengths': {
            'clean_title': 5,
            'comments': 5
        },
        'ExcludeImages': True
    }

    noise_filter = NoiseFilter(df, filter_config)
    filtered_df = noise_filter.apply()

    # Standard preprocessing
    filtered_df['clean_title'] = filtered_df['clean_title'].astype(str).str.lower()
    filtered_df['comments'] = filtered_df['comments'].astype(str)
    filtered_df['separated_comment'] = ''

    return filtered_df

def get_sentiment(text):
    # Assuming you have loaded the appropriate sentiment model
    #blob = TextBlob(text)
    analyzer = SentimentIntensityAnalyzer()  # Uncomment for Vader sentiment if installed and needed
    scores = analyzer.polarity_scores(text)
    polarity = scores['compound']  # For Vader sentiment
    subjectivity = scores['neu']    # For Vader sentiment
    return polarity, subjectivity
    #return blob.sentiment.polarity, blob.sentiment.subjectivity

def enhanced_preprocess_data(df):
    # Assuming process_comments and get_sentiment are defined and imported
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df['comments'].apply(get_sentiment)) # Assuming process_comments is replaced with get_sentiment for comments
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(get_sentiment))
    df['separated_comment'] = df['comments'].apply(lambda x: x.split('|__|'))
    # Calculate average comment polarity

    # Add acceptance index features
    df['acceptance_index'] = df.apply(
        lambda row: calculate_acceptance_index(row['clean_title'], row['comments']),
        axis=1
    )

    return df

def calculate_acceptance_index(title, comments):
    if pd.isna(comments) or comments == '':
        return 0

    comment_list = comments.split('|__|')
    indexer = CombinedAcceptanceIndexer(title, comment_list)
    return indexer.calculate_acceptance_index()


def prepare_features_and_labels(df):
    X = df[['clean_title', 'score', 'separated_comment',
           'upvote_ratio', 'comments_polarity',
           'comments_subjectivity', 'clean_title_polarity',
           'clean_title_subjectivity', 'acceptance_index']]

    numerical_features = ['score', 'upvote_ratio',
                         'comments_polarity', 'comments_subjectivity',
                         'clean_title_polarity', 'clean_title_subjectivity',
                         'acceptance_index']

    # Impute missing numerical values with 0 (or the mean if preferred)
    for feature in numerical_features:
        if feature not in df.columns:
            df[feature] = 0  # Create a column filled with 0 if the feature is missing
        else:
          df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)  # Convert to numeric and fill NaN with 0
          X[feature] = pd.to_numeric(X[feature], errors='coerce').fillna(0).apply(lambda x: max(0, x))  # Clip values at 0 ensuring non-negativity

   # X[numerical_features] = X[numerical_features].fillna(0)

    y = df['2_way_label'].astype(str)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Create the ColumnTransformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('text', TfidfVectorizer(max_features=5000), 'clean_title')
        ])

    return X, y, label_encoder, preprocessor
def enhanced_predict_new_comments(model, label_encoder, preprocessor, comments,title):
    analyzer = SentimentIntensityAnalyzer()
    df_comments = pd.DataFrame({
        'clean_title': [title] * len(comments),
        'score': [0] * len(comments),
        'upvote_ratio': [0] * len(comments),
        'separated_comment': [''] * len(comments),
        'comments_polarity': [analyzer.polarity_scores(c)['compound'] for c in comments],
        'comments_subjectivity': [1 - analyzer.polarity_scores(c)['neu'] for c in comments],
        'clean_title_polarity': [analyzer.polarity_scores(title)['compound']] * len(comments),  # Replicate to match length
        'clean_title_subjectivity': [1 - analyzer.polarity_scores(title)['neu']] * len(comments),  # Replicate to match length
        'acceptance_index': [0] * len(comments)
    })

    comments_transformed = preprocessor.transform(df_comments)
    predictions = model.predict(comments_transformed)

    results = []
    for comment, pred in zip(comments, predictions):
        scores = analyzer.polarity_scores(comment)
        results.append({
            'comment': comment,
            'prediction': label_encoder.inverse_transform([pred])[0],
            'polarity': scores['compound'],  # Vader compound score for polarity
            'subjectivity': 1 - scores['neu']  # 1 - neutrality as proxy for subjectivity
        })

    return results

def build_pipeline():
  model = Pipeline([
      ('classifier', MultinomialNB())  # Example: using Multinomial Naive Bayes
  ])
  return model

def evaluate_model(model, X_test, y_test, label_encoder):
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
  print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

def main():
    # Mount drive and load data
    mount_drive()
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v31_news.csv"  # Update with your file path
    df = load_and_filter_data(file_path)

    # Enhanced preprocessing
    processed_df = enhanced_preprocess_data(df)

    # Prepare features and labels
    X, y, label_encoder, preprocessor = prepare_features_and_labels(processed_df)

    # Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

    accuracy_scores = []
    classification_reports = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(f"Fold {fold + 1}:")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Build and train the model
        model = build_pipeline()
        model.fit(preprocessor.fit_transform(X_train), y_train)  # Apply preprocessor here

        # Evaluate the model
        y_pred = model.predict(preprocessor.transform(X_test))
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

        accuracy_scores.append(accuracy)
        classification_reports.append(report)

        print(f"Accuracy: {accuracy:.2f}")
        print(report)
        print("-" * 30)

    # Average accuracy across folds
    avg_accuracy = np.mean(accuracy_scores)
    print(f"Average Accuracy across all folds: {avg_accuracy:.2f}")

    # Predict on new comments with sentiment analysis
    new_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    new_comments = [
        "This is outrageous! There's no way this can be true!",
        "I can't believe people are falling for this. It's just a hoax.",
        "Great news! This is exactly what we needed.",
        "Interesting perspective, I hadn't considered that before."
    ]

    # Retrain the model on the entire dataset for prediction
    final_model = build_pipeline()
    final_model.fit(preprocessor.fit_transform(X), y)

    predictions = enhanced_predict_new_comments(final_model, label_encoder, preprocessor, new_comments, new_title)

    print("\nEnhanced Predictions with Sentiment Analysis:")
    for result in predictions:
        print(f"\nComment: {result['comment']}")
        print(f"Prediction: {result['prediction']}")
        print(f"Polarity: {result['polarity']:.2f} (Negative to Positive)")
        print(f"Subjectivity: {result['subjectivity']:.2f} (Objective to Subjective)")

if __name__ == "__main__":
    main()



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Mounted at /content/drive
Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = pd.to_numeric(X[feature], errors='coerce').fillna(0).apply(lambda x: max(0, x))  # Clip values at 0 ensuring non-negativity


Fold 1:
Accuracy: 1.00
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      2271

    accuracy                           1.00      2271
   macro avg       1.00      1.00      1.00      2271
weighted avg       1.00      1.00      1.00      2271

------------------------------
Fold 2:
Accuracy: 1.00
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      2271

    accuracy                           1.00      2271
   macro avg       1.00      1.00      1.00      2271
weighted avg       1.00      1.00      1.00      2271

------------------------------
Fold 3:
Accuracy: 1.00
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      2270

    accuracy                           1.00      2270
   macro avg       1.00      1.00      1.00      2270
weighted avg       1.00      1.00      1.00      2270

------------------------------
Fold 4:
Accuracy: 1