In [None]:
# Install dependencies
!pip install textblob
!pip install -U scikit-learn
!pip install nrclex
from google.colab import drive

import sys
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from nrclex import NRCLex
import nltk
# Fetch the NLTK resources ('punkt' tokenizer data and the VADER lexicon).
# NOTE(review): presumably required by TextBlob / NRCLex at runtime — confirm
# which of the two is actually needed.
nltk.download('punkt')
nltk.download('vader_lexicon')

# Mount Google Drive at import time. NOTE(review): main() mounts again through
# mount_drive(), which is why "Mounted at /content/drive" is printed twice.
drive.mount('/content/drive', force_remount=True)

# `CombinedAcceptanceIndexer` is a project-local class provided by the
# `usage_examples` module, which must be importable from this notebook.
from usage_examples import CombinedAcceptanceIndexer

def mount_drive():
    """Mount Google Drive at ``/content/drive``, requesting a forced remount."""
    mount_point = '/content/drive'
    drive.mount(mount_point, force_remount=True)

def load_and_filter_data(file_path, nrows=1000):
    """Load the posts CSV and normalise its text columns.

    Args:
        file_path: Path of the CSV file to read.
        nrows: Maximum number of rows to read (``None`` reads the whole
            file). Defaults to 1000, the previously hard-coded limit.

    Returns:
        A DataFrame in which ``comments_orig`` (if present) is renamed to
        ``comments``, ``clean_title`` is a lower-cased string column,
        ``comments`` is a string column, and ``separated_comment`` is
        initialised to the empty string.
    """
    df = pd.read_csv(file_path, low_memory=False, nrows=nrows)
    df = df.rename(columns={'comments_orig': 'comments'})
    # Replace missing values *before* casting to str: ``astype(str)`` alone
    # turns NaN into the literal text 'nan', which would then be scored for
    # sentiment and bypass the empty-comment guard used downstream.
    df['clean_title'] = df['clean_title'].fillna('').astype(str).str.lower()
    df['comments'] = df['comments'].fillna('').astype(str)
    df['separated_comment'] = ''
    return df

def get_sentiment(text):
    """Return the TextBlob ``(polarity, subjectivity)`` pair for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

def preprocess_data(df):
    """Add sentiment, split-comment and acceptance-index columns to *df*.

    The frame is modified in place and also returned. Columns added or
    overwritten: ``comments_polarity``, ``comments_subjectivity``,
    ``clean_title_polarity``, ``clean_title_subjectivity``,
    ``separated_comment`` and ``acceptance_index``.

    Args:
        df: DataFrame with string columns ``comments`` and ``clean_title``.

    Returns:
        The same DataFrame object, with the extra columns populated.
    """
    # Build each column from a plain list instead of ``zip(*...)`` unpacking:
    # unpacking an empty zip raises ValueError, so an empty frame used to crash.
    for column in ('comments', 'clean_title'):
        scores = [get_sentiment(text) for text in df[column]]
        df[f'{column}_polarity'] = [polarity for polarity, _ in scores]
        df[f'{column}_subjectivity'] = [subjectivity for _, subjectivity in scores]

    df['separated_comment'] = df['comments'].apply(lambda x: x.split('|__|'))

    # Apply CombinedAcceptanceIndexer (via calculate_acceptance_index) to each
    # title/comments pair. Iterating the two columns directly avoids the
    # row-wise ``df.apply(axis=1)``, which is slow and returns a DataFrame
    # rather than a Series when the frame has no rows.
    df['acceptance_index'] = [
        calculate_acceptance_index(title, comments)
        for title, comments in zip(df['clean_title'], df['comments'])
    ]

    return df

def calculate_acceptance_index(title, comments):
    """Compute the acceptance index of a post from its title and comments.

    Args:
        title: Post title handed to ``CombinedAcceptanceIndexer``.
        comments: All comments of the post joined with ``'|__|'``.

    Returns:
        ``0`` when *comments* is missing or empty, otherwise the value
        returned by ``CombinedAcceptanceIndexer.calculate_acceptance_index``.
    """
    has_comments = not pd.isna(comments) and comments != ''
    if not has_comments:
        return 0
    return CombinedAcceptanceIndexer(
        title, comments.split('|__|')
    ).calculate_acceptance_index()

def prepare_features_and_labels(df):
    """Split *df* into train/test feature matrices and encoded labels.

    Args:
        df: Preprocessed DataFrame containing the feature columns listed
            below plus the target column ``2_way_label``.

    Returns:
        A tuple ``((X_train, X_test, y_train, y_test), label_encoder,
        preprocessor)`` where the feature matrices are already transformed
        by ``preprocessor`` (fitted on the training split only) and the
        labels are encoded by ``label_encoder``.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from sklearn.preprocessing import StandardScaler

    numerical_features = ['score', 'num_comments', 'upvote_ratio', 'comments_polarity',
                          'comments_subjectivity', 'clean_title_polarity', 'clean_title_subjectivity', 'acceptance_index']

    # Take an explicit copy: assigning into a column selection of ``df``
    # triggers pandas' SettingWithCopyWarning and may not write where intended.
    X = df[['clean_title', 'score', 'separated_comment', 'num_comments', 'upvote_ratio',
            'comments_polarity', 'comments_subjectivity', 'clean_title_polarity',
            'clean_title_subjectivity', 'acceptance_index']].copy()
    X[numerical_features] = X[numerical_features].fillna(0)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['2_way_label'].astype(str))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    preprocessor = ColumnTransformer(
        transformers=[
            # Standardise the numeric columns: raw counts such as ``score``
            # sit on a very different scale from the TF-IDF weights, which
            # kept the logistic-regression solver from converging.
            ('num', StandardScaler(), numerical_features),
            ('text', TfidfVectorizer(max_features=5000), 'clean_title'),
            #('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['subreddit']),
        ])

    # Fit on the training split only so no test-set statistics leak in.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return (X_train, X_test, y_train, y_test), label_encoder, preprocessor

def build_pipeline(max_iter=500):
    """Build the classification pipeline (a single logistic-regression step).

    Args:
        max_iter: Iteration cap passed to ``LogisticRegression``. Defaults to
            500, the previously hard-coded value; raise it if the solver
            reports that the iteration limit was reached.

    Returns:
        An unfitted ``Pipeline`` whose only step is named ``'classifier'``.
    """
    return Pipeline([
        ('classifier', LogisticRegression(max_iter=max_iter)),
    ])

def evaluate_model(model, X_test, y_test):
    """Print accuracy, confusion matrix and classification report for *model*.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Transformed test features.
        y_test: Encoded true labels for ``X_test``.
    """
    predicted = model.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy:.2f}")

    sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("Classification Report:", classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predicted))

def enhanced_predict_new_comments(model, preprocessor, comments, title):
    """Predict a label for each new comment and attach its sentiment scores.

    Args:
        model: Fitted estimator exposing ``predict``.
        preprocessor: Fitted transformer that turns the feature frame built
            here into the matrix ``model`` expects.
        comments: Iterable of comment strings to score.
        title: Title of the post the comments belong to.

    Returns:
        A list with one dict per comment, holding the keys ``'comment'``,
        ``'prediction'`` (the encoded label produced by ``model``),
        ``'polarity'`` and ``'subjectivity'``. Empty when *comments* is empty.
    """
    comments = list(comments)
    if not comments:
        # Nothing to score; also avoids handing a zero-row frame to the
        # preprocessor and model.
        return []

    title_polarity, title_subjectivity = get_sentiment(title)
    # Score every comment exactly once and reuse the result for both the
    # feature frame and the returned records.
    sentiments = [get_sentiment(comment) for comment in comments]
    count = len(comments)

    # Post-level features that are unknown for fresh comments are set to 0.
    df_comments = pd.DataFrame({
        'clean_title': [title] * count,
        'score': [0] * count,
        'separated_comment': [''] * count,
        'num_comments': [0] * count,
        'upvote_ratio': [0] * count,
        'comments_polarity': [polarity for polarity, _ in sentiments],
        'comments_subjectivity': [subjectivity for _, subjectivity in sentiments],
        'clean_title_polarity': [title_polarity] * count,
        'clean_title_subjectivity': [title_subjectivity] * count,
        'acceptance_index': [0] * count,
    })

    predictions = model.predict(preprocessor.transform(df_comments))

    return [
        {
            'comment': comment,
            'prediction': pred,
            'polarity': polarity,
            'subjectivity': subjectivity,
        }
        for comment, pred, (polarity, subjectivity) in zip(comments, predictions, sentiments)
    ]

def main():
    """Run the demo end to end: load, featurise, train, evaluate, predict."""
    mount_drive()
    dataset = load_and_filter_data(
        "/content/drive/My Drive/datasets/merged_cleaned_data_v34_no_photo.csv"
    )
    splits, _label_encoder, preprocessor = prepare_features_and_labels(
        preprocess_data(dataset)
    )
    X_train, X_test, y_train, y_test = splits

    classifier = build_pipeline()
    classifier.fit(X_train, y_train)

    evaluate_model(classifier, X_test, y_test)

    # Hand-written sample post used to demonstrate prediction on unseen text.
    sample_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    sample_comments = [
        "This is outrageous! There's no way this can be true!",
        "I can't believe people are falling for this. It's just a hoax.",
        "Great news! This is exactly what we needed.",
        "Interesting perspective, I hadn't considered that before."
    ]

    scored = enhanced_predict_new_comments(
        classifier, preprocessor, sample_comments, sample_title
    )
    print("\nEnhanced Predictions with Sentiment Analysis:")
    for entry in scored:
        print(f"\nComment: {entry['comment']}")
        print(f"Prediction: {entry['prediction']}")
        print(f"Polarity: {entry['polarity']:.2f} (Negative to Positive)")
        print(f"Subjectivity: {entry['subjectivity']:.2f} (Objective to Subjective)")

if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Mounted at /content/drive
Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = X[numerical_features].fillna(0)


Accuracy: 0.68
Confusion Matrix:
[[  3  62]
 [  2 133]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.05      0.09        65
           1       0.68      0.99      0.81       135

    accuracy                           0.68       200
   macro avg       0.64      0.52      0.45       200
weighted avg       0.66      0.68      0.57       200


Enhanced Predictions with Sentiment Analysis:

Comment: This is outrageous! There's no way this can be true!
Prediction: 1
Polarity: -0.28 (Negative to Positive)
Subjectivity: 0.82 (Objective to Subjective)

Comment: I can't believe people are falling for this. It's just a hoax.
Prediction: 1
Polarity: 0.00 (Negative to Positive)
Subjectivity: 0.00 (Objective to Subjective)

Comment: Great news! This is exactly what we needed.
Prediction: 1
Polarity: 0.62 (Negative to Positive)
Subjectivity: 0.50 (Objective to Subjective)

Comment: Interesting perspective, I hadn't considered that before.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
