In [None]:
# Install necessary libraries
!pip install textblob scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from google.colab import drive

# ... (rest of your functions: load_data, get_sentiment, process_comments, preprocess_data, prepare_features_and_labels) ...

def load_data(file_path):
    """Loads data from a CSV file into a Pandas DataFrame."""
    df = pd.read_csv(file_path)
    return df

def get_sentiment(text):
    """Analyzes the sentiment of a text using TextBlob."""
    analysis = TextBlob(text)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity

def process_comments(comments):
    """Performs basic cleaning on comments text."""
    # Example: Lowercasing
    # Convert comments to strings and handle missing values
    comments = comments.astype(str).str.lower()  # Explicitly convert to strings
    comments = comments.replace('nan', '', regex=True)  # Replace 'nan' strings with empty strings
    return comments

def preprocess_data(df):
    """Applies preprocessing steps to the DataFrame."""
    # Example: Clean comments
    df['clean_title'] = process_comments(df['title'])

    # Add sentiment features
    # Handle potential errors while getting sentiment by skipping non-string values
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(lambda x: get_sentiment(x) if isinstance(x, str) else (np.nan, np.nan)))
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df['clean_title'].apply(lambda x: get_sentiment(x) if isinstance(x, str) else (np.nan, np.nan)))  # Sentiment of comments using clean title for now
    return df

def prepare_features_and_labels(df):
    """Prepares features and labels for model training."""
    # Assume 'label' column exists for target variable
    X = df[['clean_title', 'clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity']]
    y = df['label']

    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Separate text and numerical features
    X_train_text = X_train[['clean_title']]
    X_test_text = X_test[['clean_title']]
    X_train_num = X_train[['clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity']]
    X_test_num = X_test[['clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity']]

    return (X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test), label_encoder

def build_pipeline():
    """Builds an SVM pipeline with TF-IDF and numerical sentiment features."""
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=5000), 'clean_title'),  # Changed 'text' to 'clean_title'
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), ['clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity'])
        ]
    )

    model = Pipeline([
        ('features', preprocessor),
        ('classifier', SVC(kernel='linear', C=1, probability=True, random_state=42))
    ])

    return model


def train_and_evaluate(model, X_train, X_test, y_train, y_test, label_encoder):
    """Trains and evaluates the SVM model."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"📈 Cross-validation Accuracy: {np.mean(cv_scores):.4f}")


def predict_new_comments(model, new_comments, label_encoder):
    """Predicts labels for new text inputs."""
    new_df = pd.DataFrame({'clean_title': [comment.lower() for comment in new_comments]})  # Changed 'text' to 'clean_title'
    new_df['clean_title_polarity'], new_df['clean_title_subjectivity'] = zip(*new_df['clean_title'].apply(get_sentiment))
    new_df['comments_polarity'], new_df['comments_subjectivity'] = zip(*new_df['clean_title'].apply(get_sentiment))  # Changed 'text' to 'clean_title'

    # Add other required columns with default values (if needed)
    # For example, if your model expects 'score', 'separated_comment', etc., add them here
    # new_df['score'] = 0
    # new_df['separated_comment'] = ''
    # ...

    predictions = model.predict(new_df)
    predicted_labels = label_encoder.inverse_transform(predictions)

    print("🔮 Predictions for new comments:", predicted_labels)



def main():
    """Main function to execute the full pipeline."""
    drive.mount('/content/drive', force_remount=True)
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v31_news.csv"

    # Load and preprocess data
    df = load_data(file_path)
    df = preprocess_data(df)

    # Prepare features and labels
    (X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test), label_encoder = prepare_features_and_labels(df)

    # Merge text and numerical features into a single DataFrame
    X_train = pd.concat([X_train_text.reset_index(drop=True), X_train_num.reset_index(drop=True)], axis=1)
    X_test = pd.concat([X_test_text.reset_index(drop=True), X_test_num.reset_index(drop=True)], axis=1)

    # Build and train model
    model = build_pipeline()
    train_and_evaluate(model, X_train, X_test, y_train, y_test, label_encoder)

    # Predict new comments
    new_comments = [
        "This is outrageous! There’s no way this can be true! There are no real studies on this.",
        "I can’t believe people are falling for this. It’s just a hoax to get views.",
        "That's totally correct.",
        "I can’t believe people think that's not true"
    ]
    predict_new_comments(model, new_comments, label_encoder)

if __name__ == "__main__":
    main()

Mounted at /content/drive


KeyError: 'label'

In [None]:
!pip install textblob
!pip install -U scikit-learn
from google.colab import drive

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer # Import ColumnTransformer
from textblob import TextBlob



def mount_drive():
    drive.mount('/content/drive', force_remount=True)

def load_data(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    df['clean_title'] = df['clean_title'].astype(str).str.lower()
    df['comments'] = df['comments'].astype(str)
    df['separated_comment'] = ''  # Initialize as empty strings
    return df

def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

def process_comments(row):
    try:
        comments = row['comments'].split('|__|')
    except AttributeError:
        return 0, 0
    polarities, subjectivities = zip(*[get_sentiment(str(comment)) for comment in comments]) if comments else ([0], [0])
    return np.mean(polarities), np.mean(subjectivities)

def preprocess_data(df):
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df.apply(process_comments, axis=1))
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(get_sentiment))
    df['separated_comment'] = df['comments'].apply(lambda x: x.split('|__|'))
    return df

def prepare_features_and_labels(df):
    # Select features for the model
    X = df[['clean_title', 'score','separated_comment','subreddit', 'num_comments', 'upvote_ratio']]

    # *** Handle missing values (NaNs) *** Impute numerical features first
    numerical_features = ['score', 'num_comments', 'upvote_ratio']
    for feature in numerical_features:
        X[feature] = pd.to_numeric(X[feature], errors='coerce')  # Convert to numeric, invalid parsing will be set as NaN
    X[numerical_features] = X[numerical_features].fillna(X[numerical_features].mean())


    # Target variable
    y = df['2_way_label'].astype(str)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)


    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create a ColumnTransformer to apply TF-IDF to text data,
    # passthrough for numerical features, and OneHotEncode 'subreddit'
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', ['score', 'num_comments', 'upvote_ratio']),
            ('text', TfidfVectorizer(max_features=5000), 'clean_title'),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['subreddit']), # Applying OneHotEncoder to 'subreddit'
        ])

    # Fit and transform the training data
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    # *** Clip negative values to 0 ***
    # This ensures all values passed to MultinomialNB are non-negative
    X_train = X_train.toarray()  # Convert to dense array if necessary
    X_test = X_test.toarray()    # Convert to dense array if necessary

    X_train[X_train < 0] = 0
    X_test[X_test < 0] = 0

    return (X_train, X_test, y_train, y_test), label_encoder, preprocessor # Return preprocessor


def build_pipeline():
    # No changes here, the pipeline uses the preprocessor now
    return Pipeline([
        ('classifier', MultinomialNB())
    ])

def evaluate_model(model, X_test, y_test, label_encoder):
    y_pred = model.predict(X_test)
    print("Classification Report:")
    unique_labels = np.unique(y_test)
    target_names = label_encoder.classes_[unique_labels]
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

def predict_new_comments(model, label_encoder, preprocessor, comments):
    # Preprocess the new comments using the same preprocessor
    comments_transformed = preprocessor.transform(pd.DataFrame({'clean_title': comments,
                                                                'score': [0] * len(comments),  # Dummy values
                                                                'subreddit': ['unknown'] * len(comments),  # Use 'unknown' or another placeholder string
                                                                'num_comments': [0] * len(comments),  # Dummy values
                                                                'upvote_ratio': [0] * len(comments)}))  # Dummy values
    predictions = model.predict(comments_transformed)
    return label_encoder.inverse_transform(predictions)

def main():
    mount_drive()
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v31_news.csv"
    df = preprocess_data(load_data(file_path))
    (X_train, X_test, y_train, y_test), label_encoder, preprocessor = prepare_features_and_labels(df) # Get preprocessor
    model = build_pipeline()
    model.fit(X_train, y_train)
    evaluate_model(model, X_test, y_test, label_encoder)
    new_comments = ["This is outrageous! There’s no way this can be true!", "I can’t believe people are falling for this. It’s just a hoax."]
    # Pass the preprocessor to predict_new_comments
    print("Predictions for new comments:", predict_new_comments(model, label_encoder, preprocessor, new_comments))

if __name__ == "__main__":
    main()

Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = pd.to_numeric(X[feature], errors='coerce')  # Convert to numeric, invalid parsing will be set as NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = X[numerical_features].fillna(X[numerical_features].mean())


Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      0.90      0.95      2582
         nan       0.00      0.00      0.00         2

    accuracy                           0.90      2584
   macro avg       0.50      0.45      0.47      2584
weighted avg       1.00      0.90      0.95      2584

Confusion Matrix:
[[2326  256]
 [   2    0]]
Test Accuracy: 0.9001547987616099
Predictions for new comments: ['nan' 'nan']
