In [None]:
#@title Install necessary libraries
!pip install textblob scikit-learn

#@title Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from google.colab import drive

#@title Mount Google Drive
drive.mount('/content/drive')

#@title Define functions

def get_sentiment(text):
    """Analyzes the sentiment of a text using TextBlob."""
    # Convert the input to string to handle potential non-string values
    text = str(text)
    analysis = TextBlob(text)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity

def preprocess_data(df):
    """Preprocesses the DataFrame by cleaning and adding sentiment features."""
    # Assume 'clean_title' is the text column and 'label' is the target column
    # Fill NaN values in 'clean_title' with empty strings before applying sentiment analysis
    df['clean_title'].fillna('', inplace=True)
    df['clean_title_polarity'], df['clean_title_subjectivity'] = zip(*df['clean_title'].apply(get_sentiment))
    df['comments_polarity'], df['comments_subjectivity'] = zip(*df['clean_title'].apply(get_sentiment))  # Using 'clean_title' for comments as well (adjust if needed)
    return df

def prepare_features_and_labels(df):
    """Prepares features and labels for model training."""
    # Assume 'clean_title' is the text column and 'label' is the target column
    X_text = df[['clean_title']]
    X_num = df[['clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity']]
    y = df['2_way_label']

    # Convert labels to strings before encoding
    y = y.astype(str)  # This line is added

    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Split data
    X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
        X_text, X_num, y, test_size=0.2, random_state=42
    )
    return (X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test), label_encoder
def build_pipeline():
    """Builds an SVM pipeline with TF-IDF and numerical sentiment features."""
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=5000), 'clean_title'),
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), ['clean_title_polarity', 'clean_title_subjectivity', 'comments_polarity', 'comments_subjectivity'])
        ]
    )

    model = Pipeline([
        ('features', preprocessor),
        ('classifier', SVC(kernel='linear', C=1, probability=True, random_state=42))
    ])

    return model

def train_and_evaluate(model, X_train, X_test, y_train, y_test, label_encoder):
    """Trains and evaluates the SVM model."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"📈 Cross-validation Accuracy: {np.mean(cv_scores):.4f}")

def predict_new_comments(model, new_comments, label_encoder):
    """Predicts labels for new text inputs."""
    new_df = pd.DataFrame({'clean_title': [comment.lower() for comment in new_comments]})
    new_df['clean_title_polarity'], new_df['clean_title_subjectivity'] = zip(*new_df['clean_title'].apply(get_sentiment))
    new_df['comments_polarity'], new_df['comments_subjectivity'] = zip(*new_df['clean_title'].apply(get_sentiment))

    predictions = model.predict(new_df)
    predicted_labels = label_encoder.inverse_transform(predictions)

    print("🔮 Predictions for new comments:", predicted_labels)

#@title Main execution block
def main():
    """Main function to execute the full pipeline."""
    # Replace with your actual file path
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v31_news.csv"

    # Load data (replace with your data loading logic)
    df = pd.read_csv(file_path)

    # Preprocess data
    df = preprocess_data(df)

    # Prepare features and labels
    (X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test), label_encoder = prepare_features_and_labels(df)

    # Merge text and numerical features
    X_train = pd.concat([X_train_text.reset_index(drop=True), X_train_num.reset_index(drop=True)], axis=1)
    X_test = pd.concat([X_test_text.reset_index(drop=True), X_test_num.reset_index(drop=True)], axis=1)

    # Build and train model
    model = build_pipeline()
    train_and_evaluate(model, X_train, X_test, y_train, y_test, label_encoder)

    # Predict new comments
    new_comments = [
        "This is outrageous! There’s no way this can be true! There are no real studies on this.",
        "I can’t believe people are falling for this. It’s just a hoax to get views.",
        "That's totally correct.",
        "I can’t believe people think that's not true"
    ]
    predict_new_comments(model, new_comments, label_encoder)

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['clean_title'].fillna('', inplace=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      2582
         nan       0.00      0.00      0.00         2

    accuracy                           1.00      2584
   macro avg       0.50      0.50      0.50      2584
weighted avg       1.00      1.00      1.00      2584

Confusion Matrix:
 [[2582    0]
 [   2    0]]
Test Accuracy: 0.9992
📈 Cross-validation Accuracy: 0.9985
🔮 Predictions for new comments: ['1.0' '1.0' '1.0' '1.0']
