In [None]:
# Install necessary libraries
!pip install textblob scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from textblob import TextBlob
from google.colab import drive

# ... (rest of your functions: load_data, get_sentiment, process_comments, preprocess_data, prepare_features_and_labels) ...


def build_pipeline():
    """Assemble the unfitted text + sentiment SVM pipeline.

    The feature stage combines two branches in a ColumnTransformer:

    * ``'clean_title'`` is vectorised with TF-IDF, capped at 5000 features.
    * The four polarity/subjectivity columns are mean-imputed and then
      standardised.

    The combined features feed a linear-kernel SVC with probability
    estimates enabled and a fixed random seed.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline`` with steps ``'features'``
        and ``'classifier'``.
    """
    sentiment_columns = [
        'clean_title_polarity',
        'clean_title_subjectivity',
        'comments_polarity',
        'comments_subjectivity',
    ]

    # Text branch: a single column name (not a list) so the vectoriser
    # receives a 1-D sequence of documents.
    text_vectoriser = TfidfVectorizer(max_features=5000)

    # Numeric branch: fill gaps first, then scale.
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    feature_stage = ColumnTransformer(transformers=[
        ('tfidf', text_vectoriser, 'clean_title'),
        ('num', numeric_branch, sentiment_columns),
    ])

    svm = SVC(kernel='linear', C=1, probability=True, random_state=42)

    return Pipeline(steps=[
        ('features', feature_stage),
        ('classifier', svm),
    ])


def train_and_evaluate(model, X_train, X_test, y_train, y_test, label_encoder):
    """Fit the model, then print test-set and cross-validation metrics.

    Args:
        model: Unfitted estimator/pipeline exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Held-out test features.
        y_train: Training labels (assumed to be encoded by ``label_encoder``,
            since predictions are later passed to its ``inverse_transform``).
        y_test: Held-out test labels, encoded the same way.
        label_encoder: Fitted encoder whose ``classes_`` name the targets.

    Prints a classification report, a confusion matrix, the test accuracy
    and the mean 5-fold cross-validation accuracy on the training split.
    Returns nothing.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pin the label set explicitly. Without `labels`, scikit-learn infers the
    # classes from y_test/y_pred, so a class missing from both makes
    # `target_names` the wrong length (ValueError) and silently shrinks the
    # confusion matrix.
    encoded_labels = label_encoder.transform(label_encoder.classes_)
    # target_names must be strings for the report's column formatting.
    class_names = [str(name) for name in label_encoder.classes_]

    report = classification_report(
        y_test, y_pred, labels=encoded_labels, target_names=class_names
    )
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=encoded_labels))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Cross-validation score (cross_val_score clones the estimator, so the
    # fitted `model` above is left untouched).
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"📈 Cross-validation Accuracy: {np.mean(cv_scores):.4f}")


def predict_new_comments(model, new_comments, label_encoder):
    """Predict and print labels for new text inputs.

    Each input is lower-cased and used as the ``'clean_title'`` text. Its
    sentiment is also used for both the title and the comment sentiment
    columns, because no separate comment text is available at prediction time.

    Args:
        model: Fitted pipeline expecting the columns ``'clean_title'``,
            ``'clean_title_polarity'``, ``'clean_title_subjectivity'``,
            ``'comments_polarity'`` and ``'comments_subjectivity'``.
        new_comments: Iterable of strings to classify. May be empty.
        label_encoder: Fitted encoder used to map predictions back to the
            original label values.

    Prints the decoded predictions. Returns nothing.
    """
    # Materialise once so any iterable works and emptiness can be checked.
    cleaned = [comment.lower() for comment in new_comments]
    if not cleaned:
        # Unpacking zip(*[]) below would raise; there is nothing to predict.
        print("🔮 Predictions for new comments:", np.array([], dtype=object))
        return

    new_df = pd.DataFrame({'clean_title': cleaned})

    # Score the text once and reuse it for both column pairs.
    # NOTE(review): assumes get_sentiment is deterministic for a given string.
    polarity, subjectivity = zip(*new_df['clean_title'].apply(get_sentiment))
    new_df['clean_title_polarity'] = polarity
    new_df['clean_title_subjectivity'] = subjectivity
    new_df['comments_polarity'] = polarity
    new_df['comments_subjectivity'] = subjectivity

    predictions = model.predict(new_df)
    predicted_labels = label_encoder.inverse_transform(predictions)

    print("🔮 Predictions for new comments:", predicted_labels)



def main():
    """Run the end-to-end workflow: load, train, evaluate, predict.

    Mounts Google Drive, reads and preprocesses the dataset, builds the
    train/test feature frames, trains and evaluates the SVM pipeline, and
    finally prints predictions for a handful of sample comments.
    """

    def _side_by_side(text_part, numeric_part):
        # Drop both indexes so the concat aligns rows by position.
        return pd.concat(
            [text_part.reset_index(drop=True), numeric_part.reset_index(drop=True)],
            axis=1,
        )

    drive.mount('/content/drive', force_remount=True)
    csv_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v24_limited.csv"

    # Read the raw data and run it through preprocessing.
    dataset = preprocess_data(load_data(csv_path))

    # Split into text/numeric features and labels.
    splits, label_encoder = prepare_features_and_labels(dataset)
    train_text, test_text, train_num, test_num, y_train, y_test = splits

    # One frame per split holding both text and numeric columns.
    X_train = _side_by_side(train_text, train_num)
    X_test = _side_by_side(test_text, test_num)

    # Fit and report metrics.
    svm_pipeline = build_pipeline()
    train_and_evaluate(svm_pipeline, X_train, X_test, y_train, y_test, label_encoder)

    # Try the fitted model on a few unseen examples.
    sample_comments = [
        "This is outrageous! There’s no way this can be true! There are no real studies on this.",
        "I can’t believe people are falling for this. It’s just a hoax to get views.",
        "That's totally correct.",
        "I can’t believe people think that's not true",
    ]
    predict_new_comments(svm_pipeline, sample_comments, label_encoder)

# Entry point: run the full workflow only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

Mounted at /content/drive
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       103
           1       0.75      0.79      0.77        97

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.77       200
weighted avg       0.78      0.78      0.78       200

Confusion Matrix:
 [[78 25]
 [20 77]]
Test Accuracy: 0.7750
📈 Cross-validation Accuracy: 0.7188
🔮 Predictions for new comments: ['1' '1' '1' '1']
