In [None]:
# Install necessary libraries
!pip install textblob
!pip install -U scikit-learn

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from google.colab import drive

def mount_drive():
    """Mount Google Drive in the Colab runtime, forcing a remount."""
    mount_point = '/content/drive'
    drive.mount(mount_point, force_remount=True)

def load_data(file_path, sample_size=1000):
    """Load the CSV at *file_path* and return a reproducible random sample.

    Args:
        file_path: Path of the CSV file to read. It must contain the
            ``clean_title`` and ``comments`` columns.
        sample_size: Maximum number of rows to keep. When the file holds
            fewer rows than this, every row is returned (in shuffled order)
            instead of raising.

    Returns:
        A DataFrame with at most *sample_size* rows in which ``clean_title``
        is a lower-cased string column and ``comments`` is a string column.
    """
    df = pd.read_csv(file_path, low_memory=False)
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so clamp the request to what exists.
    n_rows = min(sample_size, len(df))
    df = df.sample(n=n_rows, random_state=42)
    df['clean_title'] = df['clean_title'].astype(str).str.lower()
    df['comments'] = df['comments'].astype(str)
    return df

def get_sentiment(text):
    """Return the ``(polarity, subjectivity)`` pair TextBlob reports for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

def process_comments(row):
    """Average the sentiment of every comment stored in ``row['comments']``.

    The ``comments`` field is expected to hold individual comments joined by
    the ``'|__|'`` separator.

    Args:
        row: A mapping (e.g. a DataFrame row) with a ``'comments'`` entry.

    Returns:
        A ``(mean_polarity, mean_subjectivity)`` tuple. ``(0, 0)`` is returned
        when the field is not a string or contains no non-blank comment.
    """
    try:
        fragments = row['comments'].split('|__|')
    except AttributeError:
        # Non-string value (e.g. None or a float NaN): nothing to score.
        return 0, 0

    # str.split with an explicit separator never returns an empty list (an
    # empty string yields ['']), so blank fragments must be filtered out
    # explicitly; otherwise empty fields and trailing separators would be
    # scored as neutral comments and drag the averages towards zero.
    scores = [get_sentiment(fragment) for fragment in fragments if fragment.strip()]
    if not scores:
        return 0, 0

    polarities, subjectivities = zip(*scores)
    return np.mean(polarities), np.mean(subjectivities)

def preprocess_data(df):
    """Add sentiment and split-comment columns to *df* and return it.

    The frame is modified in place. New columns:
    ``comments_polarity``, ``comments_subjectivity``,
    ``clean_title_polarity``, ``clean_title_subjectivity`` and
    ``separated_comment`` (the ``comments`` string split on ``'|__|'``).
    """
    def split_comments(text):
        return text.split('|__|')

    # Per-row mean sentiment over all comments of a post.
    comment_scores = df.apply(process_comments, axis=1)
    comment_polarity, comment_subjectivity = zip(*comment_scores)
    df['comments_polarity'] = comment_polarity
    df['comments_subjectivity'] = comment_subjectivity

    # Sentiment of the title text itself.
    title_scores = df['clean_title'].apply(get_sentiment)
    title_polarity, title_subjectivity = zip(*title_scores)
    df['clean_title_polarity'] = title_polarity
    df['clean_title_subjectivity'] = title_subjectivity

    df['separated_comment'] = df['comments'].apply(split_comments)
    return df

def prepare_features_and_labels(df):
    """Build the feature frame, encode the labels and split into train/test.

    Rows whose ``2_way_label`` is missing are dropped before encoding;
    otherwise ``astype(str)`` would turn them into a literal ``'nan'`` class
    that the classifier would learn as if it were a real label.

    Args:
        df: Preprocessed DataFrame containing the feature columns selected
            below and the ``2_way_label`` target column.

    Returns:
        A ``(split, label_encoder)`` tuple where ``split`` is the 4-element
        result of ``train_test_split`` (``X_train, X_test, y_train, y_test``)
        and ``label_encoder`` is the fitted ``LabelEncoder``.
    """
    labelled = df[df['2_way_label'].notna()]
    X = labelled[['clean_title', 'score', 'separated_comment', 'subreddit', 'num_comments', 'upvote_ratio']]
    y = labelled['2_way_label'].astype(str)

    # Fit on the observed labels plus a fixed set of extra class names, then
    # transform only the observed labels.
    # NOTE(review): 'true'/'false'/'misleading' look like placeholders for a
    # label scheme that may not occur in the data - confirm they are needed.
    label_encoder = LabelEncoder()
    all_possible_classes = pd.concat([y, pd.Series(['true', 'false', 'misleading'])]).astype(str).unique()
    label_encoder.fit(all_possible_classes)
    y = label_encoder.transform(y)

    # A stratified split needs at least two samples per class, so classes
    # that occur exactly once are removed from both X and y.
    unique_classes, counts = np.unique(y, return_counts=True)
    classes_with_one_sample = unique_classes[counts == 1]
    if classes_with_one_sample.size:
        print(f"Warning: Removing classes with only one sample: {label_encoder.inverse_transform(classes_with_one_sample)}")
        mask = ~np.isin(y, classes_with_one_sample)
        X = X[mask]
        y = y[mask]

    # Stratify so the class distribution is maintained in both subsets.
    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return split, label_encoder

def build_pipeline():
    """Return an unfitted TF-IDF -> random-forest text classification pipeline."""
    vectorizer = TfidfVectorizer(max_features=5000)
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    steps = [
        ('tfidf', vectorizer),
        ('classifier', forest),
    ]
    return Pipeline(steps)

def evaluate_model(model, X_test, y_test, label_encoder):
    """Print a classification report, confusion matrix and accuracy.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Inputs passed to ``model.predict``.
        y_test: Encoded true labels for ``X_test``.
        label_encoder: Encoder whose ``classes_`` supplies the display names.
    """
    predicted = model.predict(X_test)

    # Only name the classes that actually occur in the truth or predictions;
    # the encoder may know more classes than are present here.
    present_labels = np.union1d(y_test, predicted)
    display_names = [label_encoder.classes_[code] for code in present_labels]

    print("Classification Report:")
    report = classification_report(y_test, predicted, target_names=display_names)
    print(report)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, predicted)
    print(matrix)

    accuracy = accuracy_score(y_test, predicted)
    print(f"Test Accuracy: {accuracy}")

def predict_new_comments(model, label_encoder, comments):
    """Predict labels for *comments* and decode them to their original names.

    Args:
        model: Fitted estimator exposing ``predict``.
        label_encoder: Encoder exposing ``inverse_transform``.
        comments: Texts to classify.

    Returns:
        Whatever ``label_encoder.inverse_transform`` yields for the model's
        predictions.
    """
    encoded = model.predict(comments)
    decoded = label_encoder.inverse_transform(encoded)
    return decoded

def main():
    """Run the whole workflow: load, preprocess, train, evaluate, predict."""
    mount_drive()

    file_path = "/content/drive/My Drive/datasets/merged_data_v14_features.csv" # Replace with your actual file path
    raw_df = load_data(file_path)
    df = preprocess_data(raw_df)

    split, label_encoder = prepare_features_and_labels(df)
    X_train, X_test, y_train, y_test = split

    # Only the title text is fed to the TF-IDF pipeline.
    model = build_pipeline()
    model.fit(X_train['clean_title'], y_train)
    evaluate_model(model, X_test['clean_title'], y_test, label_encoder)

    new_comments = [
        "This is outrageous! There’s no way this can be true!",
        "I can’t believe people are falling for this. It’s just a hoax.",
    ]
    predicted = predict_new_comments(model, label_encoder, new_comments)
    print("Predictions for new comments:", predicted)

# Run the workflow only when this module is executed directly (its
# __name__ is "__main__"), not when it is imported by another module.
if __name__ == "__main__":
    main()

Mounted at /content/drive
Classification Report:
              precision    recall  f1-score   support

         0.0       0.63      0.92      0.75       110
         1.0       0.77      0.33      0.47        90

    accuracy                           0.66       200
   macro avg       0.70      0.63      0.61       200
weighted avg       0.69      0.66      0.62       200

Confusion Matrix:
[[101   9]
 [ 60  30]]
Test Accuracy: 0.655
Predictions for new comments: ['0.0' '0.0']
