In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer # Import ColumnTransformer
from sklearn.impute import SimpleImputer # Import SimpleImputer
from textblob import TextBlob
from google.colab import drive


def load_data(file_path, sample_size=1000):
    """Read the dataset CSV, draw a reproducible sample and normalise text columns.

    Args:
        file_path: Path of the CSV file to read.
        sample_size: Number of rows to sample. It is capped at the number of
            rows actually present so small files no longer raise.

    Returns:
        A DataFrame with a lower-cased string ``clean_title`` column, a string
        ``comments`` column (renamed from ``comments_orig`` when present) and
        an empty-string ``separated_comment`` placeholder column.
    """
    df = pd.read_csv(file_path, low_memory=False)

    # DataFrame.sample raises when n exceeds the population (replace=False).
    if sample_size is not None:
        sample_size = min(sample_size, len(df))
    df = df.sample(n=sample_size, random_state=42)

    df = df.rename(columns={'comments_orig': 'comments'})

    def _as_text(value):
        # A plain astype(str) would turn missing values into the literal
        # text 'nan', which would then be scored and vectorised as a word.
        return '' if pd.isna(value) else str(value)

    df['clean_title'] = df['clean_title'].map(_as_text).astype(str).str.lower()
    df['comments'] = df['comments'].map(_as_text).astype(str)
    df['separated_comment'] = ''  # Filled in later by preprocess_data
    return df

def get_sentiment(text):
    """Return the ``(polarity, subjectivity)`` pair TextBlob reports for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

def process_comments(row):
    """Average the sentiment of every comment stored in ``row['comments']``.

    The comments are expected as one string joined with the ``'|__|'``
    delimiter. Blank fragments (for example from a trailing delimiter or an
    empty cell) are skipped so they do not pull the averages toward zero.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'comments'`` entry.

    Returns:
        ``(mean_polarity, mean_subjectivity)``; ``(0.0, 0.0)`` when the value
        is not a string or contains no non-blank comment.
    """
    try:
        fragments = row['comments'].split('|__|')
    except AttributeError:
        # Non-string value (e.g. NaN or None): nothing to score.
        return 0.0, 0.0

    scores = [get_sentiment(str(fragment)) for fragment in fragments if fragment.strip()]
    if not scores:
        return 0.0, 0.0

    polarities, subjectivities = zip(*scores)
    return np.mean(polarities), np.mean(subjectivities)

def preprocess_data(df):
    """Add sentiment columns and the split comment list to *df* in place.

    Writes ``comments_polarity``/``comments_subjectivity`` (per-row result of
    ``process_comments``), ``clean_title_polarity``/``clean_title_subjectivity``
    (result of ``get_sentiment`` on the title) and ``separated_comment`` (the
    ``comments`` string split on ``'|__|'``), then returns the same DataFrame.
    """
    def _split_comments(joined):
        return joined.split('|__|')

    # Row-wise sentiment of the joined comments.
    comment_scores = df.apply(process_comments, axis=1)
    comment_polarity, comment_subjectivity = zip(*comment_scores)
    df['comments_polarity'] = comment_polarity
    df['comments_subjectivity'] = comment_subjectivity

    # Sentiment of the title text.
    title_scores = df['clean_title'].apply(get_sentiment)
    title_polarity, title_subjectivity = zip(*title_scores)
    df['clean_title_polarity'] = title_polarity
    df['clean_title_subjectivity'] = title_subjectivity

    df['separated_comment'] = df['comments'].apply(_split_comments)
    return df

def prepare_features_and_labels(df):
    """Select model inputs, encode the target and create the train/test split.

    Args:
        df: DataFrame containing the feature columns listed below and the
            ``'2_way_label'`` target column.

    Returns:
        ``((X_train, X_test, y_train, y_test), label_encoder)`` where the
        ``y_*`` arrays hold integer-encoded labels and ``label_encoder`` maps
        them back to the original (string) labels.
    """
    # Features and label selection
    X = df[['clean_title', 'score', 'separated_comment', 'num_comments', 'upvote_ratio']]
    y = df['2_way_label'].astype(str)

    # Encode on the full label set *before* splitting: fitting on the training
    # split only makes transform() raise if a class shows up only in the test
    # split. Knowing the set of class names leaks no information.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Now split the data after encoding
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    return (X_train, X_test, y_train, y_test), label_encoder

def build_pipeline(feature_columns=None):
    """Build the TF-IDF + numeric-features logistic-regression pipeline.

    Args:
        feature_columns: Column names of the training frame, used to pick up
            any ``subreddit_*`` columns as extra numeric features. When
            omitted, the columns of the module-level ``X_train`` are used
            (the original behaviour), which raises ``NameError`` if that
            global has not been set yet.

    Returns:
        An unfitted scikit-learn pipeline: column preprocessing followed by
        ``LogisticRegression``.
    """
    if feature_columns is None:
        # Backward-compatible fallback for callers that publish X_train globally.
        feature_columns = X_train.columns

    numeric_columns = ['score', 'num_comments', 'upvote_ratio']
    numeric_columns += [col for col in feature_columns if col.startswith('subreddit_')]

    # Create a ColumnTransformer to apply TfidfVectorizer to text columns and impute numerical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('title_tfidf', TfidfVectorizer(max_features=5000), 'clean_title'),
            # Impute missing values in numerical features with the mean,
            # then pass numerical and one-hot encoded features through unchanged.
            ('num_features', make_pipeline(SimpleImputer(strategy='mean'), 'passthrough'), numeric_columns),
        ],
        remainder='drop'  # Drop features not specified
    )

    # Combine the preprocessor with the classifier in a pipeline
    pipeline = make_pipeline(
        preprocessor,
        LogisticRegression(max_iter=5000, random_state=42),
    )

    return pipeline

def evaluate_model(model, X_test, y_test, label_encoder):
    """Print a classification report, confusion matrix and accuracy for *model*.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features passed to ``model.predict``.
        y_test: Integer-encoded true labels for ``X_test``.
        label_encoder: Fitted encoder whose ``classes_`` supply display names.
    """
    y_pred = model.predict(X_test)

    # The report infers its labels from y_test AND y_pred; building the names
    # from y_test alone goes out of sync when the model predicts a class that
    # is absent from the test split. Use one explicit label set everywhere.
    labels = np.union1d(y_test, y_pred)
    target_names = [str(label_encoder.classes_[i]) for i in labels]

    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=labels))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

def predict_new_comments(model, label_encoder, comments,title):
    """Predict a label for each new comment posted under *title*.

    Args:
        model: Fitted estimator exposing ``predict`` on a DataFrame with the
            training columns.
        label_encoder: Fitted encoder used to map predictions back to labels.
        comments: Sequence of comment strings; one prediction is made per item.
        title: Title shared by all comments (a string, or a one-element list).

    Returns:
        Array of decoded labels, one per comment (empty for no comments).

    Note:
        ``score``, ``num_comments`` and ``upvote_ratio`` are unknown for new
        comments and are filled with ``0``.
    """
    n_rows = len(comments)
    if n_rows == 0:
        return np.array([], dtype=object)

    # `title * n` on a string concatenates it n times into ONE long string;
    # build one (lower-cased, as in load_data) title per row instead.
    if isinstance(title, str):
        titles = [title.lower()] * n_rows
    else:
        titles = [str(t).lower() for t in title] * n_rows

    # Create a DataFrame with the same structure as the training data
    new_comments_df = pd.DataFrame({'clean_title': titles,
                                     'score': [0] * n_rows,
                                     'separated_comment': [[comment] for comment in comments],
                                     'num_comments': [0] * n_rows,
                                     'upvote_ratio': [0] * n_rows})
    predictions = model.predict(new_comments_df)
    return label_encoder.inverse_transform(predictions)

def main():
    """Run the notebook end to end: mount Drive, train, evaluate and demo a prediction."""
    # build_pipeline() reads X_train from module scope, so publish it globally.
    global X_train

    # Mount Google Drive and locate the dataset
    drive.mount('/content/drive', force_remount=True)
    file_path = "/content/drive/My Drive/datasets/merged_cleaned_data_v34_no_photo.csv"

    # Load, enrich and split the data
    df = preprocess_data(load_data(file_path))
    splits, label_encoder = prepare_features_and_labels(df)
    X_train, X_test, y_train, y_test = splits

    # Fit the pipeline on the training split
    model = build_pipeline()
    model.fit(X_train, y_train)

    # Report hold-out performance
    evaluate_model(model, X_test, y_test, label_encoder)

    # Demonstrate prediction on unseen comments
    new_title = "Shocking Discovery Leaves Internet Divided—Hoax or Breakthrough?"
    new_comments = [
        "This is outrageous! There’s no way this can be true!",
        "I can’t believe people are falling for this. It’s just a hoax.",
    ]
    predictions = predict_new_comments(model, label_encoder, new_comments,new_title)
    print("Predictions for new comments:", predictions)

# Run the full workflow only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Mounted at /content/drive
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.02      0.03        59
           1       0.71      1.00      0.83       141

    accuracy                           0.71       200
   macro avg       0.85      0.51      0.43       200
weighted avg       0.79      0.71      0.59       200

Confusion Matrix:
[[  1  58]
 [  0 141]]
Test Accuracy: 0.71
Predictions for new comments: ['1' '1']
