In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier  
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv(r'C://Users//hp//Downloads//selected_features.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31118 entries, 0 to 31117
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   comment_score       31118 non-null  int64  
 1   controversiality    31118 non-null  int64  
 2   user_is_verified    31118 non-null  float64
 3   user_awardee_karma  31118 non-null  float64
 4   user_awarder_karma  31118 non-null  float64
 5   user_total_karma    31118 non-null  float64
 6   post_score          31118 non-null  int64  
 7   post_upvote_ratio   31118 non-null  float64
 8   total_comments      31118 non-null  int64  
 9   top_words_per_doc   31118 non-null  object 
 10  combined_text       31118 non-null  object 
 11  topics_decoded      31118 non-null  float64
 12  first_topic_title   23817 non-null  object 
 13  sentiment_score     31118 non-null  float64
 14  subreddit_encoded   31118 non-null  int64  
 15  tokens              31118 non-null  object 
dtypes: f

In [3]:
# Split the dataset into features (X) and target variable (y)
X_numeric = data.drop(columns=['controversiality', 'tokens', 'top_words_per_doc', 'combined_text', 
                               'user_is_verified', 'user_awarder_karma', 'user_total_karma', 'user_awardee_karma',
                              'first_topic_title'])
X_text = data['combined_text']
y = data['controversiality']

In [4]:
X_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31118 entries, 0 to 31117
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   comment_score      31118 non-null  int64  
 1   post_score         31118 non-null  int64  
 2   post_upvote_ratio  31118 non-null  float64
 3   total_comments     31118 non-null  int64  
 4   topics_decoded     31118 non-null  float64
 5   sentiment_score    31118 non-null  float64
 6   subreddit_encoded  31118 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 1.7 MB


In [5]:
# Split the data into train and test sets without shuffling
X_numeric_train, X_numeric_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_numeric, X_text, y, test_size=0.2, random_state=42)

In [7]:
# Define transformations for numeric and categorical columns
numeric_features = ['comment_score','post_score', 'post_upvote_ratio', 'total_comments', 'sentiment_score']

numeric_categorical_features = ['subreddit_encoded']

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Use drop='first' to avoid multicollinearity
])

In [8]:
# Combine numeric and categorical transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, numeric_categorical_features)
    ])

In [9]:
'''from sklearn.manifold import TSNE

# Fit t-SNE after preprocessing
tsne = TSNE(n_components=2, random_state=42, init="random")

# Preprocess and fit the t-SNE model
X_train_processed = preprocessor.fit_transform(X_numeric_train)

# Fit t-SNE on the preprocessed data
X_tsne_train = tsne.fit_transform(X_train_processed)

# Plot class separability
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_tsne_train[:, 0], y=X_tsne_train[:, 1], hue=y_train, palette='Set1', alpha=0.8)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('Class Separability Plot using t-SNE')
plt.legend(title='Controversiality')
plt.show()'''

'from sklearn.manifold import TSNE\n\n# Fit t-SNE after preprocessing\ntsne = TSNE(n_components=2, random_state=42, init="random")\n\n# Preprocess and fit the t-SNE model\nX_train_processed = preprocessor.fit_transform(X_numeric_train)\n\n# Fit t-SNE on the preprocessed data\nX_tsne_train = tsne.fit_transform(X_train_processed)\n\n# Plot class separability\nplt.figure(figsize=(10, 6))\nsns.scatterplot(x=X_tsne_train[:, 0], y=X_tsne_train[:, 1], hue=y_train, palette=\'Set1\', alpha=0.8)\nplt.xlabel(\'t-SNE Component 1\')\nplt.ylabel(\'t-SNE Component 2\')\nplt.title(\'Class Separability Plot using t-SNE\')\nplt.legend(title=\'Controversiality\')\nplt.show()'

In [10]:
'''import numpy as np

# Preprocess text data
X_text_train_transformed = text_transformer.fit_transform(X_text_train)

# Concatenate processed numeric and text data
X_train_processed_concatenated = np.concatenate((X_train_processed.toarray(), X_text_train_transformed.toarray()), axis=1)

# Fit t-SNE on the concatenated preprocessed data
X_tsne_train_concatenated = tsne.fit_transform(X_train_processed_concatenated)

# Plot class separability for the concatenated data
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_tsne_train_concatenated[:, 0], y=X_tsne_train_concatenated[:, 1], hue=y_train, palette='Set1', alpha=0.8)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('Class Separability Plot using t-SNE (Numeric + Text)')
plt.legend(title='Controversiality')
plt.show()'''

"import numpy as np\n\n# Preprocess text data\nX_text_train_transformed = text_transformer.fit_transform(X_text_train)\n\n# Concatenate processed numeric and text data\nX_train_processed_concatenated = np.concatenate((X_train_processed.toarray(), X_text_train_transformed.toarray()), axis=1)\n\n# Fit t-SNE on the concatenated preprocessed data\nX_tsne_train_concatenated = tsne.fit_transform(X_train_processed_concatenated)\n\n# Plot class separability for the concatenated data\nplt.figure(figsize=(10, 6))\nsns.scatterplot(x=X_tsne_train_concatenated[:, 0], y=X_tsne_train_concatenated[:, 1], hue=y_train, palette='Set1', alpha=0.8)\nplt.xlabel('t-SNE Component 1')\nplt.ylabel('t-SNE Component 2')\nplt.title('Class Separability Plot using t-SNE (Numeric + Text)')\nplt.legend(title='Controversiality')\nplt.show()"

In [11]:
# Define the text transformer
text_transformer = TfidfVectorizer(max_features=12000)

In [12]:
# Define RandomForestClassifier pipeline
xg_boost_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(scale_pos_weight=1.5)) 
])

naive_bayes_model = Pipeline([
    ('text', text_transformer),
    ('classifier', MultinomialNB())
])

In [13]:
# Define and fit logistic regression model
naive_bayes_model.fit(X_text_train, y_train)
xg_boost_model.fit(X_numeric_train, y_train)

In [14]:
# Train base models and generate predictions on validation set
xg_boost_preds_val = xg_boost_model.predict(X_numeric_test)
naive_bayes_preds_val = naive_bayes_model.predict(X_text_test)

# Calculate accuracy of each base model on validation set
xg_boost_accuracy = accuracy_score(y_test, xg_boost_preds_val)
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_preds_val)

print(xg_boost_accuracy)
print(naive_bayes_accuracy)

# Assign weights based on performance
total_accuracy = xg_boost_accuracy + naive_bayes_accuracy
xg_boost_weight = xg_boost_accuracy / total_accuracy
naive_bayes_weight = naive_bayes_accuracy / total_accuracy

print(xg_boost_weight)
print(naive_bayes_weight)

0.8473650385604113
0.7180269922879178
0.5413117109719798
0.4586882890280201


In [15]:
# Combine predictions using weighted averaging
weighted_avg_preds = (xg_boost_weight * xg_boost_preds_val) + \
                     (naive_bayes_weight * naive_bayes_preds_val)

In [16]:
# Define a threshold to convert continuous predictions to binary labels
threshold = 0.5  # Adjust the threshold as needed

# Convert continuous predictions to binary labels based on the threshold
binary_preds = (weighted_avg_preds > threshold).astype(int)

# Evaluate performance of ensemble model using appropriate evaluation metric
# For regression tasks, you can use mean squared error (MSE) or mean absolute error (MAE)
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, binary_preds)
print("Mean Squared Error (MSE) of Ensemble Model:", mse)

# Calculate mean absolute error (MAE)
mae = mean_absolute_error(y_test, binary_preds)
print("Mean Absolute Error (MAE) of Ensemble Model:", mae)

Mean Squared Error (MSE) of Ensemble Model: 0.15263496143958868
Mean Absolute Error (MAE) of Ensemble Model: 0.15263496143958868


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

# Evaluate precision, recall, F1 score, and support
print("Classification Report:")
print(classification_report(y_test, binary_preds))

# Compute confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, binary_preds))

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, weighted_avg_preds)
auc = roc_auc_score(y_test, weighted_avg_preds)
print("AUC:", auc)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      4221
           1       0.76      0.76      0.76      2003

    accuracy                           0.85      6224
   macro avg       0.83      0.83      0.83      6224
weighted avg       0.85      0.85      0.85      6224

Confusion Matrix:
[[3747  474]
 [ 476 1527]]
AUC: 0.8393764482392735
