In [27]:
# Load the dataset
import pandas as pd

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re

# Model selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb


# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


df = pd.read_csv('Downloads/Youtube01-Psy.csv')


In [18]:
# Clean the comments

def clean_comment(comment):
    # convert to lowercase
    comment = comment.lower()

    # remove urls
    comment = re.sub(r'http\S+', '', comment)

    # remove digits
    comment = re.sub(r'\d+', '', comment)

    # remove punctuation
    comment = re.sub(r'[^\w\s]', '', comment)

    # remove extra whitespace
    comment = re.sub(r'\s+', ' ', comment).strip()

    return comment

# clean the 'comment_text' column
df['CONTENT'] = df['CONTENT'].apply(clean_comment)

# print the first 5 cleaned comments
print(df['CONTENT'].head())


0    huh anyway check out this youtube channel koby...
1    hey guys check out my new channel and our firs...
2                just for test i have to say murdevcom
3         me shaking my sexy ass on my channel enjoy _
4                     watchvvtarggvgtwq check this out
Name: CONTENT, dtype: object


In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['CONTENT'], df['CLASS'], test_size=0.2, random_state=42)


In [20]:
# Feature extraction
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [21]:
# Label encoding
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


In [22]:
# Train different models
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)


LogisticRegression()

In [23]:
svm_model = SVC(kernel='linear', C=1.0, probability=True)
svm_model.fit(X_train, y_train)

SVC(kernel='linear', probability=True)

In [24]:
dt_model = DecisionTreeClassifier(max_depth=10)
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

In [25]:
# Evaluate the models on the testing set
lr_preds = lr_model.predict(X_test)
svm_preds = svm_model.predict(X_test)
dt_preds = dt_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, lr_preds)
lr_precision = precision_score(y_test, lr_preds)
lr_recall = recall_score(y_test, lr_preds)
lr_f1_score = f1_score(y_test, lr_preds)

svm_accuracy = accuracy_score(y_test, svm_preds)
svm_precision = precision_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_f1_score = f1_score(y_test, svm_preds)

dt_accuracy = accuracy_score(y_test, dt_preds)
dt_precision = precision_score(y_test, dt_preds)
dt_recall = recall_score(y_test, dt_preds)
dt_f1_score = f1_score(y_test, dt_preds)

print('Logistic Regression accuracy:', lr_accuracy)
print('Logistic Regression precision:', lr_precision)
print('Logistic Regression recall:', lr_recall)
print('Logistic Regression f1 score:', lr_f1_score)

print('SVM accuracy:', svm_accuracy)
print('SVM precision:', svm_precision)
print('SVM recall:', svm_recall)
print('SVM f1 score:', svm_f1_score)

print('Decision Tree accuracy:', dt_accuracy)
print('Decision Tree precision:', dt_precision)
print('Decision Tree recall:', dt_recall)
print('Decision Tree f1 score:', dt_f1_score)


Logistic Regression accuracy: 0.8857142857142857
Logistic Regression precision: 0.8723404255319149
Logistic Regression recall: 0.9534883720930233
Logistic Regression f1 score: 0.9111111111111112
SVM accuracy: 0.8714285714285714
SVM precision: 0.8695652173913043
SVM recall: 0.9302325581395349
SVM f1 score: 0.898876404494382
Decision Tree accuracy: 0.7142857142857143
Decision Tree precision: 0.9259259259259259
Decision Tree recall: 0.5813953488372093
Decision Tree f1 score: 0.7142857142857142


Overall, the results you got are pretty good. The Logistic Regression model seems to perform the best with an accuracy of 0.885 and an F1 score of 0.911, indicating that it is able to achieve a good balance between precision and recall.

In [29]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier(n_estimators=100)
gb_model.fit(X_train, y_train)

xgb_model = xgb.XGBClassifier(n_estimators=100)
xgb_model.fit(X_train, y_train)


rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)


rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1_score = f1_score(y_test, rf_preds)

gb_accuracy = accuracy_score(y_test, gb_preds)
gb_precision = precision_score(y_test, gb_preds)
gb_recall = recall_score(y_test, gb_preds)
gb_f1_score = f1_score(y_test, gb_preds)

xgb_accuracy = accuracy_score(y_test, xgb_preds)
xgb_precision = precision_score(y_test, xgb_preds)
xgb_recall = recall_score(y_test, xgb_preds)
xgb_f1_score = f1_score(y_test,xgb_preds)


print('Random Forest accuracy:', rf_accuracy)
print('Random Forest precision:', rf_precision)
print('Random Forest recall:', rf_recall)
print('Random Forest f1 score:', rf_f1_score)


print('Gradient Boosting accuracy:', gb_accuracy)
print('Gradient Boosting precision:', gb_precision)
print('Gradient Boosting recall:', gb_recall)
print('Gradient Boosting f1 score:', gb_f1_score)

print('XGB accuracy:', xgb_accuracy)
print('XGB precision:', xgb_precision)
print('XGB recall:', xgb_recall)
print('XGB f1 score:', xgb_f1_score)

Random Forest accuracy: 0.8857142857142857
Random Forest precision: 0.8723404255319149
Random Forest recall: 0.9534883720930233
Random Forest f1 score: 0.9111111111111112
Gradient Boosting accuracy: 0.8571428571428571
Gradient Boosting precision: 0.851063829787234
Gradient Boosting recall: 0.9302325581395349
Gradient Boosting f1 score: 0.888888888888889
XGB accuracy: 0.8714285714285714
XGB precision: 0.84
XGB recall: 0.9767441860465116
XGB f1 score: 0.9032258064516129


It seems like the Random Forest model is performing the best in terms of accuracy, precision, recall, and f1 score. However, the XGBoost model has the highest recall score, indicating that it correctly identifies most of the spam comments.