RF vs SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/ds3.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# SVM Classifier
svm_classifier = SVC(kernel='linear', C=0.9)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("SVM Accuracy:", svm_accuracy)

# You can also print classification reports for more detailed metrics
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))

Random Forest Accuracy: 0.9111111111111111
SVM Accuracy: 0.9555555555555556

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.84      0.89        19
           1       0.89      0.96      0.93        26

    accuracy                           0.91        45
   macro avg       0.92      0.90      0.91        45
weighted avg       0.91      0.91      0.91        45


SVM Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.96      0.96      0.96        26

    accuracy                           0.96        45
   macro avg       0.95      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



SVM vs RF

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/dataset1.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# SVM Classifier
svm_classifier = SVC(kernel='linear', C=0.9)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

# You can also print classification reports for more detailed metrics
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))

SVM Accuracy: 0.85
Random Forest Accuracy: 0.85

SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.84      1.00      0.91        16

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.84      1.00      0.91        16

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20



.............................................

RF vs DT

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/ds4.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_tfidf, y_train)
dt_predictions = decision_tree.predict(X_test_tfidf)

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Decision Tree Accuracy:", dt_accuracy)

# You can also print classification reports for more detailed metrics
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, dt_predictions))

Random Forest Accuracy: 0.9
Decision Tree Accuracy: 0.8333333333333334

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81        19
           1       0.87      1.00      0.93        41

    accuracy                           0.90        60
   macro avg       0.94      0.84      0.87        60
weighted avg       0.91      0.90      0.89        60


Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.79      0.75        19
           1       0.90      0.85      0.88        41

    accuracy                           0.83        60
   macro avg       0.81      0.82      0.81        60
weighted avg       0.84      0.83      0.84        60



SVM vs DT

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/dataset1.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# SVM Classifier
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_tfidf, y_train)
dt_predictions = decision_tree.predict(X_test_tfidf)

# Compare accuracies
svm_accuracy = accuracy_score(y_test, svm_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print("SVM Accuracy:", svm_accuracy)
print("Decision Tree Accuracy:", dt_accuracy)

# You can also print classification reports for more detailed metrics
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, dt_predictions))

SVM Accuracy: 0.85
Decision Tree Accuracy: 0.9

SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.84      1.00      0.91        16

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20


Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.89      1.00      0.94        16

    accuracy                           0.90        20
   macro avg       0.94      0.75      0.80        20
weighted avg       0.91      0.90      0.89        20



.....................................

RF vs K-M

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/ds3.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# K-Means Clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_tfidf)
km_predictions = kmeans.predict(X_test_tfidf)

# Convert cluster labels to binary labels for comparison
km_predictions_binary = [1 if label == 1 else 0 for label in km_predictions]

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
km_accuracy = accuracy_score(y_test, km_predictions_binary)

print("Random Forest Accuracy:", rf_accuracy)
print("K-Means Accuracy:", km_accuracy)

# You can also print classification reports for more detailed metrics
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nK-Means Classification Report:\n", classification_report(y_test, km_predictions_binary))

Random Forest Accuracy: 0.9111111111111111
K-Means Accuracy: 0.15555555555555556

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.84      0.89        19
           1       0.89      0.96      0.93        26

    accuracy                           0.91        45
   macro avg       0.92      0.90      0.91        45
weighted avg       0.91      0.91      0.91        45


K-Means Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.27      0.27      0.27        26

    accuracy                           0.16        45
   macro avg       0.13      0.13      0.13        45
weighted avg       0.16      0.16      0.16        45





...................................///.................................

RF vs XGboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/ds4.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# XGBoost Classifier
xgb_classifier = XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
xgb_classifier.fit(X_train_tfidf, y_train)
xgb_predictions = xgb_classifier.predict(X_test_tfidf)

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)

# You can also print classification reports for more detailed metrics
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nXGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))

Random Forest Accuracy: 0.9
XGBoost Accuracy: 0.8166666666666667

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81        19
           1       0.87      1.00      0.93        41

    accuracy                           0.90        60
   macro avg       0.94      0.84      0.87        60
weighted avg       0.91      0.90      0.89        60


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.63      0.69        19
           1       0.84      0.90      0.87        41

    accuracy                           0.82        60
   macro avg       0.80      0.77      0.78        60
weighted avg       0.81      0.82      0.81        60



SVM vs XGboost

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/dataset1.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# SVM Classifier
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# XGBoost Classifier
xgb_classifier = XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
xgb_classifier.fit(X_train_tfidf, y_train)
xgb_predictions = xgb_classifier.predict(X_test_tfidf)

# Compare accuracies
svm_accuracy = accuracy_score(y_test, svm_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

print("SVM Accuracy:", svm_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)

# You can also print classification reports for more detailed metrics
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))
print("\nXGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))

SVM Accuracy: 0.85
XGBoost Accuracy: 0.85

SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.84      1.00      0.91        16

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.84      1.00      0.91        16

    accuracy                           0.85        20
   macro avg       0.92      0.62      0.66        20
weighted avg       0.87      0.85      0.81        20



.......................................................

RF vs NB

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/ds4.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)
rf_predictions = random_forest.predict(X_test_tfidf)

# Naive Bayes Classifier (Multinomial Naive Bayes is commonly used for text classification)
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
nb_predictions = naive_bayes.predict(X_test_tfidf)

# Compare accuracies
rf_accuracy = accuracy_score(y_test, rf_predictions)
nb_accuracy = accuracy_score(y_test, nb_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Naive Bayes Accuracy:", nb_accuracy)

# You can also print classification reports for more detailed metrics
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

Random Forest Accuracy: 0.9
Naive Bayes Accuracy: 0.9

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81        19
           1       0.87      1.00      0.93        41

    accuracy                           0.90        60
   macro avg       0.94      0.84      0.87        60
weighted avg       0.91      0.90      0.89        60


Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81        19
           1       0.87      1.00      0.93        41

    accuracy                           0.90        60
   macro avg       0.94      0.84      0.87        60
weighted avg       0.91      0.90      0.89        60



SVM vs NB

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
file_path = '/content/dataset2.csv'
df = pd.read_csv(file_path)

# Assuming your DataFrame has columns 'comment_text' and 'toxic'
X = df['comment_text'].astype(str)
y = df['toxic']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# SVM Classifier
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Naive Bayes Classifier (Multinomial Naive Bayes is commonly used for text classification)
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
nb_predictions = naive_bayes.predict(X_test_tfidf)

# Compare accuracies
svm_accuracy = accuracy_score(y_test, svm_predictions)
nb_accuracy = accuracy_score(y_test, nb_predictions)

print("SVM Accuracy:", svm_accuracy)
print("Naive Bayes Accuracy:", nb_accuracy)

# You can also print classification reports for more detailed metrics
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

SVM Accuracy: 0.875
Naive Bayes Accuracy: 0.9

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.88      0.92      0.90        25

    accuracy                           0.88        40
   macro avg       0.87      0.86      0.86        40
weighted avg       0.87      0.88      0.87        40


Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.73      0.85        15
           1       0.86      1.00      0.93        25

    accuracy                           0.90        40
   macro avg       0.93      0.87      0.89        40
weighted avg       0.91      0.90      0.90        40



.....................................