In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Location of the training CSV inside the Kaggle input directory.
train_csv_path = "/kaggle/input/ml-olympiad-toxic-language-ptbr-detection/train (2).csv"
# Raw training frame; later cells add a `cleaned_text` column to it.
train_data = pd.read_csv(train_csv_path)

In [4]:
# Print the frame summary (row count, columns, non-null counts, dtypes).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16800 non-null  object
 1   label   16800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 262.6+ KB


In [5]:
# Preview the first five raw rows (text and label).
train_data.head(n=5)

Unnamed: 0,text,label
0,"rt @user olha quem chegouuuuu, nossos queridin...",0
1,veio umas teorias muito loucas na minha cabeça...,1
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0
3,rt @user quer ser filha da puta logo comigo qu...,1
4,vai besta 😂😂😂😂 casquei com a ultima foto,1


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [7]:
# Shared, lazily filled cache for the Portuguese stop-word set and stemmer.
# Building them inside clean_text repeated identical work for every row.
_PT_RESOURCES = {}


def _portuguese_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PT_RESOURCES:
        _PT_RESOURCES['stop_words'] = frozenset(stopwords.words('portuguese'))
        _PT_RESOURCES['stemmer'] = SnowballStemmer('portuguese')
    return _PT_RESOURCES['stop_words'], _PT_RESOURCES['stemmer']


def clean_text(text):
    """Normalise one Portuguese text into a space-joined string of stems.

    Steps, in order:
      1. Replace every character that is not an ASCII or Latin-1 accented
         letter with a space (this drops digits, punctuation, '@' and emojis).
      2. Lowercase.
      3. Tokenise with NLTK's Portuguese word tokenizer.
      4. Drop Portuguese stop words.
      5. Stem each remaining token with the Portuguese Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN from a missing cell) is
        treated as empty instead of raising inside ``re.sub``.

    Returns
    -------
    str
        The stems joined by single spaces; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    # Keep letters only, then lowercase.
    text = re.sub('[^A-Za-zÀ-ÖØ-öø-ÿ]', ' ', text).lower()
    if not text.strip():
        # No letters left: the result is empty, so skip tokenisation entirely.
        return ''

    stop_words, stemmer = _portuguese_resources()
    tokens = word_tokenize(text, language='portuguese')
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Run the cleaning function over every row of the raw text column and store
# the result next to it.
train_data['cleaned_text'] = train_data['text'].map(clean_text)

In [8]:
# Preview the first five rows again, now including the cleaned text column.
train_data.head(n=5)

Unnamed: 0,text,label,cleaned_text
0,"rt @user olha quem chegouuuuu, nossos queridin...",0,rt user olha chegouuuuu queridinh vem direçã f...
1,veio umas teorias muito loucas na minha cabeça...,1,vei umas teor louc cabec agor pqp to assust
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0,user user nao fal ontem ia patrocin nad pud vi...
3,rt @user quer ser filha da puta logo comigo qu...,1,rt user quer filh put log comig x pior kkkkkkk...
4,vai besta 😂😂😂😂 casquei com a ultima foto,1,vai best casqu ultim fot


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['cleaned_text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vectorizer's max_features set to 500.
tfidf_vectorizer = TfidfVectorizer(max_features=500)
# Fit on the training split only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ... and reuse the fitted vectorizer for the held-out split (no refit here).
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
def train_and_evaluate(X_train, X_test, y_train, y_test, algorithms=None, random_state=42):
    """Fit several classifiers and print their hold-out metrics.

    Each model is fitted on the training split, used to predict the test
    split, and reported via accuracy and a classification report.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Labels for the training and test splits.
    algorithms : dict, optional
        Mapping of display name to unfitted estimator. When omitted, the
        default set is used: Logistic Regression, Random Forest, Support
        Vector Machine and XGBoost.
    random_state : int or None, optional
        Seed handed to each default estimator so repeated runs print the
        same numbers. Ignored when ``algorithms`` is supplied.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if algorithms is None:
        algorithms = {
            "Logistic Regression": LogisticRegression(random_state=random_state),
            "Random Forest": RandomForestClassifier(random_state=random_state),
            "Support Vector Machine": SVC(random_state=random_state),
            "XGBoost": XGBClassifier(random_state=random_state),
        }

    for name, model in algorithms.items():
        print("Training", name)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions)
        print("Accuracy:", accuracy)
        print("Classification Report:")
        print(report)
        print("-" * 50)

# Compare the baseline classifiers on the TF-IDF features.
train_and_evaluate(
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
)

Training Logistic Regression
Accuracy: 0.7380952380952381
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.84      0.78      1881
           1       0.75      0.61      0.67      1479

    accuracy                           0.74      3360
   macro avg       0.74      0.72      0.73      3360
weighted avg       0.74      0.74      0.73      3360

--------------------------------------------------
Training Random Forest
Accuracy: 0.7357142857142858
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.79      0.77      1881
           1       0.71      0.67      0.69      1479

    accuracy                           0.74      3360
   macro avg       0.73      0.73      0.73      3360
weighted avg       0.73      0.74      0.73      3360

--------------------------------------------------
Training Support Vector Machine
Accuracy: 0.7446428571428572
Classification Report:
    

In [13]:
from sklearn.model_selection import cross_val_score
from skopt import BayesSearchCV

# Search space handed to BayesSearchCV for the XGBoost classifier.
# Each entry is a (low, high) tuple, optionally followed by a prior name.
params = dict(
    learning_rate=(0.01, 1.0, 'log-uniform'),
    min_child_weight=(0, 10),
    max_depth=(1, 50),
    gamma=(0.0, 1.0, 'uniform'),
    subsample=(0.1, 1.0, 'uniform'),
    colsample_bytree=(0.1, 1.0, 'uniform'),
    n_estimators=(50, 200),
)

# Base estimator for the search. Seeded so repeated runs are comparable,
# using the same seed as the train/test split.
xgb = XGBClassifier(random_state=42)

# Bayesian hyper-parameter search: 50 candidate settings, 3-fold CV, scored
# by accuracy, using all available cores. The search itself is seeded too,
# otherwise the sampled candidates differ between runs.
opt = BayesSearchCV(
    xgb,
    params,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)

opt.fit(X_train_tfidf, y_train)

print("Best hyperparameters found: ", opt.best_params_)

# best_estimator_ is only available because the search refits the winning
# configuration on the full training data (refit is left at its default),
# so fitting it a second time on the same data is unnecessary.
best_model = opt.best_estimator_

# Score the tuned model on the held-out split.
predictions = best_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Best hyperparameters found:  OrderedDict([('colsample_bytree', 1.0), ('gamma', 0.0), ('learning_rate', 1.0), ('max_depth', 1), ('min_child_weight', 0), ('n_estimators', 50), ('subsample', 1.0)])
Accuracy: 0.7425595238095238
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.81      0.78      1881
           1       0.73      0.66      0.69      1479

    accuracy                           0.74      3360
   macro avg       0.74      0.73      0.74      3360
weighted avg       0.74      0.74      0.74      3360

