In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_data = pd.read_csv("/kaggle/input/ml-olympiad-toxic-language-ptbr-detection/train (2).csv")

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16800 non-null  object
 1   label   16800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 262.6+ KB


In [5]:
train_data.head()

Unnamed: 0,text,label
0,"rt @user olha quem chegouuuuu, nossos queridin...",0
1,veio umas teorias muito loucas na minha cabeça...,1
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0
3,rt @user quer ser filha da puta logo comigo qu...,1
4,vai besta 😂😂😂😂 casquei com a ultima foto,1


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [7]:
def clean_text(text):
    # Remove non-alphanumeric characters including emojis
    text = re.sub('[^A-Za-zÀ-ÖØ-öø-ÿ]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text, language='portuguese')
    # Remove stopwords
    stop_words = set(stopwords.words('portuguese'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = SnowballStemmer('portuguese')
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

train_data['cleaned_text'] = train_data['text'].apply(clean_text)

In [8]:
train_data.head()

Unnamed: 0,text,label,cleaned_text
0,"rt @user olha quem chegouuuuu, nossos queridin...",0,rt user olha chegouuuuu queridinh vem direçã f...
1,veio umas teorias muito loucas na minha cabeça...,1,vei umas teor louc cabec agor pqp to assust
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0,user user nao fal ontem ia patrocin nad pud vi...
3,rt @user quer ser filha da puta logo comigo qu...,1,rt user quer filh put log comig x pior kkkkkkk...
4,vai besta 😂😂😂😂 casquei com a ultima foto,1,vai best casqu ultim fot


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_data['cleaned_text'], train_data['label'], test_size=0.2, random_state=42)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# def train_and_evaluate(X_train, X_test, y_train, y_test):
#     algorithms = {
#         "Logistic Regression": LogisticRegression(),
#         "Random Forest": RandomForestClassifier(),
#         "Support Vector Machine": SVC(),
#         "XGBoost": XGBClassifier()
#     }

#     for name, model in algorithms.items():
#         print("Training", name)
#         model.fit(X_train, y_train)
#         predictions = model.predict(X_test)
#         accuracy = accuracy_score(y_test, predictions)
#         report = classification_report(y_test, predictions)
#         print("Accuracy:", accuracy)
#         print("Classification Report:")
#         print(report)
#         print("-" * 50)

# train_and_evaluate(X_train_tfidf, X_test_tfidf, y_train, y_test)

In [13]:
from sklearn.ensemble import StackingClassifier
import lightgbm as lgbm

model = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42 , max_depth=8)),
        ('svc', SVC(probability=True, random_state=42)),
        ('lgbm' , lgbm.LGBMClassifier(n_estimators=100, max_depth=8 , random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)
model.fit(X_train_tfidf, y_train)

[LightGBM] [Info] Number of positive: 5896, number of negative: 7544
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23615
[LightGBM] [Info] Number of data points in the train set: 13440, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438690 -> initscore=-0.246478
[LightGBM] [Info] Start training from score -0.246478
[LightGBM] [Info] Number of positive: 4717, number of negative: 6035
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19628
[LightGBM] [Info] Number of data points in the train set: 10752, number of used features: 500
[LightGBM] [Info] 

In [14]:
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.731845238095238
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76      1881
           1       0.70      0.68      0.69      1479

    accuracy                           0.73      3360
   macro avg       0.73      0.73      0.73      3360
weighted avg       0.73      0.73      0.73      3360



In [15]:
test_data = pd.read_csv("/kaggle/input/ml-olympiad-toxic-language-ptbr-detection/test (4).csv")

In [16]:
test_data.head()

Unnamed: 0,id,text
0,0,@user nossa mano te odeio na moral kkkkkkkkkkk...
1,1,@user @user a edição ficou muito boa! kkkkkkkk...
2,2,"@user largada vá, visto que a vaca anda à solta 😂"
3,3,"poxa, eu queria ganhar um boné. alguém me da u..."
4,4,"@user amiga, tudo bem, não precisa se desculpa..."


In [17]:
def clean_text(text):
   
    text = re.sub('[^A-Za-zÀ-ÖØ-öø-ÿ]', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text, language='portuguese')
    stop_words = set(stopwords.words('portuguese'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = SnowballStemmer('portuguese')
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

test_data['cleaned_text'] = test_data['text'].apply(clean_text)

In [18]:
test_data.head()

Unnamed: 0,id,text,cleaned_text
0,0,@user nossa mano te odeio na moral kkkkkkkkkkk...,user man odei moral kkkkkkkkkkkkkkkkkkk
1,1,@user @user a edição ficou muito boa! kkkkkkkk...,user user ediçã fic boa kkkkkkkkkkk falt luk tim
2,2,"@user largada vá, visto que a vaca anda à solta 😂",user larg vá vist vac anda solt
3,3,"poxa, eu queria ganhar um boné. alguém me da u...",pox quer ganh bon alguém bon
4,4,"@user amiga, tudo bem, não precisa se desculpa...",user amig tud bem precis desculp escrot q ente...


In [19]:
test = tfidf_vectorizer.transform(test_data['cleaned_text'])

In [20]:
test_predictions = model.predict(test)



In [21]:
# Assuming 'id' column contains the unique identifiers for each sample in the test data
test_ids = test_data['id']

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_ids, 'label': test_predictions})
submission_df.to_csv('submission.csv', index=False)