In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tanishqdublish/text-classification-documentation")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'text-classification-documentation' dataset.
Path to dataset files: /kaggle/input/text-classification-documentation


In [2]:
import os
print(os.listdir(path))

['df_file.csv']


In [3]:
import pandas as pd

df = pd.read_csv(path + "/df_file.csv")
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [4]:
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,511
4,510
0,417
2,401
3,386


In [5]:
print(df.shape)

(2225, 2)


In [6]:
df.dropna(inplace=True)
df.shape

(2225, 2)

# Pré-processamento de dados

In [7]:
import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
text = re.sub(r'[^\w\s\']',' ', text)
text = re.sub(' +', ' ', text)
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [41]:
!pip install nltk



In [42]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [43]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

In [44]:
def preprocess(text):
    # remove caracteres especiais
    text = re.sub(r'[^\w\s]', ' ', text)

    # lowercase
    text = text.lower()

    # remove espaços extras
    text = re.sub(r'\s+', ' ', text).strip()

    # remove stop words
    tokens = [word for word in text.split() if word not in stop_words]

    return " ".join(tokens)

In [45]:
df['Text'] = df['Text'].map(preprocess)
df.head()

Unnamed: 0,Text,Label
0,budget set scene election gordon brown seek pu...,0
1,army chiefs regiments decision military chiefs...,0
2,howard denies split id cards michael howard de...,0
3,observers monitor uk election ministers invite...,0
4,kilroy names election seat target ex chat show...,0


In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Label, test_size=0.2)

In [11]:
!pip install fasttext



In [47]:
def save_fasttext_format(texts, labels, filename):
    with open(filename, "w", encoding="utf-8") as f:
        for text, label in zip(texts, labels):
            f.write(f"__label__{label} {text}\n")

In [48]:
save_fasttext_format(X_train, y_train, "train.txt")
save_fasttext_format(X_test, y_test, "test.txt")


In [49]:
import fasttext

model = fasttext.train_supervised(
    input="train.txt",
    epoch=25,
    lr=0.3,
    wordNgrams=2,
    dim=200,
    minCount=2
)


In [50]:
result = model.test("test.txt")
print(f"Samples: {result[0]}")
print(f"Precision: {result[1]}")
print(f"Recall: {result[2]}")


Samples: 445
Precision: 0.9707865168539326
Recall: 0.9707865168539326


In [52]:
model.predict("the team secured a victory after scoring a goal in the final minutes", k=3)


(('__label__1', '__label__3', '__label__4'),
 array([1.00000322e+00, 1.68015158e-05, 1.00543666e-05]))

In [53]:
model.predict("parliament voted today on a controversial bill proposed by the prime minister", k=3)

(('__label__0', '__label__4', '__label__2'),
 array([9.98378336e-01, 1.45251304e-03, 1.03891551e-04]))