In [1]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used further down: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably what word_tokenize relies
# on (TODO confirm for the installed NLTK version).
nltk.download('punkt')
nltk.download('stopwords')

# Shared spaCy pipeline; lemmatize_tokens reads this module-level `nlp`.
nlp = spacy.load("en_core_web_trf")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Filipino function words filtered out alongside NLTK's English stopword list.
# Each word appears exactly once (the original list repeated "na" and "sana");
# remove_stopwords converts this list to a set, so the filtering is unchanged.
filipino_stopwords = [
    "a", "ako", "ang", "ano", "at", "ay", "ibang", "ito", "iyon", "ka",
    "kami", "kanila", "kanya", "kayo", "laki", "mga", "na", "ng", "ni",
    "nito", "nang", "sa", "sila", "tayo", "walang", "yung", "si", "bawat",
    "kung", "hindi", "para", "dahil", "doon", "baka", "kapag", "saan",
    "sino", "siya", "tama", "yan", "o", "pala", "pero", "wala", "huwag",
    "muna", "naman", "pag", "sana", "tulad", "upang", "bago",
    "dati", "iba", "madami", "nakita", "pagkatapos", "pati", "sabi"
]

In [3]:
def load_dataset(file_path):
    """Read a CSV file into a DataFrame, reporting failures on stdout.

    Args:
        file_path: Path handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. No exception
        derived from ``Exception`` propagates; a message is printed instead.
    """
    # Checked in order; the first matching exception type picks the message.
    known_failures = (
        (FileNotFoundError, "File not found. Please check the file path."),
        (pd.errors.EmptyDataError, "File is empty. Please check the file content."),
        (pd.errors.ParserError, "Error parsing file. Please check the file format."),
    )
    try:
        frame = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        return frame
    except Exception as err:
        for failure_type, message in known_failures:
            if isinstance(err, failure_type):
                print(message)
                break
        else:
            # Anything not listed above gets the generic report.
            print(f"An error occurred: {err}")
    return None

In [4]:
file_path = '../data/sample_dataset.csv'
data = load_dataset(file_path)

# load_dataset prints a message and returns None when the read fails; stop
# here with an explicit error instead of failing on the subscript below.
if data is None:
    raise RuntimeError(f"Could not load dataset from {file_path}")

# Plain-list snapshot of the sentences as loaded, taken before the later
# cells rewrite data['sentence'] in place.
sentences = data['sentence'].tolist()

Dataset loaded successfully!


In [5]:
def format_list_as_string(token_list):
    """Render a list for display with double-quoted string items.

    The previous implementation replaced every apostrophe in ``str(list)``,
    which also rewrote apostrophes *inside* tokens (``don't`` -> ``don"t``).
    Only the delimiting quotes are double quotes now; token text is kept.

    Args:
        token_list: Usually a list of string tokens. Nested lists are
            rendered recursively.

    Returns:
        A string such as ``["a", "b"]``. Non-list inputs and non-string,
        non-list items keep the original ``str(...).replace`` rendering.
    """
    import json  # local import, matching print_table's function-scope imports

    def _render(item):
        if isinstance(item, str):
            # json.dumps wraps in double quotes and escapes embedded ones.
            return json.dumps(item, ensure_ascii=False)
        if isinstance(item, list):
            return "[" + ", ".join(_render(element) for element in item) + "]"
        return repr(item).replace("'", '"')

    if not isinstance(token_list, list):
        return str(token_list).replace("'", '"')
    return _render(token_list)

In [6]:
def print_table(data, title="Table", num_samples=5):
    """Print the first rows of a DataFrame as a rich table.

    Args:
        data: DataFrame to preview; every column becomes a table column.
        title: Caption passed to the rich ``Table``.
        num_samples: Number of leading rows to show (``data.head``).

    List-valued cells go through ``format_list_as_string``; every cell is
    converted with ``str`` before being added to the table.
    """
    from rich.console import Console
    from rich.table import Table

    column_names = list(data.columns)

    table = Table(title=title)
    for name in column_names:
        table.add_column(name)

    for _, record in data.head(num_samples).iterrows():
        cells = []
        for name in column_names:
            value = record[name]
            if isinstance(value, list):
                value = format_list_as_string(value)
            cells.append(str(value))
        table.add_row(*cells)

    Console().print(table)

# Preview the frame as loaded (print_table shows the first 5 rows by default).
print_table(data, title="Original Data")

In [7]:
def convert_to_lowercase(data, file_path='../data/lowercased_data.csv'):
    """Lowercase the 'sentence' column in place and save the frame as CSV.

    Args:
        data: DataFrame to modify; its 'sentence' column is overwritten.
        file_path: Destination CSV path (written without the index).

    Returns:
        None. When 'sentence' is missing, a message is printed and nothing
        is modified or written.
    """
    if 'sentence' not in data.columns:
        print("Column 'sentence' not found in the DataFrame.")
        return

    lowered = data['sentence'].str.lower()
    data['sentence'] = lowered

    data.to_csv(file_path, index=False)

    print("Sentence has been converted to lowercase and saved to:", file_path)

# Lowercase data['sentence'] in place (also writes the default CSV path
# '../data/lowercased_data.csv'), then preview the result.
convert_to_lowercase(data)
print_table(data, title="Data After Lowercase Conversion")

Sentence has been converted to lowercase and saved to: ../data/lowercased_data.csv


In [8]:
def remove_punctuation(data):
    """Strip ASCII punctuation from the 'sentence' column, in place.

    Every character in ``string.punctuation`` is deleted from each sentence.
    Missing values are passed through unchanged, matching how the ``.str``
    based lowercase and number-removal steps treat them (the previous
    per-row ``x.translate`` raised ``AttributeError`` on NaN/None).

    Args:
        data: DataFrame to modify; its 'sentence' column is overwritten.

    Returns:
        None. A status message is printed in either branch.
    """
    if 'sentence' in data.columns:
        # Build the deletion table once rather than once per row.
        punctuation_table = str.maketrans('', '', string.punctuation)
        data['sentence'] = data['sentence'].str.translate(punctuation_table)
        print("Punctuation has been removed.")
    else:
        print("Column 'sentence' not found in the DataFrame.")

# Delete punctuation from data['sentence'] in place, then preview the result.
remove_punctuation(data)
print_table(data, title="Data After Punctuation Removal")



Punctuation has been removed.


In [9]:
def remove_numbers(data):
    """Delete every run of digits from the 'sentence' column, in place.

    Digits are removed, not replaced by a space, so the whitespace around a
    removed number is left as it was.

    Args:
        data: DataFrame to modify; its 'sentence' column is overwritten.

    Returns:
        None. A status message is printed in either branch.
    """
    if 'sentence' in data.columns:
        data['sentence'] = data['sentence'].str.replace(r'\d+', '', regex=True)
        print("Numbers have been removed.")
    else:
        # The stray trailing backslash that used to end this line was removed:
        # it only parsed because the following line happened to be blank.
        print("Column 'sentence' not found in the DataFrame.")

# Strip digit runs from data['sentence'] in place, then preview the result.
remove_numbers(data)
print_table(data, title="Data After Numbers Removal")


Numbers have been removed.


In [10]:
def tokenize_sentences(data):
    """Replace each entry of the 'sentence' column with its tokens, in place.

    Each cell is passed to NLTK's ``word_tokenize`` and overwritten with the
    result.

    Args:
        data: DataFrame to modify; its 'sentence' column is overwritten.

    Returns:
        None. A status message is printed in either branch.
    """
    if 'sentence' not in data.columns:
        print("Column 'sentence' not found in the DataFrame.")
        return

    def split_into_tokens(text):
        return word_tokenize(text)

    data['sentence'] = data['sentence'].apply(split_into_tokens)
    print("Sentences have been tokenized.")

# Overwrite each data['sentence'] cell with word_tokenize's output, then preview.
# NOTE(review): the column no longer holds plain strings after this call, so
# re-running this cell without reloading `data` feeds tokenized cells back
# into word_tokenize.
tokenize_sentences(data)
print_table(data, title="Data After Tokenization")

Sentences have been tokenized.


In [11]:
def remove_stopwords(data):
    """Drop English and Filipino stopwords from each token list, in place.

    The stopword set is NLTK's English list combined with the module-level
    ``filipino_stopwords``. Tokens are compared in lowercase; surviving
    tokens keep their original casing.

    Args:
        data: DataFrame whose 'sentence' cells are iterables of string tokens.

    Returns:
        None. A status message is printed in either branch.
    """
    if 'sentence' not in data.columns:
        print("Column 'sentence' not found in the DataFrame.")
        return

    blocked_words = set(stopwords.words('english')) | set(filipino_stopwords)

    def keep_content_words(tokens):
        return [token for token in tokens if token.lower() not in blocked_words]

    data['sentence'] = data['sentence'].apply(keep_content_words)
    print("Stopwords have been removed.")

# Filter English + Filipino stopwords out of each token list in place, then preview.
remove_stopwords(data)
print_table(data, title="Data After Stopwords Removal")

Stopwords have been removed.


In [12]:
def lemmatize_tokens(data):
    """Replace every token in the 'sentence' column with its lemma, in place.

    Each token is run through the module-level spaCy pipeline ``nlp`` on its
    own, and the lemma of the first resulting spaCy token is kept. When the
    pipeline yields an empty doc (for example for an empty string), the
    original token is kept instead of raising ``IndexError``.

    Args:
        data: DataFrame whose 'sentence' cells are iterables of string tokens.

    Returns:
        None. A status message is printed in either branch.
    """
    def _first_lemma(token):
        doc = nlp(token)
        # Guard the [0] lookup: an empty doc has no first token to read.
        return doc[0].lemma_ if len(doc) else token

    if 'sentence' in data.columns:
        # NOTE: the pipeline is invoked once per token, as before.
        data['sentence'] = data['sentence'].apply(
            lambda tokens: [_first_lemma(token) for token in tokens]
        )
        print("Tokens have been lemmatized.")
    else:
        print("Column 'sentence' not found in the DataFrame.")

# Lemmatize every token of data['sentence'] in place via the shared `nlp`
# pipeline, then preview the result.
lemmatize_tokens(data)
print_table(data, title="Data After Lemmatization")

Tokens have been lemmatized.


In [13]:
def join_tokens(data):
    """Join each token list in the 'sentence' column into one string, in place.

    Tokens are joined with a single space.

    Args:
        data: DataFrame whose 'sentence' cells are iterables of string tokens.

    Returns:
        None. A status message is printed in either branch.
    """
    if 'sentence' not in data.columns:
        print("Column 'sentence' not found in the DataFrame.")
        return

    def glue(tokens):
        return ' '.join(tokens)

    data['sentence'] = data['sentence'].apply(glue)
    print("Tokens have been joined back into sentences.")

# Collapse each token list back into a space-separated string in place, then preview.
join_tokens(data)
print_table(data, title="Data After Joining Tokens")

Tokens have been joined back into sentences.


In [14]:
def vectorize_with_tfidf(data):
    """Fit a default ``TfidfVectorizer`` on the 'sentence' column.

    Args:
        data: DataFrame whose 'sentence' column holds the documents to fit.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the ``fit_transform`` result
        and the fitted vectorizer. Returns ``None`` (after printing a
        message) when the 'sentence' column is missing.
    """
    if 'sentence' not in data.columns:
        print("Column 'sentence' not found in the DataFrame.")
        return None

    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(data['sentence'])

    print("TF-IDF Vectorization complete.")
    return weights, tfidf

# Fit TF-IDF on the cleaned sentences and unpack (matrix, fitted vectorizer).
# NOTE(review): vectorize_with_tfidf returns None when 'sentence' is missing,
# in which case this tuple unpacking raises.
tfidf_matrix, vectorizer = vectorize_with_tfidf(data)
feature_names = vectorizer.get_feature_names_out()
# Densify the matrix into a frame with one column per feature name.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
# Attach the labels positionally (.values drops the index).
# NOTE(review): a feature that is itself named 'emotion' would be overwritten
# by this assignment — confirm that cannot happen for this corpus.
tfidf_df['emotion'] = data['emotion'].values

print(tfidf_df.head())

TF-IDF Vectorization complete.
   already  balak  complete  finish        go  homework    intend     kang  \
0  0.00000    0.0   0.00000     0.5  0.000000   0.00000  0.000000  0.00000   
1  0.00000    0.0   0.00000     0.0  0.000000   0.00000  0.000000  0.57735   
2  0.00000    0.0   0.00000     0.0  0.447214   0.00000  0.447214  0.00000   
3  0.57735    0.0   0.57735     0.0  0.000000   0.57735  0.000000  0.00000   
4  0.00000    0.0   0.00000     0.0  0.000000   0.00000  0.000000  0.00000   

   kanyang   ko  ...  puntahan  susunod  syang  takdangaralin  tapo  time  \
0      0.0  0.0  ...       0.0      0.0    0.0            0.0   0.0   0.5   
1      0.0  0.0  ...       0.0      0.0    0.0            0.0   0.0   0.0   
2      0.0  0.0  ...       0.0      0.0    0.0            0.0   0.0   0.0   
3      0.0  0.0  ...       0.0      0.0    0.0            0.0   0.0   0.0   
4      0.0  0.5  ...       0.5      0.0    0.5            0.0   0.0   0.0   

    umalis  vacation      week  emoti