# Preprocessing of Dataset

### Installing Dependencies

In [1]:
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%pip install pickle

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import the dependencies

Import pandas for manipulating the dataset

In [3]:
import pandas as pd

## Read the csv datas

In [4]:
vocab_raw_data = pd.read_csv("dataset/words_rows.csv")

In [5]:
phrases_raw_data = pd.read_csv("dataset/phrases_rows.csv")

#### Removes other unnecesary columns

In [6]:
vocab = vocab_raw_data.drop(["id","created_at", "deleted_at","updated_at", "word_contribution_id","definition","examples","part_of_speech"], axis=1)

In [7]:
phrases = phrases_raw_data.drop(["id","created_at", "deleted_at","updated_at", "phrase_contribution_id"], axis=1)

#### Making the word 1 to 1 in the column

This section should be executed only once.

In [8]:
vocab["swardspeak_words"][0].replace("[", "").replace("]", "").replace('"', "").rsplit(",")

['Fez', 'Peslock', 'Pesang']

In [9]:
vocab["swardspeak_words"]=vocab["swardspeak_words"].apply(lambda row: row.replace("[", "").replace("]", "").replace('"', "").split(","))

In [10]:
vocab["translated_words"]=vocab["translated_words"].apply(lambda row: row.replace("[", "").replace("]", "").replace('"', "").split(","))

In [11]:
vocab.head(3)

Unnamed: 0,swardspeak_words,translated_words
0,"[Fez, Peslock, Pesang]",[Mukha]
1,[Bongga],[Enggrande]
2,"[Bugarou, Yosi]",[Sigarilyo]


In [12]:
phrases.head(3)

Unnamed: 0,swardspeak_phrase,translated_phrase
0,Holabels! May chova akis!,Hello! May chismis ako!
1,Hindi kami maka-gorabels ditey! Jumujulanis Mo...,Hindi kami makaalis dito! Umuulan nang malakas.
2,Nalotlot ang mga bagets dahil sila ay mga ate ...,Natalo ang mga bata dahil sila ay mga pangit a...


### Exploding the listed words

In [13]:
vocab = vocab.explode("swardspeak_words")

In [14]:
vocab = vocab.explode("translated_words")

In [15]:
vocab.head(10)

Unnamed: 0,swardspeak_words,translated_words
0,Fez,Mukha
0,Peslock,Mukha
0,Pesang,Mukha
1,Bongga,Enggrande
2,Bugarou,Sigarilyo
2,Yosi,Sigarilyo
3,Check,Okay
4,Chika,Tsismis
4,Chismaks,Tsismis
4,Chova,Tsismis


## Combining the words and phrases in one DataFrame

In [16]:
vocab.rename(columns = {'swardspeak_words':'input', 'translated_words':'output'}, inplace=True)

In [17]:
phrases.rename(columns= {'swardspeak_phrase':'input', 'translated_phrase':'output'}, inplace=True)

In [18]:
vocab.head()

Unnamed: 0,input,output
0,Fez,Mukha
0,Peslock,Mukha
0,Pesang,Mukha
1,Bongga,Enggrande
2,Bugarou,Sigarilyo


In [19]:
phrases.tail()

Unnamed: 0,input,output
90,Pagkatapos nya maglangoy ay Miss Nigeria na sya,Pagkatapos nya maglangoy ay negra na sya
91,"Oprah Winfrey, wiz na aketch magmomorayta","Pangako, hindi na ako magmumura"
92,"Quality Control ka naman, Pranella ka lang!","Maganda ka naman, praning ka lang!"
93,Nag-crayola si Shokla nung naJulie Yap syang n...,Umiyak si bakla nung nahuli syang nagloloko
94,"Alicia Mayer na tayo, andyan na ang Drivam ng ...","Aalis na tayo, andyan na ang driver ng ating s..."


In [20]:
combined = pd.concat([vocab, phrases])

In [21]:
combined.reset_index(inplace=True, drop=True)

In [22]:
combined.head()

Unnamed: 0,input,output
0,Fez,Mukha
1,Peslock,Mukha
2,Pesang,Mukha
3,Bongga,Enggrande
4,Bugarou,Sigarilyo


In [23]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   652 non-null    object
 1   output  652 non-null    object
dtypes: object(2)
memory usage: 10.3+ KB


In [24]:
combined.sample(frac=1)

Unnamed: 0,input,output
593,uso ang Ismelanie Marques na putok ngayong pan...,Uso ang amoy putok ngayong panahon ng tag-init
261,Ate Chona,Maattitude
59,Jotabelz,Bata
400,Awra,Umani ng atensyon
610,Ang lakas ng Julanis,Ang lakas ng ulan
...,...,...
105,Tomo,Tama
152,Wafu,Gwapo
492,Chikadora,Tsismosa
548,Gora,Alis


### Adding column for instruction

In [25]:
combined['instruction'] = "Translate the following words or phrases from Swardspeak to Tagalog:"

In [26]:
combined.head(2)

Unnamed: 0,input,output,instruction
0,Fez,Mukha,Translate the following words or phrases from ...
1,Peslock,Mukha,Translate the following words or phrases from ...


In [27]:
combined_lowercased = combined.copy()

combined_lowercased['input'] = combined_lowercased['input'].str.lower()
combined_lowercased['output'] = combined_lowercased['output'].str.lower()


In [28]:
combined_reversed = combined.copy()

In [29]:
combined_reversed['input'], combined_reversed['output'] = combined_reversed['output'], combined_reversed['input']

In [30]:
combined_reversed_lowercased = combined_reversed.copy()

combined_reversed_lowercased['input'] = combined_reversed_lowercased['input'].str.lower()
combined_reversed_lowercased['output'] = combined_reversed_lowercased['output'].str.lower()

In [31]:
combined_reversed['instruction'] = "Translate the following words or phrases from Tagalog to Swardspeak:"

In [32]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input        652 non-null    object
 1   output       652 non-null    object
 2   instruction  652 non-null    object
dtypes: object(3)
memory usage: 15.4+ KB


In [33]:
combined_reversed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input        652 non-null    object
 1   output       652 non-null    object
 2   instruction  652 non-null    object
dtypes: object(3)
memory usage: 15.4+ KB


In [34]:
dataset = pd.concat([combined, combined_lowercased, combined_reversed,  combined_reversed_lowercased])

In [35]:
dataset.reset_index(inplace=True, drop=True)

In [36]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2608 entries, 0 to 2607
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input        2608 non-null   object
 1   output       2608 non-null   object
 2   instruction  2608 non-null   object
dtypes: object(3)
memory usage: 61.3+ KB


In [45]:
dataset = dataset.sample(frac=1)

In [47]:
dataset.reset_index(inplace=True, drop=True)

In [48]:
dataset.tail()

Unnamed: 0,input,output,instruction
2603,buntis,jontis,Translate the following words or phrases from ...
2604,chikadora,tsismosa,Translate the following words or phrases from ...
2605,Splok,Magsalita,Translate the following words or phrases from ...
2606,Kaloka,Kalurkey,Translate the following words or phrases from ...
2607,insecure,insekyora,Translate the following words or phrases from ...


In [None]:
dataset.to_json("dataset/swardspeak_tagalog_dataset.json", orient="records")

## Splitting the Dataset

In [49]:
from sklearn.model_selection import train_test_split

In [52]:
train, test = train_test_split(dataset, test_size=0.1)
_, validation = train_test_split(train, test_size=0.1)

In [53]:
train.to_json("dataset/train.json", orient="records")
test.to_json("dataset/test.json", orient="records")
validation.to_json("dataset/validation.json", orient="records")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


def create_dataset():
    vocab_raw_data = pd.read_csv("dataset/words_rows.csv")
    phrases_raw_data = pd.read_csv("dataset/phrases_rows.csv")

    # Dropping unnecessary columns
    vocab = vocab_raw_data.drop(["id","created_at", "deleted_at","updated_at", "word_contribution_id","definition","examples","part_of_speech"], axis=1)
    phrases = phrases_raw_data.drop(["id","created_at", "deleted_at","updated_at", "phrase_contribution_id"], axis=1)

    # Splitting the words and phrases
    vocab["swardspeak_words"]=vocab["swardspeak_words"].apply(lambda row: row.replace("[", "").replace("]", "").replace('"', "").split(","))
    vocab["translated_words"]=vocab["translated_words"].apply(lambda row: row.replace("[", "").replace("]", "").replace('"', "").split(","))

    # Exploding the words and phrases
    vocab = vocab.explode("swardspeak_words")
    vocab = vocab.explode("translated_words")

    # Renaming the columns
    vocab.rename(columns = {'swardspeak_words':'input', 'translated_words':'output'}, inplace=True)
    phrases.rename(columns= {'swardspeak_phrase':'input', 'translated_phrase':'output'}, inplace=True)

    # Combining the vocab and phrases
    combined = pd.concat([vocab, phrases])
    # combined.reset_index(inplace=True, drop=True)

    # Adding instruction column
    combined['instruction'] = "Translate the following words or phrases from Swardspeak to Tagalog:"
    # Creating a lowercased version of the dataset
    combined_lowercased = combined.copy()
    combined_lowercased['input'] = combined_lowercased['input'].str.lower()
    combined_lowercased['output'] = combined_lowercased['output'].str.lower()
    
    # Then reversing the input and output columns
    # For the reversed dataset
    combined_reversed = combined.copy()
    combined_reversed['input'], combined_reversed['output'] = combined_reversed['output'], combined_reversed['input']
    
    # Creating a lowercased version of the reversed dataset
    combined_reversed_lowercased = combined_reversed.copy()
    combined_reversed_lowercased['input'] = combined_reversed_lowercased['input'].str.lower()
    combined_reversed_lowercased['output'] = combined_reversed_lowercased['output'].str.lower()
    # Finally, adding the instruction column for the reversed dataset
    combined_reversed['instruction'] = "Translate the following words or phrases from Tagalog to Swardspeak:"
    # Combining all the datasets
    dataset = pd.concat([combined, combined_lowercased, combined_reversed,  combined_reversed_lowercased])
    # dataset.reset_index(inplace=True, drop=True)

    dataset = dataset.sample(frac=1)
    dataset.reset_index(inplace=True, drop=True)

    # Splitting the dataset into train, test, and validation sets
    train, test = train_test_split(dataset, test_size=0.1)
    _, validation = train_test_split(train, test_size=0.1)

    return train, test, validation