# Latihan 1

### Importing the dependencies

In [1]:
import pandas as pd
import numpy as np
import re
import math
from collections import Counter
import tkinter as tk
import tkinter.font as font
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg


### Exploratory Data Analysis (EDA)

We'll load the data first

In [2]:
data = pd.read_csv('./datasets/spam.csv') # Loading the data
data.head() # Get a glimpse of the data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


There are 2 columns: `label` and `text`. The `label` column indicates how the `text` is classified.

We'll try to get some information on the data

In [3]:
data.columns

Index(['label', 'text'], dtype='object')

In [4]:
print(f'Number of rows: {len(data)}')

Number of rows: 5572


Let's see if the data contains any missing values

In [5]:
data.isna().sum()

label    0
text     0
dtype: int64

Now let's see the distinct values of the `label` column

In [6]:
data['label'].unique()

array(['ham', 'spam'], dtype=object)

Now let's check the count of each label

In [7]:
data['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

Now, let's tokenize the sentences

In [8]:
def word_tokenize(text: str):
    """Split ``text`` into case-folded word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); everything else acts as a separator and is discarded.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of case-folded tokens in order of appearance.
    """
    # Case-fold before matching so tokens compare equal regardless of casing
    normalized = text.casefold()
    word_pattern = re.compile(r"\w+")
    return word_pattern.findall(normalized)

In [9]:
# Tokenize every message in the `text` column; the result holds one token list per row
tokenized_sentences = [word_tokenize(doc) for doc in data['text'].to_list()] # OR 'documents'
tokenized_sentences

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  't',
  'c',
  's',
  'apply',
  '08452810075over18',
  's'],
 ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say'],
 ['nah',
  'i',
  'don',
  't',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  's',
  'been',
  '3',
  'week',
  's',
  'now',
  'and',
  'no',
  'word',
  'back',
  'i',
  'd',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',


Let's check the average sentence length (in words)

In [10]:
# Number of messages and the total number of tokens across all of them
sentences_count = len(data)
words_count = sum(len(sentence_tokens) for sentence_tokens in tokenized_sentences)

print(f"Total number of words: {words_count}")
print(f'Average words count in each sentence: {round(words_count / sentences_count)} words')

Total number of words: 90106
Average words count in each sentence: 16 words


Now let's flatten the tokenized sentences into a single sequence of word tokens

In [11]:
# Flatten the per-sentence token lists into one immutable sequence, keeping the original order
tokens = tuple(token for sentence in tokenized_sentences for token in sentence)
print(tokens)



#### Conclusion
- The data contains 5572 rows
- The data are labeled either `ham` or `spam`
- There are 4825 `ham` rows and 747 `spam` rows
- There are no missing values (data is clean)
- The average word count per sentence is 16

### Word Weighting

Let's assign weights to the words we obtained in the previous step

Let's define some utility functions

In [12]:
def preprocess_text(text: str):
    """Convert raw text into the token list used by the weighting steps.

    Currently this only delegates to ``word_tokenize``.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of tokens produced by ``word_tokenize(text)``.
    """
    return word_tokenize(text)

In [13]:
def calculate_tf(word_list: list[str]):
    """Compute the term frequency (TF) of every word in one document.

    Each occurrence of a word contributes ``1 / len(word_list)`` to that
    word's score, so the scores of a non-empty document sum to roughly 1.

    Args:
        word_list: The tokens of a single document.

    Returns:
        A dict mapping each distinct word to its term frequency, in order
        of first appearance. An empty document yields an empty dict.
    """
    doc_length = len(word_list)
    # Start every distinct word at zero, then add one share per occurrence
    frequencies = dict.fromkeys(word_list, 0)
    for term in word_list:
        frequencies[term] += 1 / doc_length

    return frequencies

In [14]:
def calculate_idf(documents: list[list[str]]):
    """Compute the inverse document frequency (IDF) of every word across the documents.

    The score of a word is ``log(N / (1 + df))`` where ``N`` is the number of
    documents and ``df`` is the number of documents containing the word. Because
    of the ``+ 1`` in the denominator, a word that occurs in every document gets
    a negative score.

    Args:
        documents: The corpus, one token list per document.

    Returns:
        A dict mapping each distinct word in the corpus to its IDF score.
        An empty corpus yields an empty dict.
    """
    total_docs = len(documents)

    # Count each word at most once per document in a single pass over the corpus,
    # instead of rescanning every document list for every vocabulary word.
    doc_counts = Counter()
    for document in documents:
        doc_counts.update(set(document))

    return {
        word: math.log(total_docs / (1 + doc_count))
        for word, doc_count in doc_counts.items()
    }

In [15]:
def calculate_tf_idf(tf: dict, idf: dict):
    """Combine one document's term frequencies with corpus-wide IDF scores.

    Args:
        tf: Term frequencies of a single document, keyed by word.
        idf: IDF scores keyed by word; it must contain every word in ``tf``.

    Returns:
        A dict mapping each word of ``tf`` to ``tf[word] * idf[word]``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idf``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

In [16]:
# One term-frequency dict per tokenized message, in the same order as the rows
tfs = [calculate_tf(doc) for doc in tokenized_sentences]
tfs

[{'go': 0.05,
  'until': 0.05,
  'jurong': 0.05,
  'point': 0.05,
  'crazy': 0.05,
  'available': 0.05,
  'only': 0.05,
  'in': 0.05,
  'bugis': 0.05,
  'n': 0.05,
  'great': 0.05,
  'world': 0.05,
  'la': 0.05,
  'e': 0.05,
  'buffet': 0.05,
  'cine': 0.05,
  'there': 0.05,
  'got': 0.05,
  'amore': 0.05,
  'wat': 0.05},
 {'ok': 0.16666666666666666,
  'lar': 0.16666666666666666,
  'joking': 0.16666666666666666,
  'wif': 0.16666666666666666,
  'u': 0.16666666666666666,
  'oni': 0.16666666666666666},
 {'free': 0.030303030303030304,
  'entry': 0.06060606060606061,
  'in': 0.030303030303030304,
  '2': 0.030303030303030304,
  'a': 0.030303030303030304,
  'wkly': 0.030303030303030304,
  'comp': 0.030303030303030304,
  'to': 0.09090909090909091,
  'win': 0.030303030303030304,
  'fa': 0.06060606060606061,
  'cup': 0.030303030303030304,
  'final': 0.030303030303030304,
  'tkts': 0.030303030303030304,
  '21st': 0.030303030303030304,
  'may': 0.030303030303030304,
  '2005': 0.030303030303030304,

In [17]:
# A single IDF table computed over all tokenized messages
idf = calculate_idf(tokenized_sentences)
idf

{'catch': 6.322924241905651,
 'pin': 6.679599185844383,
 'horo': 7.932362154339751,
 'golden': 7.932362154339751,
 'avenge': 7.932362154339751,
 'weren': 7.932362154339751,
 '4few': 7.932362154339751,
 '18p': 7.5268970462315865,
 '1500': 7.239214973779806,
 'lionm': 7.5268970462315865,
 'loosu': 7.932362154339751,
 'kindly': 7.239214973779806,
 'sleeps': 7.932362154339751,
 '09065174042': 7.5268970462315865,
 'ny': 7.5268970462315865,
 'twilight': 7.5268970462315865,
 'ignorant': 7.932362154339751,
 'recycling': 7.932362154339751,
 'poyyarikatur': 7.932362154339751,
 'over': 4.4060016297235896,
 'lunsford': 7.932362154339751,
 'brighten': 7.932362154339751,
 'ummma': 7.932362154339751,
 'jason': 7.239214973779806,
 'sleeping': 5.580986897176273,
 '09061749602': 7.932362154339751,
 'among': 7.239214973779806,
 'disturbing': 7.5268970462315865,
 'w14rg': 7.932362154339751,
 'stands': 7.932362154339751,
 'weds': 7.932362154339751,
 'lab': 7.016071422465596,
 'twice': 6.833749865671641,
 '

In [18]:
# Weight each message's term frequencies with the shared IDF table
tfidf = [calculate_tf_idf(tf, idf) for tf in tfs]
tfidf

[{'go': 0.1526677651360966,
  'until': 0.26466524123622465,
  'jurong': 0.3966181077169876,
  'point': 0.2993226002642219,
  'crazy': 0.2958729566898743,
  'available': 0.28405351778666277,
  'only': 0.1636825600711815,
  'in': 0.0966712796196534,
  'bugis': 0.32730338966099304,
  'n': 0.18946137139741093,
  'great': 0.19670807263352763,
  'world': 0.2520995198221793,
  'la': 0.32730338966099304,
  'e': 0.20855810193230942,
  'buffet': 0.3763448523115793,
  'cine': 0.32730338966099304,
  'there': 0.16297666599389227,
  'got': 0.15958936656727285,
  'amore': 0.3966181077169876,
  'wat': 0.19953901733350304},
 {'ok': 0.4984532886217411,
  'lar': 0.8313205291955518,
  'joking': 1.1132665309740637,
  'wif': 0.8882787448158945,
  'u': 0.31795077331266747,
  'oni': 1.1693452370775992},
 {'free': 0.09658878866595458,
  'entry': 0.33542223524493214,
  'in': 0.05858865431494145,
  '2': 0.07819709154132377,
  'a': 0.04729546385175438,
  'wkly': 0.17931694344840868,
  'comp': 0.1836533326496412,


Now, let's convert this into a DataFrame

In [19]:
# One row per message and one column per word; words absent from a message get weight 0
tfidf_df = pd.DataFrame(tfidf).fillna(0)
tfidf_df

Unnamed: 0,go,until,jurong,point,crazy,available,only,in,bugis,n,...,nmde,dump,heap,lowes,salesman,087187272008,now1,pity,suggestions,bitching
0,0.152668,0.264665,0.396618,0.299323,0.295873,0.284054,0.163683,0.096671,0.327303,0.189461,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058589,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.099202,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.240375,0.240375,0.000000,0.000000,0.000000
5568,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
5569,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.193343,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.793236,0.793236,0.000000
5570,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.071608,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.293791


Lastly, let's split the data into training and testing portions

In [33]:
def train_test_split(X: np.ndarray, test_size=.2, random_state=0, y=None):
    """Randomly split samples (and optionally their labels) into train and test parts.

    Args:
        X: Array-like of samples, one sample per row.
        test_size: Fraction of the samples assigned to the test part; the
            test part holds ``int(test_size * n_samples)`` rows.
        random_state: Seed of the shuffle, so the split is reproducible.
        y: Optional array-like of labels aligned with the rows of ``X``.
            When given, it is split with the same indices as ``X``.

    Returns:
        ``(training_data, testing_data)`` when ``y`` is None, otherwise
        ``(training_data, testing_data, training_labels, testing_labels)``.

    Raises:
        ValueError: If ``y`` is given and its length differs from ``X``.
    """
    X = np.asarray(X)
    n_samples = len(X)

    # A private RandomState produces the same permutation as calling
    # np.random.seed(random_state) followed by np.random.permutation, but it
    # does not reseed NumPy's global generator as a side effect.
    indices = np.random.RandomState(random_state).permutation(n_samples)
    data_test_size = int(test_size * n_samples)
    test_indices = indices[:data_test_size]
    train_indices = indices[data_test_size:]

    training_data = X[train_indices]
    testing_data = X[test_indices]
    if y is None:
        return training_data, testing_data

    y = np.asarray(y)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    # Reuse the same indices so every sample stays paired with its label
    return training_data, testing_data, y[train_indices], y[test_indices]
    

In [34]:
# Uses the defaults test_size=.2 and random_state=0; only the TF-IDF feature matrix is split here
training, testing = train_test_split(tfidf_df.to_numpy())

Let's make two new DataFrames for those

In [39]:
# Both frames are built from plain ndarrays, so their columns are integer positions
# rather than the words used as column names in tfidf_df.
training_df = pd.DataFrame(training)
testing_df = pd.DataFrame(testing)

### Making The Model

Let's first make the euclidean distance calculator

In [20]:
def euclidean_distance(x1: np.ndarray, x2: np.ndarray):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point, as a scalar or array-like of numbers.
        x2: Second point, with a shape compatible with ``x1``.

    Returns:
        The square root of the summed squared differences, as a NumPy float.
    """
    # Cast to float first: with unsigned or narrow integer dtypes the
    # subtraction and the squaring would silently wrap around.
    difference = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(difference ** 2))

In [21]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=3):
        """Create the classifier.

        Args:
            k: Number of nearest training samples that vote on each prediction.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # With k < 1 no neighbour would vote and prediction would fail with an IndexError
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Store the training samples and their labels.

        Args:
            X: Array-like of training samples, one sample per row.
            y: Array-like of labels aligned with the rows of ``X``.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Convert to plain arrays so rows and labels are always addressed by
        # position; a pandas Series with a non-default index would otherwise be
        # looked up by label and return the wrong class.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X: np.ndarray):
        """Predict a label for every row of ``X``.

        Args:
            X: Array-like of samples, one sample per row.

        Returns:
            A NumPy array with one predicted label per sample.
        """
        predicted_labels = [self._predict(x) for x in np.asarray(X)]
        return np.array(predicted_labels)

    def _predict(self, x: np.ndarray):
        """Predict the label of a single sample by majority vote of its k nearest neighbours."""
        # Compute distances between x and all samples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and keep the first k indices; the stable sort makes
        # equally distant samples keep their training order.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Return the most common class label (ties go to the label seen first)
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


For the sake of performance, we will first use feature selection to reduce the number of features. We will try the full feature set and PCA later on.

In [40]:
feature_selection_threshold = 0.027 # Set the threshold
# Per-column total divided by the number of training rows
column_sums = training_df.sum(axis=0)
keep_column = (column_sums / len(training_df)) > feature_selection_threshold
# All rows, conditional columns
selected_features = training_df.loc[:, keep_column]

Here's the selected features

In [41]:
selected_features

Unnamed: 0,20,61,87
0,0.0,0.000000,0.033909
1,0.0,0.000000,0.000000
2,0.0,0.070763,0.000000
3,0.0,0.000000,0.000000
4,0.0,0.099069,0.000000
...,...,...,...
4453,0.0,0.000000,0.000000
4454,0.0,0.000000,0.044809
4455,0.0,0.116552,0.000000
4456,0.0,0.082557,0.104554


Let's try to apply PCA

In [24]:
def apply_pca(X: np.ndarray, n_dimensions=2):
    """Project the samples onto their leading principal components.

    Args:
        X: Array-like of shape (n_samples, n_features).
        n_dimensions: Number of principal components to keep.

    Returns:
        A real array of shape (n_samples, min(n_dimensions, n_features))
        holding the centered data expressed in the selected components.
    """
    X = np.asarray(X, dtype=float)

    # Get the per-feature mean (along the first axis); a single global mean
    # would leave every feature off-center and shift the projection.
    mean_X = np.mean(X, axis=0)

    # Get the X centered
    X_center = X - mean_X

    # Create the covariance matrix; np.cov returns a 0-d array for a single
    # feature, so force it back to 2-D for the decomposition.
    cov = np.atleast_2d(np.cov(X_center.T))

    # The covariance matrix is symmetric, so use eigh: unlike eig it always
    # returns real eigenvalues and eigenvectors.
    eigenvalues, eigenvector = np.linalg.eigh(cov)

    # Transpose so that each row is one eigenvector
    eigenvector = eigenvector.T

    # Get the indices that sort the eigenvalues in descending order
    indices = np.argsort(eigenvalues)[::-1]

    # Reorder the eigenvectors from largest to smallest eigenvalue
    eigenvector = eigenvector[indices]

    # Get the desired features based on the n_dimensions
    features = eigenvector[:n_dimensions]

    return np.dot(X_center, features.T)

# GUI