In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Baca data CSV
df = pd.read_csv("scrapped_Tiktok_EN.csv")  # Ganti dengan path file asli

# Hapus baris dengan content kosong
df = df.dropna(subset=["content"])

# Konversi ke string
df["content"] = df["content"].astype(str)

# Filter data berdasarkan tahun 2024
df["at"] = pd.to_datetime(df["at"])
df_filtered = df[df["at"].dt.year == 2024]

# Ambil 6000 sampel
df = df_filtered.sample(n=6000, random_state=42)

# Fungsi Tokenisasi
def tokenize(text):
    cleaned_text = re.sub(r'\d+', '', text)  # Hapus angka
    cleaned_text = re.sub(r"[^\w\s]", "", cleaned_text)
    cleaned_text = re.sub(r'@[A-Za-z0-9_]+', '', cleaned_text)
    cleaned_text = re.sub(r'#\w+', '', cleaned_text)
    cleaned_text = re.sub(r'RT[\s]+', '', cleaned_text)
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
    cleaned_text = re.sub(r'[^A-Za-z0-9 ]', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text.lower()

df["cleaned_content"] = df["content"].apply(tokenize)

# TF-IDF dengan Sklearn
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df["cleaned_content"])

# Konversi ke DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Statistik TF-IDF
statistik_tfidf = tfidf_df.describe()

# Menentukan threshold berdasarkan persentil 10% dari rata-rata TF-IDF kata
threshold = np.percentile(statistik_tfidf.loc['mean'], 10)
print(f"Threshold: {threshold}")

# Mencari kata dengan TF-IDF di bawah threshold, tanpa angka dan panjang minimal 3 karakter
nonsense_word = [word for word in statistik_tfidf.loc['mean'][statistik_tfidf.loc['mean'] <= threshold].index.tolist() if not word.isdigit() and len(word) > 2]
print("Kata yang dihapus:", nonsense_word[:20])  # Menampilkan hanya 20 kata pertama sebagai contoh

# Menampilkan DataFrame TF-IDF hasil akhir
tfidf_df.head()

Threshold: 3.6888397607058426e-05
Kata yang dihapus: ['abroad', 'acci', 'accountability', 'accumulate', 'admit', 'admits', 'adventure', 'aemilst', 'affandie', 'againbut', 'agents', 'agitating', 'aigenerated', 'air', 'akin', 'akoa', 'alarm', 'alerting', 'algorithmic', 'algorithms']


Unnamed: 0,aa,aaaaa,aaaarghhhh,ability,abit,able,abnd,aboit,abroad,abruptly,...,zillions,zionist,zionlst,zionlsts,zone,zones,zoom,zooms,ztezral,zub
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
tfidf_df

Unnamed: 0,aa,aaaaa,aaaarghhhh,ability,abit,able,abnd,aboit,abroad,abruptly,...,zillions,zionist,zionlst,zionlsts,zone,zones,zoom,zooms,ztezral,zub
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,cleaned_content
2360,aed8e7d9-f07c-4eb4-aaad-623e8749eaa3,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Why can't I put caption on my repost?!?!!,1,0,37.0.4,2024-10-26 09:03:07,,,37.0.4,why cant i put caption on my repost
9545,a6ef5e66-de49-4a93-b98a-c66bc686bc9d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Sometimes I can't open it please fix it,4,0,28.5.4,2024-08-18 16:02:28,,,28.5.4,sometimes i cant open it please fix it
42,2d33acbf-a885-441a-8a12-3d649a84c217,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"I hate tiktok, When I want to create a new acc...",1,25,37.2.6,2024-11-19 11:52:07,,,37.2.6,i hate tiktok when i want to create a new acco...
3887,9167075e-8c4c-46c9-825d-3035bfea9117,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It has many bugs it's not worth it to install ...,1,0,,2024-10-10 11:27:00,,,,it has many bugs its not worth it to install i...
3710,762ea22c-531a-470b-be7f-91b218adfca6,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Earning coins while watching,5,0,36.7.4,2024-10-12 09:57:52,,,36.7.4,earning coins while watching
...,...,...,...,...,...,...,...,...,...,...,...,...
6383,0a912616-eabe-4d4d-b094-0bc435c0e188,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Very nice and good app 💯,5,0,36.2.4,2024-09-22 09:17:25,,,36.2.4,very nice and good app
944,7a5997d9-4442-459b-978e-7d9048ed5de3,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,t this past few days when i try to open tiktok...,2,0,,2024-11-10 10:31:28,,,,t this past few days when i try to open tiktok...
11098,eaf9b915-31af-450d-b1f9-30c973fa9442,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Tiktok kept violating my video even tho it's n...,1,1,35.7.3,2024-07-28 04:25:00,,,35.7.3,tiktok kept violating my video even tho its no...
9840,204b12d5-cd3f-4ef6-81cf-aa6676fc892c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I have to reinstall just to get into the app e...,2,0,,2024-08-15 06:09:03,,,,i have to reinstall just to get into the app e...


In [3]:
statistik_tfidf

Unnamed: 0,aa,aaaaa,aaaarghhhh,ability,abit,able,abnd,aboit,abroad,abruptly,...,zillions,zionist,zionlst,zionlsts,zone,zones,zoom,zooms,ztezral,zub
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.000142,8.6e-05,8.2e-05,0.000264,5e-05,0.001986,7.2e-05,5.9e-05,2.9e-05,9.9e-05,...,3.2e-05,4.1e-05,5e-05,5e-05,0.000214,4.2e-05,0.00039,5.2e-05,4.9e-05,5.5e-05
std,0.007898,0.006693,0.006341,0.009356,0.003871,0.02523,0.005568,0.004573,0.002247,0.005845,...,0.002469,0.003167,0.003844,0.003844,0.01004,0.003261,0.014109,0.004012,0.003817,0.004232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.496632,0.518401,0.491161,0.424253,0.299833,0.536163,0.431266,0.354243,0.174071,0.415679,...,0.191259,0.24532,0.297751,0.297751,0.580244,0.252598,0.659789,0.310752,0.295654,0.327821


In [3]:
import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=3):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)
    
    def predict(self, X_test):
        X_test = np.array(X_test)
        predictions = [self._predict(x) for x in X_test]
        return np.array(predictions)
    
    def _predict(self, x):
        distances = [np.linalg.norm(x - x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]
    
    def accuracy(self, y_true, y_pred):
        correct = np.sum(np.array(y_true) == np.array(y_pred))
        return correct / len(y_true)
    
# Contoh penggunaan
if __name__ == "__main__":
    # Data dummy
    X_train = [[1, 2], [2, 3], [3, 3], [5, 5], [6, 7], [7, 8]]
    y_train = [0, 0, 0, 1, 1, 1]
    
    X_test = [[4, 4], [6, 6]]
    y_test = [0, 1]  # Label sebenarnya untuk data uji
    
    for k in [1, 3, 5]:
        model = KNN(k=k)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        acc = model.accuracy(y_test, predictions)
        print(f"k={k}, Predictions: {predictions}, Akurasi: {acc:.2f}")


k=1, Predictions: [0 1], Akurasi: 1.00
k=3, Predictions: [0 1], Akurasi: 1.00
k=5, Predictions: [0 1], Akurasi: 1.00


In [2]:
def accuracy(y_true, y_pred):
    correct = np.sum(np.array(y_true) == np.array(y_pred))
    return correct / len(y_true)

# Contoh evaluasi
y_test = [0, 1]  # Label sebenarnya untuk data uji
y_pred = predictions  # Hasil prediksi dari model

acc = accuracy(y_test, y_pred)
print(f"Akurasi: {acc:.2f}")


Akurasi: 1.00
