In [None]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import gensim
import numpy as np
import nltk
import re

Instal dan import faiss untuk membuat indeks pencarian vektor, serta time untuk mengukur waktu eksekusi query

In [None]:
!pip install faiss-cpu
import faiss
import time



Input data dan memeriksa data null

In [None]:
# Read the Disneyland reviews export; the file is decoded as Latin-1.
reviews_path = r"/content/DisneylandReviews.csv"
df = pd.read_csv(reviews_path, encoding='latin1')

# Show how many missing values each column contains.
df.isnull().sum()

Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64

Praproses teks sebelum melakukan embedding

In [None]:
# Fetch the NLTK stop-word corpus that the tokenizer below filters against.
nltk.download('stopwords')
# A set gives constant-time membership checks while filtering tokens.
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Tokenize a raw review and drop English stop words.

    Tokenization is delegated to ``gensim.utils.simple_preprocess`` with
    ``deacc=True``; every token found in the module-level ``stop_words``
    set is then discarded.

    Args:
        text: Raw review text.

    Returns:
        List of the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Tokenize every review and store the resulting token lists in a new column.
review_tokens = df['Review_Text'].apply(preprocess_text)
df['Token Text'] = review_tokens

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Token Text
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,"[ever, disneyland, anywhere, find, disneyland,..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,"[since, last, time, visit, hk, disneyland, yet..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,"[thanks, god, hot, humid, visiting, park, othe..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,"[hk, disneyland, great, compact, park, unfortu..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,"[location, city, took, around, hour, kowlon, k..."


Membuat model Word2Vec

In [None]:
# The per-review token lists are the training sentences.
# NOTE: a later cell overwrites df['Token Text'] with space-joined strings,
# so this cell has to run before that one; re-running it afterwards would
# feed strings instead of token lists to Word2Vec.
token = df['Token Text']

# Training hyperparameters, grouped so they are easy to spot and tune.
w2v_settings = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=token, **w2v_settings)

Mengkonversi teks menjadi vektor

In [None]:
def get_vector(tokens, model):
    """Average the word vectors of ``tokens`` into a single document vector.

    Args:
        tokens: Iterable of word strings.
        model: Object exposing ``wv`` (supporting ``word in wv`` and
            ``wv[word]``) and ``vector_size``; in this notebook, the trained
            Word2Vec model.

    Returns:
        1-D float32 array of length ``model.vector_size``: the element-wise
        mean of the vectors of all tokens known to ``model.wv``, or all zeros
        when no token is known.
    """
    known_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not known_vectors:
        # Keep the fallback in float32: a single float64 zero vector would
        # upcast the whole stacked embedding matrix, and FAISS works on float32.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0).astype(np.float32, copy=False)

# This cell replaces the token lists in 'Token Text' with space-joined
# strings. Splitting already-joined strings back into tokens first makes the
# cell idempotent: re-running it no longer iterates over single characters.
tokens_per_review = df['Token Text'].apply(
    lambda tokens: tokens.split() if isinstance(tokens, str) else tokens
)

# One averaged Word2Vec vector per review.
df['Vector'] = tokens_per_review.apply(lambda tokens: get_vector(tokens, model))
# Keep a readable, space-separated version of the tokens for display.
df['Token Text'] = tokens_per_review.apply(lambda tokens: ' '.join(tokens))
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Token Text,Vector
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,ever disneyland anywhere find disneyland hong ...,"[-0.37314057, -0.011646227, -0.117439255, -0.1..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,since last time visit hk disneyland yet time s...,"[-0.3519038, 0.3384453, 0.0044576298, -0.01966..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,thanks god hot humid visiting park otherwise w...,"[-0.0318425, 0.1519244, 0.39359772, -0.1494658..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,hk disneyland great compact park unfortunately...,"[-0.049449023, 0.06282451, 0.31434014, 0.26956..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,location city took around hour kowlon kids lik...,"[-0.364037, 0.2403189, 0.34318724, -0.05786837..."


Hasil dari embedding

In [None]:
# Sanity check of the embedding: the ten words closest to 'disneyland'.
model.wv.similar_by_word('disneyland', topn=10)

[('dl', 0.7021290063858032),
 ('disney', 0.625153660774231),
 ('chatlete', 0.576385498046875),
 ('dlp', 0.5354707837104797),
 ('disneylands', 0.5093713998794556),
 ('orlando', 0.5017033815383911),
 ('compare', 0.4742244780063629),
 ('comute', 0.47126877307891846),
 ('eurodisney', 0.46853598952293396),
 ('knockoff', 0.46504223346710205)]

Membuat index

In [None]:
# Stack the per-review vectors into one (n_reviews, dim) matrix. FAISS works
# on C-contiguous float32 data, so enforce that explicitly instead of relying
# on whatever dtype the stacked column happens to have.
embeddings = np.ascontiguousarray(np.stack(df['Vector'].values), dtype=np.float32)

# Index unit-length copies of the vectors. For unit vectors u and any query q,
# ||q - u||^2 = ||q||^2 + 1 - 2 * ||q|| * cos(q, u), so the nearest L2
# neighbours are exactly the reviews with the highest cosine similarity.
# This keeps the FAISS search consistent with the brute-force baseline,
# which ranks by cosine similarity.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0  # leave all-zero rows as they are (no division by zero)
unit_embeddings = np.ascontiguousarray(embeddings / row_norms, dtype=np.float32)

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(unit_embeddings)

Query tanpa optimasi (brute force: menghitung kemiripan kosinus terhadap setiap review satu per satu)

In [None]:
def calculate_review_vector(review, model):
    """Embed a raw review string as one averaged word vector.

    The text goes through the same ``preprocess_text`` tokenizer as the
    indexed reviews, and the averaging is delegated to ``get_vector`` so that
    queries and reviews are embedded by one shared implementation.

    Args:
        review: Raw review or query text.
        model: Trained Word2Vec model, passed through to ``get_vector``.

    Returns:
        1-D float32 array; all zeros when no token of ``review`` is known to
        the model.
    """
    tokens = preprocess_text(review)
    # Force float32 so the result can be handed to FAISS as-is.
    return np.asarray(get_vector(tokens, model), dtype=np.float32)

def find_similar_reviews(query, model, df, top_k=5):
    """Brute-force baseline: rank every review by cosine similarity to ``query``.

    Args:
        query: Raw query text.
        model: Trained Word2Vec model used to embed the query.
        df: DataFrame with a ``'Vector'`` column holding one vector per review.
        top_k: Number of best matches to return (default 5).

    Returns:
        List of row positions (suitable for ``df.iloc``) of the ``top_k`` most
        similar reviews, best match first. A zero vector (query or review)
        has no direction, so its similarity is treated as 0.0 instead of NaN.
    """
    query_vector = calculate_review_vector(query, model)
    query_norm = np.linalg.norm(query_vector)  # same for every row, so compute once

    similarities = []
    # enumerate() yields row positions, which is what callers pass to df.iloc;
    # index labels would only match for a default RangeIndex.
    for position, review_vector in enumerate(df['Vector']):
        denominator = query_norm * np.linalg.norm(review_vector)
        if denominator > 0:
            similarity = float(np.dot(query_vector, review_vector) / denominator)
        else:
            # Avoid 0/0 -> NaN, which would make the sort order meaningless.
            similarity = 0.0
        similarities.append((position, similarity))

    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in similarities[:top_k]]

# Sample queries used to compare the two search strategies.
queries = [
    "I can go on talking about Disneyland.",
    "This place is HUGE!",
    "It was indeed the happiest place on earth!",
    "The park is great if this is the only amusement park available to you.",
    "Awesome attractions that blew my mind away.",
    "We were disappointed with HK Disneyland.",
    "Spent a few hours at Hong Kong Disneyland."
]

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted. The measured span includes printing the results.
start_time = time.perf_counter()

for query in queries:
    similar_indices = find_similar_reviews(query, model, df)
    print(f"Query: {query}")
    print("Top 5 Most Similar Reviews:")
    for rank, idx in enumerate(similar_indices, start=1):
        # Select the column first so no full row Series is built per result.
        print(f"Result {rank}: {df['Review_Text'].iloc[idx]}")
    print()

end_time = time.perf_counter()

total_execution_time = end_time - start_time
print(f"Total execution time for all queries: {total_execution_time:.4f} seconds")

Query: I can go on talking about Disneyland.
Top 5 Most Similar Reviews:
Result 1: For us, Disneyland is a yearly event.  Even if a problem arises on any given visit, it never disappoints.  Others go to far off places for relaxation, I go to Disneyland and leave there always wanting to come back.
Result 2: It's Disneyland and Disneyland is always awesome. February is a great time to go. Can't wait to go again.
Result 3: Ok, so, everyone knows about Disneyland, but, they probably don   t know everything, go ahead and read, Disneyland Secrets, it tells all.
Result 4: It's Disneyland!!!  What's not to like???  Seriously, if you're not sure whether to go or not... just go.  You will have an amazing time!
Result 5: Disneyland really is the happiest place on earth!! Every time I go it gets better and better. When ever I want to go on a vacation I think Disneyland. And every time I go I discover new things that I love about the park. Last time I went I found the Disney Gallery. To get there t

Query dengan optimasi (pencarian tetangga terdekat menggunakan indeks FAISS)

In [None]:
import time

def calculate_review_vector(review, model):
    """Embed a raw review string as one averaged word vector.

    The text goes through the same ``preprocess_text`` tokenizer as the
    indexed reviews, and the averaging is delegated to ``get_vector`` so that
    queries and reviews are embedded by one shared implementation.

    Args:
        review: Raw review or query text.
        model: Trained Word2Vec model, passed through to ``get_vector``.

    Returns:
        1-D float32 array; all zeros when no token of ``review`` is known to
        the model.
    """
    tokens = preprocess_text(review)
    # Force float32 so the result can be handed to FAISS as-is.
    return np.asarray(get_vector(tokens, model), dtype=np.float32)

def find_similar_reviews(query, model, index, top_k=5):
    """Look up the reviews nearest to ``query`` in a FAISS index.

    Args:
        query: Raw query text.
        model: Trained Word2Vec model used to embed the query.
        index: Index object exposing ``search(matrix, k)`` that returns a
            ``(distances, ids)`` pair, as FAISS indexes do.
        top_k: Number of neighbours to request (default 5).

    Returns:
        1-D integer array of row positions, nearest first. It holds fewer than
        ``top_k`` entries when the index cannot supply that many neighbours.
    """
    query_vector = calculate_review_vector(query, model)
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim);
    # the zero-vector fallback of the embedding step may be float64.
    query_matrix = np.ascontiguousarray(query_vector, dtype=np.float32).reshape(1, -1)
    _, neighbor_ids = index.search(query_matrix, top_k)
    hits = neighbor_ids[0]
    # FAISS pads missing neighbours with -1; drop them so callers never end up
    # selecting the last row through df.iloc[-1].
    return hits[hits >= 0]

# Same sample queries as the brute-force run, so the timings are comparable.
queries = [
    "I can go on talking about Disneyland.",
    "This place is HUGE!",
    "It was indeed the happiest place on earth!",
    "The park is great if this is the only amusement park available to you.",
    "Awesome attractions that blew my mind away.",
    "We were disappointed with HK Disneyland.",
    "Spent a few hours at Hong Kong Disneyland."
]

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted. The measured span includes printing the results.
start_time = time.perf_counter()

for query in queries:
    similar_indices = find_similar_reviews(query, model, index)
    print(f"Query: {query}")
    print("Top 5 Most Similar Reviews:")
    for rank, idx in enumerate(similar_indices, start=1):
        # Select the column first so no full row Series is built per result.
        print(f"Result {rank}: {df['Review_Text'].iloc[idx]}")
    print()

end_time = time.perf_counter()

total_execution_time = end_time - start_time
print(f"Total execution time for all queries: {total_execution_time:.4f} seconds")

Query: I can go on talking about Disneyland.
Top 5 Most Similar Reviews:
Result 1: Ok, so, everyone knows about Disneyland, but, they probably don   t know everything, go ahead and read, Disneyland Secrets, it tells all.
Result 2: It's Disneyland!!!  What's not to like???  Seriously, if you're not sure whether to go or not... just go.  You will have an amazing time!
Result 3: Disneyland really is the happiest place on earth!! Every time I go it gets better and better. When ever I want to go on a vacation I think Disneyland. And every time I go I discover new things that I love about the park. Last time I went I found the Disney Gallery. To get there take a left right when you go into the park it is next to Great Moments with Abraham Lincoln. If you want to find out about the history of Disneyland and collect Disneyland memorabilia you have to stop here. There are original concepts of Disneyland, a video that go's through the history of Disneyland, old park tickets, trains from Walt, pa