In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [11]:
import nltk
# Fetch the NLTK 'stopwords' corpus so that `stopwords.words(...)` can load it later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [45]:
# Load the aggregated reviews CSV; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Capstone | Bangkit 2023/Sentiment Analysis/aggregated_reviews.csv',
    index_col=0,
)

In [46]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Place_ID,Place_Name,cleaned_reviews
0,ChIJ-1OO3xhZei4RpiUw_jIlPBQ,TUGU LOR,berada di jalan am sangaji atau lebih tepat da...
1,ChIJ-2etA9ABey4RU42x6nFD9WU,Angkringan Baniyah & Gaza Barbershop,tempat nya asyik tempat nyaman mantap
2,ChIJ-7bqq6lXei4RhCzUxYM12Yg,Angkringan Pak Sugeng,lumayan cukup komplit sesuai kantong di masa p...
3,ChIJ-S-FxAZbei4Rdf1ECy_-xNE,Bilbil Juice Aneka jus buah,langganan banget jus buah tidak abal rasa harg...
4,ChIJ-UhWzNZbei4Rd1P09xwuUY0,Es Coklat Bon@ng.262 Prambanan,coklat enak manis pas dan berasa asli coklat r...


## Remove More Stopwords

In [47]:
# Base Indonesian stopword list from NLTK; it is used as the `stopwords` argument of remove_stopwords.
indo = stopwords.words('indonesian')

In [28]:
def remove_stopwords(text, stopwords):
    """Return ``text`` without the whitespace-separated tokens listed in ``stopwords``.

    Args:
        text: The string to filter. It is tokenised with ``str.split()``, so
            any run of whitespace separates tokens.
        stopwords: Collection of tokens to drop. Matching is exact and
            case-sensitive.

    Returns:
        The remaining tokens joined by single spaces. Non-string input
        (for example ``None`` or a float ``NaN``) yields ``''``.
    """
    # A missing value has no `.split()`; treat it as empty text instead of
    # raising AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ''
    # A list/tuple would be scanned linearly for every word; hash it once so
    # each membership test is constant time. Other containers are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return ' '.join(word for word in text.split() if word not in stopwords)

In [48]:
# Extra words to filter out on top of the NLTK Indonesian list.
extra_stopwords = [
    "tidak", "agak", 'aada', 'ini', 'itu', 'jadi', 'cafe', 'coba', 'atau', 'banget',
    'kalau', 'karena', 'ke', 'lah', 'lain', 'mau', 'nya', 'sangat', 'sama',
    'masih', 'seperti', 'sampai', 'tapi', 'untuk', 'tetapi', 'cuma', 'dari', 'dalam',
    'buat', 'bisa', 'juga', 'saja', 'saya', 'sih', 'ya', 'satu',
    'indo', 'bikin', 'orang', 'salah', 'ribu', 'yoyakarta', 'yogyakarta', 'kesini',
    'ha', 'beli', 'sayang', 'deh', 'kayak', 'kali', 'buka', 'sebelah', 'the', 'pakai',
    'mas', 'mbak', 'it', 'overall', 'hehe',
]

# Append only the words that are not in `indo` yet, so re-running this cell
# does not keep growing the list with duplicates. `dict.fromkeys` also drops
# any word repeated inside `extra_stopwords` while preserving order.
already_listed = set(indo)
indo.extend(word for word in dict.fromkeys(extra_stopwords) if word not in already_listed)

In [49]:
# Strip every word in `indo` from each review text, overwriting the column in place.
df['cleaned_reviews'] = df['cleaned_reviews'].apply(remove_stopwords, args=(indo,))

## TF-IDF

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
max_df = 25 means "ignore terms that appear in more than 25 documents".
The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

min_df is used for removing terms that appear too infrequently. For example:

min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
min_df = 5 means "ignore terms that appear in less than 5 documents".
The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

max_features = N means "build a vocabulary that only considers the top N terms, ordered by term frequency across the corpus". When it is not set, all terms are used.

In [39]:
# Configure the vectorizer: ignore terms found in more than 80% of the review
# documents or in fewer than 2 of them, and keep at most 150 terms.
vectorizer = TfidfVectorizer(max_features=150, min_df=2, max_df=0.8)

# Learn the vocabulary from the cleaned reviews and compute their TF-IDF weights.
vectorized_data = vectorizer.fit_transform(df['cleaned_reviews'])

# Dense table of TF-IDF weights: one row per review document (labelled by its
# Place_ID) and one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=vectorized_data.toarray(),
    index=pd.Index(df['Place_ID'], name='Place_ID'),
    columns=vectorizer.get_feature_names_out(),
)

In [40]:
# First 100 vocabulary terms (column labels) of the TF-IDF table.
tfidf_df.columns[:100]

Index(['ac', 'adem', 'ambil', 'anak', 'area', 'asyik', 'ayam', 'bagus',
       'bakar', 'bakmi', 'bareng', 'barista', 'beragam', 'bersih', 'bumbu',
       'camilan', 'cepat', 'cocok', 'coffee', 'colokan', 'cozy', 'daging',
       'duduk', 'enak', 'es', 'fasilitas', 'favorit', 'foto', 'goreng',
       'gudeg', 'gurih', 'habis', 'harga', 'indoor', 'jalan', 'jam', 'jawa',
       'kafe', 'kantong', 'kasih', 'keluarga', 'kemarin', 'kencang', 'keren',
       'khas', 'konsep', 'kopi', 'kota', 'kuah', 'kursi', 'langsung', 'lantai',
       'lauk', 'lengkap', 'lesehan', 'live', 'lokasi', 'luas', 'lumayan',
       'lupa', 'mahal', 'mahasiswa', 'makan', 'makanan', 'malam', 'mampir',
       'manis', 'mantap', 'masakan', 'masuk', 'meja', 'menarik', 'mencoba',
       'mengobrol', 'menikmati', 'menu', 'menunggu', 'mie', 'minum', 'minuman',
       'mobil', 'motor', 'muda', 'mudah', 'murah', 'musala', 'musik', 'nasi',
       'nih', 'nongkrong', 'nugas', 'nyaman', 'oke', 'outdoor', 'pagi',
       'panas'

In [41]:
# Remaining vocabulary terms, from position 100 onwards.
tfidf_df.columns[100:]

Index(['pelayanan', 'pengunjung', 'pesan', 'pilihan', 'pinggir', 'pokok',
       'porsi', 'ramah', 'ramai', 'recomended', 'recommended', 'resto',
       'ringan', 'ruangan', 'rumah', 'sambal', 'santai', 'sate', 'sayur',
       'segar', 'sesuai', 'siang', 'sore', 'soto', 'spot', 'standar',
       'strategis', 'suasana', 'suka', 'super', 'susu', 'teh', 'telur',
       'teman', 'tempe', 'tenang', 'terima', 'terjangkau', 'tersedia',
       'toilet', 'tugas', 'tugu', 'ugm', 'unik', 'varian', 'view', 'wajib',
       'warung', 'wifi', 'worth'],
      dtype='object')

In [42]:
# Preview the TF-IDF weights of the first few places.
tfidf_df.head()

Unnamed: 0_level_0,ac,adem,ambil,anak,area,asyik,ayam,bagus,bakar,bakmi,...,tugas,tugu,ugm,unik,varian,view,wajib,warung,wifi,worth
Place_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ChIJ-1OO3xhZei4RpiUw_jIlPBQ,0.0,0.239419,0.0,0.0,0.182633,0.055952,0.0,0.024794,0.0,0.0,...,0.0,0.091401,0.0,0.118086,0.036297,0.0,0.0,0.0,0.0,0.0
ChIJ-2etA9ABey4RU42x6nFD9WU,0.0,0.0,0.0,0.0,0.0,0.694969,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ChIJ-7bqq6lXei4RhCzUxYM12Yg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ChIJ-S-FxAZbei4Rdf1ECy_-xNE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ChIJ-UhWzNZbei4Rd1P09xwuUY0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319903,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cosine Similarity

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows of all places.
cosine_similarity_array = cosine_similarity(tfidf_df)

# Label both axes with the Place_ID index so a pair can be looked up by ID.
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index,
)


This place-to-place similarity matrix is suitable for a "similar places" recommendation shown in the bottom section of a place's detail page.

In [44]:
# Show the first 5 rows of the place-by-place cosine similarity matrix
cosine_similarity_df.head()

Place_ID,ChIJ-1OO3xhZei4RpiUw_jIlPBQ,ChIJ-2etA9ABey4RU42x6nFD9WU,ChIJ-7bqq6lXei4RhCzUxYM12Yg,ChIJ-S-FxAZbei4Rdf1ECy_-xNE,ChIJ-UhWzNZbei4Rd1P09xwuUY0,ChIJ-VZXtXdZei4R87hWiz5QC3A,ChIJ-XQW5GRZei4Rqo3-9FJkZ34,ChIJ-YSIVXJXei4RruOW9y8zbV4,ChIJ-YsB5eJaei4R4u9P73I8J3E,ChIJ-dg2e55bei4R2NFzQbFGu80,...,ChIJycBiwr5Zei4Rhd7p2_cW6Zw,ChIJyeEZeL5Xei4RoqmIq02SNkE,ChIJyff7yXVXei4RgIxBaCM9e2c,ChIJyzl7G_1Xei4RPn7I1meS0rU,ChIJz8NNJzpYei4R9Wh7rwSq9cY,ChIJz8b_3FFnei4RyQzP4v9c3nM,ChIJzRJJzqIBey4Rb_-C8fFfHKM,ChIJzXaqSjJYei4RYf7_5RITA9Y,ChIJzdIkjZxZei4RgAq0Zm4aqic,ChIJzfDAL5JXei4RB_2bokRgwJ0
Place_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ChIJ-1OO3xhZei4RpiUw_jIlPBQ,1.0,0.096471,0.269423,0.052897,0.212137,0.338097,0.166734,0.039259,0.201776,0.250161,...,0.364485,0.134982,0.282421,0.630335,0.438664,0.244191,0.414049,0.288705,0.070533,0.413232
ChIJ-2etA9ABey4RU42x6nFD9WU,0.096471,1.0,0.0,0.143758,0.0,0.120314,0.0,0.0,0.013417,0.162882,...,0.288355,0.028892,0.118835,0.201113,0.196066,0.048236,0.136034,0.21816,0.0,0.08839
ChIJ-7bqq6lXei4RhCzUxYM12Yg,0.269423,0.0,1.0,0.0,0.0,0.021893,0.0,0.0,0.022385,0.078114,...,0.075748,0.038334,0.134088,0.213149,0.113404,0.027976,0.310944,0.027803,0.0,0.114734
ChIJ-S-FxAZbei4Rdf1ECy_-xNE,0.052897,0.143758,0.0,1.0,0.040073,0.199924,0.0,0.0,0.040333,0.281476,...,0.120859,0.071862,0.119611,0.170427,0.109857,0.10697,0.039483,0.039695,0.059644,0.070891
ChIJ-UhWzNZbei4Rd1P09xwuUY0,0.212137,0.0,0.0,0.040073,1.0,0.285787,0.198815,0.16975,0.192921,0.338035,...,0.26655,0.230762,0.070629,0.267892,0.203084,0.247545,0.022155,0.08053,0.037579,0.238956


In [51]:
# Similarity of every place to ChIJz8b_3FFnei4RyQzP4v9c3nM (Warung Makan Restu Ibu).
cosine_similarity_series = cosine_similarity_df.loc['ChIJz8b_3FFnei4RyQzP4v9c3nM', :]

# Rank from most to least similar; the place itself comes first with score 1.0.
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

print(ordered_similarities)

Place_ID
ChIJz8b_3FFnei4RyQzP4v9c3nM    1.000000
ChIJdTyDMeZZei4RlRet81pF5KY    0.791688
ChIJf9MAObRZei4RUew1n52hm94    0.788387
ChIJbbKlBQdbei4RZHMyjwkJYGQ    0.732685
ChIJ236W0kdZei4R-ADtGTXSdO8    0.668787
                                 ...   
ChIJK267UEYBey4RamHV9IVnTQ4    0.000000
ChIJ__OA4J3lei4RHlfB85vTctc    0.000000
ChIJSQwX1GJRei4R_34D8-GBghk    0.000000
ChIJW3YBzrhZei4R1BDCDA3cHh8    0.000000
ChIJjU9NTCRYei4RJMKv7vPCTyg    0.000000
Name: ChIJz8b_3FFnei4RyQzP4v9c3nM, Length: 357, dtype: float64


## Recommendation Based on User Preferences

In [55]:
# Browse a slice of place names (presumably to pick example favorites for the cells below).
df.Place_Name[30:45]

30                Coldhans Coffee Uny
31    Kantin Aldhan Fak. Filsafat UGM
32       Rumah Makan Padang Prambanan
33              Toegoe Coffee and Tea
34               Lesehan Tasik Malaya
35           Bercanda Coffee & Eatery
36     De Djogdja Karaoke Lounge Cafe
37                 Omah Kayu Wonosari
38                 Angkringan Farmasi
39           Inggar Food And Beverage
40                          Kopi Tugu
41                       Hen's Coffee
42                     House of Kijoy
43                      Mamahké Jogja
44                          Slide Bar
Name: Place_Name, dtype: object

In [52]:
# Keep only the first TF-IDF row for each Place_ID label in the index.
tfidf_df = tfidf_df.loc[~tfidf_df.index.duplicated(keep='first')]

In [57]:
# Example user input: names of the places the user likes.
list_of_favorite_places = [
    'Bakso Prasojo',
    'Mucho Cafe & Bar',
    'Bercanda Coffee & Eatery',
    'House of Kijoy',
]

# Resolve each name to the Place_ID of its first matching row in `df`.
id_of_favorite_places = [
    df['Place_ID'][df['Place_Name'] == place].to_numpy()[0]
    for place in list_of_favorite_places
]

# TF-IDF rows of the favorite places only, in the order listed above.
favorite_places_df = tfidf_df.reindex(index=id_of_favorite_places)

favorite_places_df

Unnamed: 0_level_0,ac,adem,ambil,anak,area,asyik,ayam,bagus,bakar,bakmi,...,tugas,tugu,ugm,unik,varian,view,wajib,warung,wifi,worth
Place_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ChIJ8fnq61ZLei4RhQv8BUXkQVY,0.0,0.0,0.0,0.0,0.0,0.0,0.433874,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174503,0.0,0.0
ChIJ7d_nCsBZei4R4nM70A44-UE,0.0,0.0,0.0,0.0,0.0,0.050183,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.082757,0.0,0.0,0.0,0.0
ChIJ3RljhQtZei4ReFjQ-aoXzns,0.0,0.101476,0.0,0.0,0.0,0.0,0.0,0.063054,0.0,0.132727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ChIJ51du8_JZei4Rew9VMC2ahw0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# User profile: the average weight of each term across the favorite places.
user_prof = favorite_places_df.mean(axis=0)

In [66]:
# Candidate places: everything except the user's own favorites.
tfidf_subset_df = tfidf_df.drop(index=id_of_favorite_places)

# Cosine similarity between the user profile (as a single-row matrix) and each candidate.
similarity_array = cosine_similarity(user_prof.to_numpy().reshape(1, -1), tfidf_subset_df)

# One row per candidate Place_ID with its score against the user profile.
similarity_df = pd.DataFrame(
    data=similarity_array.T,
    index=tfidf_subset_df.index,
    columns=["similarity_score"],
)

# Rank best match first, then move Place_ID from the index into a regular column.
sorted_similarity_df = (
    similarity_df
    .sort_values(by="similarity_score", ascending=False)
    .reset_index()
)

In [67]:
# Inspect the 10 places most similar to the user profile
sorted_similarity_df.head(10)

Unnamed: 0,Place_ID,similarity_score
0,ChIJQdv6ziP7ei4RvDkASKT5P-s,0.607968
1,ChIJ94jeWt5aei4RzjGh11KmctM,0.600892
2,ChIJycBiwr5Zei4Rhd7p2_cW6Zw,0.600524
3,ChIJ51fI4pNXei4RvPYohabbBEU,0.598739
4,ChIJB7kWsMXlei4RXXZsvIJl5H4,0.597854
5,ChIJHfSeNgNZei4RT3WiyG3HX_U,0.593517
6,ChIJa4xFOZNZei4RddSVY-39S8s,0.593327
7,ChIJAd_fLEOzey4REvhX95u2-sc,0.572277
8,ChIJW3qqdF9Zei4R__z2cQjHXQA,0.570604
9,ChIJxbpgRkhYei4Ri0J5gvxo4CE,0.569101


In [68]:
# Build a single Place_ID -> Place_Name lookup instead of re-filtering `df`
# once per ranked ID. `drop_duplicates` keeps the first row for each Place_ID,
# which is the same row the previous `.values[0]` lookup selected.
place_name_by_id = (
    df.drop_duplicates(subset='Place_ID')
    .set_index('Place_ID')['Place_Name']
)

# Place names in ranking order (best match first); an ID missing from `df` raises KeyError.
recommended_places = place_name_by_id.loc[sorted_similarity_df['Place_ID'].tolist()].tolist()

In [70]:
# Names of the 10 highest-ranked recommendations.
recommended_places[:10]

['Java INDO Cafe & Resto',
 'Kedai SUKA_SUKA Prambanan',
 'Nyore Coffee & Space',
 'Mamahké Jogja',
 'kopi 01',
 'Barley and Barrel',
 'Goodwin Bar and Eatery',
 'Mrikiniki Angkringan And Gallery',
 'Kuliner Lor Tugu',
 'Eboni Bar And Lounge']