### Packages Needed
1. NumPy
2. pandas
3. scikit-learn

### Import Packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Read Dataset

In [2]:
# Load the corpus from train.csv (path is relative to the notebook's working directory).
df = pd.read_csv("train.csv")
# Preview the first rows; each row holds one document in the `text` column.
df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


In [3]:
# Count the missing values in each column.
print(df.isna().sum())

text    230
dtype: int64


In [4]:
# Drop the rows whose `text` value is NaN.
df = df.dropna(subset=["text"])

In [5]:
# Keep only the first 50 rows to keep the demo small.
df = df.head(50)

In [6]:
# Preview the frame after dropping NaN rows and truncating to 50 rows.
df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


### TF-IDF Search Engine

In [7]:
# Create a TF-IDF vectorizer with default settings.
# Nothing is computed yet: the vocabulary and the TF-IDF matrix are produced
# when fit_transform is called on the corpus.
vectorizer = TfidfVectorizer()

In [8]:
%%time
# Fit the vectorizer on the corpus and keep the TF-IDF representation of every document (one row per document).
X = vectorizer.fit_transform(df['text'])

CPU times: user 19.3 ms, sys: 9.18 ms, total: 28.5 ms
Wall time: 32.4 ms


In [9]:
print(X.shape)  # (number of documents, number of unique terms in the vocabulary)

(50, 789)


### Query Processing

In [10]:
# Free-text search query; it is scored against every document in the corpus.
query = "girl named Lily found a needle in her room. She knew"

In [11]:
%%time
# Project the query into the fitted TF-IDF space.
# One query in -> sparse matrix of shape (1, n_features).
query_vec = vectorizer.transform([query])
# cosine_similarity(X, query_vec) has shape (n_docs, 1); flatten it into a
# 1-D array holding one similarity score per document.
results = cosine_similarity(X, query_vec).ravel()

CPU times: user 3.24 ms, sys: 952 µs, total: 4.2 ms
Wall time: 4 ms


### Print Results

In [24]:
# Rank the documents by cosine similarity, best match first.
# argsort is ascending, so take the last 10 positions and reverse them.
top10 = results.argsort()[-10:][::-1]

print(results)  # similarity score of every document, in corpus order
print(top10)    # positional indices of the 10 best-matching documents
# Scores of the three best matches, looked up through the ranking rather than
# through hard-coded indices, so this line stays correct when the query or the
# corpus changes (and still works if fewer than three documents exist).
print(*results[top10[:3]])
# Optional: print the text of the top 10 results
# for i in top10:
#     print(f"{i + 1}. ", df.iloc[i, 0])

[0.52321854 0.02463798 0.00294634 0.00824288 0.26996753 0.0395428
 0.33253197 0.02496971 0.00954398 0.02808979 0.07206285 0.02086633
 0.04391447 0.06955948 0.00935825 0.06407749 0.02059047 0.01569318
 0.01812905 0.00476652 0.00342884 0.03896741 0.09652231 0.13729711
 0.04936799 0.02840666 0.01060027 0.00540256 0.11253664 0.19414775
 0.02751961 0.01897174 0.01041642 0.16108531 0.04567445 0.06749164
 0.0071582  0.04362062 0.03433662 0.03668542 0.06589963 0.01300109
 0.0384336  0.02763074 0.25581129 0.01368093 0.21008238 0.18614706
 0.00339429 0.0052587 ]
[ 0  6  4 44 46 29 47 33 23 28]
0.5232185388716842 0.3325319718493395 0.269967528634867


In [21]:
# Sanity check on the ranking: compares the full-precision score of document 23
# with that of document 33 (positions in the printed `results` array).
# False means document 33 scores higher, matching its earlier place in the top 10.
0.13729710674161555 > 0.1610853090078817

False