# Task 1: Count-Vectorize the Following Sentences

In [1]:
# Corpus for Task 1: five book titles, each string treated as one document
# by the vectorizer in the next cell.
sentences = [
    "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library)",
    "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
    "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
    "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
    "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)",
]

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the titles and build the document-term count
# matrix in a single pass: one row per title, one column per distinct term.
vec = CountVectorizer()
X = vec.fit_transform(raw_documents=sentences)

In [3]:
# Densify the sparse count matrix and label each column with its vocabulary
# term. The frame gets a task-specific name so it cannot be confused with, or
# silently replaced by, the TF-IDF frame that a later cell binds to `df`.
count_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
count_df.head()

Unnamed: 0,about,all,are,beyond,body,bugs,can,cat,cent,cents,...,space,staying,system,that,the,there,things,two,you,your
0,1,1,0,0,0,0,0,1,3,1,...,0,0,0,0,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,0,0,...,0,0,0,0,2,0,0,0,0,1
2,1,1,1,0,0,0,1,1,0,0,...,0,1,0,1,2,0,1,0,2,0
3,1,1,0,1,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,0,0,1,0,0,...,1,0,1,0,1,1,0,0,0,0


# Task 2: Cosine Similarity

In [4]:
# Corpus for Task 2: four short documents. The first and the last contain
# exactly the same words, only in a different order and with different
# punctuation.
other_sentences = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Short alias for the Task 2 corpus; the following cell also uses it to label
# the rows and columns of the similarity table.
data = other_sentences

# Bag-of-words encoding: fit the vocabulary on the corpus and turn every
# document into a vector of term counts.
count_vectorizer = CountVectorizer()
vector_matrix = count_vectorizer.fit_transform(raw_documents=data)

In [6]:
# Pairwise cosine similarity of the document count vectors. With a single
# input the matrix is compared against itself, giving a square, symmetric
# result with 1.0 on the diagonal.
cosine_similarity_matrix = cosine_similarity(X=vector_matrix)

# Label both axes with the original sentences so each cell reads as
# "row document vs. column document". The first and last documents score 1.0
# against each other because word counts ignore word order and punctuation.
df_cosine = pd.DataFrame(cosine_similarity_matrix, index=data, columns=data)
df_cosine

Unnamed: 0,This is the first document.,This document is the second document.,And this is the third one.,Is this the first document?
This is the first document.,1.0,0.790569,0.547723,1.0
This document is the second document.,0.790569,1.0,0.433013,0.790569
And this is the third one.,0.547723,0.433013,1.0,0.547723
Is this the first document?,1.0,0.790569,0.547723,1.0


# Task 3: TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus for Task 3: three lowercase, punctuation-free documents.
last_sentences = [
    "data science is one of the most important fields of science",
    "this is one of the best data science courses",
    "data scientists analyze data",
]

In [8]:
# Fit a TF-IDF model on the Task 3 corpus and encode each document as a
# vector of TF-IDF weights (one column per vocabulary term).
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(raw_documents=last_sentences)

In [9]:
# Dense view of the TF-IDF matrix: rows are documents, columns are the
# vocabulary terms reported by the fitted vectorizer.
df = pd.DataFrame(
    data=result.toarray(),
    columns=tfidf.get_feature_names_out(),
)
df

Unnamed: 0,analyze,best,courses,data,fields,important,is,most,of,one,science,scientists,the,this
0,0.0,0.0,0.0,0.189526,0.320895,0.320895,0.244049,0.320895,0.488098,0.244049,0.488098,0.0,0.244049,0.0
1,0.0,0.400294,0.400294,0.23642,0.0,0.0,0.304434,0.0,0.304434,0.304434,0.304434,0.0,0.304434,0.400294
2,0.542701,0.0,0.0,0.641055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542701,0.0,0.0


In [10]:
# Highest-weighted term for each document (row-wise argmax over the columns).
# When several terms share the top weight, the first such column is reported;
# e.g. in document 0 "of" and "science" have equal weight and "of" is returned.
df.idxmax(axis="columns")

0      of
1    best
2    data
dtype: object