In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [3]:
df = pd.read_csv('/content/car_dataset.csv')

In [5]:
# Preprocess the text data by tokenizing and cleaning the text
nltk.download('punkt')
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['cleaned_text'] = df['Make'] + ' ' + df['Model'] + ' ' + df['Engine Fuel Type'] + ' ' + df['Transmission Type'] + ' ' + df['Driven_Wheels'] + ' ' + df['Market Category'] + ' ' + df['Vehicle Size'] + ' ' + df['Vehicle Style']
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join(tokenizer.tokenize(str(x).lower())))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Bag-of-Words (Count occurrence)
count_vectorizer = CountVectorizer()
count_occurrence = count_vectorizer.fit_transform(df['cleaned_text'])
print("Bag-of-Words (Count occurrence):\n", count_occurrence.toarray())

Bag-of-Words (Count occurrence):
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


In [21]:
# Bag-of-Words (Normalized count occurrence)
normalized_count_vectorizer = CountVectorizer()
normalized_count_occurrence = normalized_count_vectorizer.fit_transform(df['cleaned_text'])
normalized_count_occurrence = normalized_count_occurrence / normalized_count_occurrence.sum(axis=1).reshape(-1, 1)
print("Bag-of-Words (Normalized count occurrence):\n", normalized_count_occurrence)

Bag-of-Words (Normalized count occurrence):
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.06666667 0.        ]
 [0.         0.         0.         ... 0.         0.06666667 0.        ]
 [0.         0.         0.         ... 0.         0.         0.09090909]]


In [22]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
print("TF-IDF:\n", tfidf_matrix.toarray())

TF-IDF:
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.60490185 0.        ]
 [0.         0.         0.         ... 0.         0.60206913 0.        ]
 [0.         0.         0.         ... 0.         0.         0.76717912]]


In [23]:
# Word2Vec embeddings
sentences = [nltk.tokenize.word_tokenize(text) for text in df['cleaned_text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_embeddings = [word2vec_model.wv[word] for word in word2vec_model.wv.key_to_index]
print("Word2Vec embeddings:\n", word2vec_embeddings)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        0.01690506,  0.02640245, -0.00665089,  0.01465668, -0.02339124,
        0.03283738, -0.02771667, -0.00368943, -0.00044789, -0.00047185,
       -0.00149107,  0.02059451,  0.03338634, -0.02502267,  0.01654622,
        0.01812135, -0.00970425,  0.03985618, -0.00627562, -0.02915495,
       -0.02603225, -0.01803379, -0.08450749, -0.05193356, -0.01780581,
        0.00277754, -0.0339165 ,  0.00832359,  0.03676365, -0.03960546,
        0.05481447, -0.00226776,  0.03796133,  0.00928933, -0.00980184,
        0.03384171,  0.02347477,  0.0047242 , -0.04487516, -0.00446959,
       -0.02368598,  0.00073105, -0.03685532, -0.00518281, -0.03994422,
       -0.03012152,  0.00240532,  0.00572946, -0.03238533,  0.02397289,
        0.04248916, -0.04529782, -0.0108279 ,  0.0432061 ,  0.03566165,
        0.01583431,  0.00539003, -0.00521289, -0.04425322,  0.01424976,
       -0.00241351, -0.01674064,  0.03379578,  0.0139991 , -0.04731597,