In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [2]:
#pip install gensim


In [3]:
data = pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [5]:
data.describe(include='all')

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
count,11914,11914,11914.0,11911,11845.0,11884.0,11914,11914,11908.0,8172,11914,11914,11914.0,11914.0,11914.0,11914.0
unique,48,915,,10,,,5,4,,71,3,16,,,,
top,Chevrolet,Silverado 1500,,regular unleaded,,,AUTOMATIC,front wheel drive,,Crossover,Compact,Sedan,,,,
freq,1123,156,,7172,,,8266,4787,,1110,4764,3048,,,,
mean,,,2010.384338,,249.38607,5.628829,,,3.436093,,,,26.637485,19.733255,1554.911197,40594.74
std,,,7.57974,,109.19187,1.780559,,,0.881315,,,,8.863001,8.987798,1441.855347,60109.1
min,,,1990.0,,55.0,0.0,,,2.0,,,,12.0,7.0,2.0,2000.0
25%,,,2007.0,,170.0,4.0,,,2.0,,,,22.0,16.0,549.0,21000.0
50%,,,2015.0,,227.0,6.0,,,4.0,,,,26.0,18.0,1385.0,29995.0
75%,,,2016.0,,300.0,6.0,,,4.0,,,,30.0,22.0,2009.0,42231.25


In [6]:
#Removing the NAN Values
data.dropna(inplace=True)
data.reset_index(inplace=True)
data.isnull().sum()


index                0
Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Market Category      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

In [7]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return tokens
data['Market Category'] = data['Market Category'].apply(preprocess)
data['Market Category']

0             [factory, tuner, luxury]
1                [luxury, performance]
2                             [luxury]
3                [luxury, performance]
4                             [luxury]
                     ...              
8079    [crossover, hatchback, luxury]
8080    [crossover, hatchback, luxury]
8081    [crossover, hatchback, luxury]
8082    [crossover, hatchback, luxury]
8083                          [luxury]
Name: Market Category, Length: 8084, dtype: object

In [8]:
#Perform the bag-of-words approach using CountVectorizer
count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
bow = count_vectorizer.fit_transform(data['Market Category'])
print(bow.toarray()[:5, :10])

[[0 0 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 0]]




In [9]:
normalized_count = bow.copy()
for i, j in zip(*normalized_count.nonzero()):
    normalized_count[i, j] = normalized_count[i, j] / len(data['Market Category'][i])
print(normalized_count.toarray()[:5, :10]) # Print the first 5 rows and 10 columns of the matrix

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]]


In [10]:
#TF-IDF to calculate the importance of each word in the document
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
tfidf = tfidf_vectorizer.fit_transform(data['Market Category'])
print(tfidf.toarray()[:5, :10]) # Print the first 5 rows and 10 columns of the matrix

[[0.         0.         0.         0.6616931  0.         0.
  0.         0.         0.35259679 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.62784336 0.77833972]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.62784336 0.77833972]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]]


In [11]:
#Creating word embeddings using Word2Vec
model = Word2Vec(data['Market Category'], min_count=1)
embeddings = np.zeros((len(data), 100))
for i, tokens in enumerate(data['Market Category']):
    for token in tokens:
        embeddings[i] += model.wv[token]
    embeddings[i] /= len(tokens)

  embeddings[i] /= len(tokens)


In [12]:
embeddings

array([[-0.01081371,  0.0057243 ,  0.01559494, ..., -0.0384125 ,
         0.01002598,  0.00391788],
       [-0.01936158,  0.00981448,  0.01494137, ..., -0.03419592,
         0.00383476,  0.00652751],
       [-0.01813624,  0.00967213,  0.01674225, ..., -0.04211244,
         0.01080765,  0.00754971],
       ...,
       [-0.01264001,  0.00697713,  0.00522356, ..., -0.02376176,
         0.00932123, -0.00194311],
       [-0.01264001,  0.00697713,  0.00522356, ..., -0.02376176,
         0.00932123, -0.00194311],
       [-0.01813624,  0.00967213,  0.01674225, ..., -0.04211244,
         0.01080765,  0.00754971]])

In [23]:
pip install pandas scikit-learn gensim


Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [25]:
# Load the dataset
data = pd.read_csv('data.csv')

In [26]:
# Take a look at the dataset
print(data.head())

  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            MANUAL  rear wheel drive              2.0   
1               6.0            MANUAL  rear wheel drive              2.0   
2               6.0            MANUAL  rear wheel drive              2.0   
3               6.0            MANUAL  rear wheel drive              2.0   
4               6.0            MANUAL  rear wheel drive              2.0   

                         Market Category Vehicle Size Vehicle Style  \
0  Factory Tuner,Luxury,High-Performance      Compact         C

In [27]:
# Bag-of-Words
# Count occurrence
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(data['Model'])
bow_feature_names = count_vectorizer.get_feature_names_out()
print("\nBag-of-Words (Count occurrence):")
print(bow_matrix.toarray())
print("Feature names:", bow_feature_names)


Bag-of-Words (Count occurrence):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Feature names: ['10' '100' '124' '12c' '15' '150' '1500' '1500hd' '16' '190' '200' '200h'
 '200sx' '200t' '240' '240sx' '250' '2500' '250h' '2x' '300' '3000gt'
 '300h' '300m' '300zx' '323' '330' '350' '350z' '360' '370z' '400' '400h'
 '420' '430' '450' '450h' '456m' '458' '460' '470' '4c' '4runner' '4x'
 '50' '500' '500e' '500l' '500x' '550' '560' '57' '570' '570s' '575m'
 '599' '600' '6000' '600h' '612' '62' '626' '650s' '718' '740' '760' '780'
 '7x' '80' '850' '86' '90' '900' '9000' '911' '928' '929' '940' '944'
 '960' '968' 'a3' 'a4' 'a5' 'a6' 'a7' 'a8' 'acadia' 'accent' 'acclaim'
 'accord' 'achieva' 'activehybrid' 'aerio' 'aerostar' 'alero' 'allante'
 'allroad' 'alltrack' 'alpina' 'altima' 'am' 'amanti' 'amg' 'and' 'armada'
 'arnage' 'aspen' 'aspire' 'astro' 'ats' 'aurora' 'avalanche' 'avalon'
 'avenger' 'aventador' 'avenue' 'aveo

In [29]:
# Normalized count occurrence
bow_matrix_dense = bow_matrix.toarray()
norms = np.linalg.norm(bow_matrix_dense, axis=1)
normalized_bow_matrix = bow_matrix_dense / norms[:, None]

print("\nBag-of-Words (Normalized count occurrence):")
print(normalized_bow_matrix)



Bag-of-Words (Normalized count occurrence):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


  normalized_bow_matrix = bow_matrix_dense / norms[:, None]


In [30]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Model'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nTF-IDF:")
print(tfidf_matrix.toarray())
print("Feature names:", tfidf_feature_names)


TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
Feature names: ['10' '100' '124' '12c' '15' '150' '1500' '1500hd' '16' '190' '200' '200h'
 '200sx' '200t' '240' '240sx' '250' '2500' '250h' '2x' '300' '3000gt'
 '300h' '300m' '300zx' '323' '330' '350' '350z' '360' '370z' '400' '400h'
 '420' '430' '450' '450h' '456m' '458' '460' '470' '4c' '4runner' '4x'
 '50' '500' '500e' '500l' '500x' '550' '560' '57' '570' '570s' '575m'
 '599' '600' '6000' '600h' '612' '62' '626' '650s' '718' '740' '760' '780'
 '7x' '80' '850' '86' '90' '900' '9000' '911' '928' '929' '940' '944'
 '960' '968' 'a3' 'a4' 'a5' 'a6' 'a7' 'a8' 'acadia' 'accent' 'acclaim'
 'accord' 'achieva' 'activehybrid' 'aerio' 'aerostar' 'alero' 'allante'
 'allroad' 'alltrack' 'alpina' 'altima' 'am' 'amanti' 'amg' 'and' 'armada'
 'arnage' 'aspen' 'aspire' 'astro' 'ats' 'aurora' 'avalanche' 'avalon'
 'avenger' 'aventador' 'av

In [31]:
# Word2Vec
# Tokenize the data
tokenized_data = [model.split() for model in data['Model']]

In [32]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)


In [33]:
# Get Word2Vec embeddings
word2vec_embeddings = [np.mean([word2vec_model.wv[token] for token in model], axis=0) for model in tokenized_data]
print("\nWord2Vec Embeddings:")
print(np.array(word2vec_embeddings))


Word2Vec Embeddings:
[[-0.00685435  0.0011278   0.00170766 ... -0.00022534 -0.00811109
  -0.00051412]
 [-0.00623394  0.00365357  0.00103452 ... -0.0020861  -0.01083789
   0.00341242]
 [-0.00623394  0.00365357  0.00103452 ... -0.0020861  -0.01083789
   0.00341242]
 ...
 [-0.00059128 -0.00600316 -0.00509228 ... -0.00755747 -0.00340728
  -0.00695989]
 [-0.00059128 -0.00600316 -0.00509228 ... -0.00755747 -0.00340728
  -0.00695989]
 [-0.00980532  0.00137023 -0.00658715 ... -0.00082586 -0.00216137
   0.00255175]]
