## ***Experiment 3: Feature Extraction - Introduction to Word Vectorization (One Hot Encoding, Bag of Words (BOW), Count Vectorizer, TF-IDF, Word2Vec, FastText, GloVe)***
<hr>

**Name:** Aayusha Bhatia (22070122004), Ayan Jain (22070122040)  
**Lab:** NLP Lab-1

In [1]:
!pip install gensim



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec, FastText, KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

# Fetch the NLTK resources used by the preprocessing step.
# Newer NLTK releases load 'punkt_tab' inside word_tokenize instead of the pickled 'punkt' models,
# so both are requested; 'punkt' keeps older releases working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

df = pd.read_csv("dataset.csv")
# Columns that are run through preprocess_text below.
text_columns = ['artists', 'album_name', 'track_name', 'track_genre']

# text preprocessing
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests per token
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw text cell into a space-joined string of processed tokens.

    Steps, in order: lowercase; delete every character that is not an ASCII
    lowercase letter or whitespace; tokenize with ``word_tokenize``; drop
    tokens found in ``stop_words``; stem each remaining token with
    ``stemmer``; lemmatize each stem with ``lemmatizer``.

    Args:
        text: A single cell value. Missing values (per ``pd.isnull``) yield
            ``""``; non-string values are converted with ``str`` first.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""
    # Coerce before lowercasing: a cell parsed as a number has no .lower().
    text = str(text).lower()
    # Disallowed characters are deleted, not replaced by a space,
    # so "alt-rock" becomes "altrock" rather than two tokens.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(stemmer.stem(w))
        for w in tokens
        if w not in stop_words
    ]
    return " ".join(tokens)

# Add a "<column>_clean" companion for every text column, remembering the new names.
clean_columns = []
for col in text_columns:
    clean_name = col + "_clean"
    df[clean_name] = df[col].apply(preprocess_text)
    clean_columns.append(clean_name)

# Join the cleaned columns row-wise (space separated) into one column for feature extraction
df['combined_text'] = df[clean_columns].agg(' '.join, axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# ONE HOT ENCODING
# Encode the categorical `track_genre` column as one indicator column per distinct genre.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoded = ohe.fit_transform(df[['track_genre']])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out(['track_genre']))
print("\n📌 ONE HOT ENCODING")
print("Shape:", one_hot_df.shape)
print("Feature Names:", one_hot_df.columns.tolist()[:10], "...")
print(one_hot_df.head(5))

# BAG OF WORDS (BOW)
# Raw term counts over the full vocabulary of `combined_text`.
cv = CountVectorizer()
bow_matrix = cv.fit_transform(df['combined_text'])
# Keep the document-term matrix sparse: a dense copy needs n_documents x vocabulary_size
# cells, which can exhaust memory, while almost every entry is zero.
bow_df = pd.DataFrame.sparse.from_spmatrix(bow_matrix, columns=cv.get_feature_names_out())
print("\n📌 BAG OF WORDS")
print("Shape:", bow_df.shape)
print("Feature Names:", cv.get_feature_names_out()[:10], "...")
print(bow_df.head(5))

# COUNT VECTORIZER
# Same term counts as above, but min_df=2 drops terms that occur in fewer than two documents.
count_vec = CountVectorizer(min_df=2)
count_matrix = count_vec.fit_transform(df['combined_text'])
# Keep the matrix sparse: a dense copy needs n_documents x vocabulary_size cells.
count_df = pd.DataFrame.sparse.from_spmatrix(count_matrix, columns=count_vec.get_feature_names_out())
print("\n📌 COUNT VECTORIZER (min_df=2)")
print("Shape:", count_df.shape)
print("Feature Names:", count_vec.get_feature_names_out()[:10], "...")
print(count_df.head(5))

# TF-IDF
# Term frequencies re-weighted by inverse document frequency over `combined_text`.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(df['combined_text'])
# Keep the matrix sparse: a dense copy needs n_documents x vocabulary_size cells.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vec.get_feature_names_out())
print("\n📌 TF-IDF")
print("Shape:", tfidf_df.shape)
print("Feature Names:", tfidf_vec.get_feature_names_out()[:10], "...")
print(tfidf_df.head(5))

# WORD2VEC
# Each document is the whitespace-split token list of its `combined_text`.
tokenized_texts = [t.split() for t in df['combined_text']]
# min_count=1 keeps every token, including those that occur only once.
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
print("\n📌 WORD2VEC")
print("Vocabulary Size:", len(w2v_model.wv))
# index_to_key is already a list, so it can be sliced and indexed without copying.
print("Sample Words:", w2v_model.wv.index_to_key[:10], "...")
print("Vector for first word:\n", w2v_model.wv[w2v_model.wv.index_to_key[0]])

# FASTTEXT
# Trained on the same token lists as Word2Vec, with the same hyper-parameters.
fasttext_model = FastText(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
print("\n📌 FASTTEXT")
print("Vocabulary Size:", len(fasttext_model.wv))
# index_to_key is already a list, so it can be sliced and indexed without copying.
print("Sample Words:", fasttext_model.wv.index_to_key[:10], "...")
print("Vector for first word:\n", fasttext_model.wv[fasttext_model.wv.index_to_key[0]])

# GloVe
# Load pre-trained vectors from a local text file into a {word: float32 vector} dict.
# Each non-blank line is parsed as a word followed by its whitespace-separated components.
try:
    glove_path = "glove.6B.100d.txt"
    glove_model = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_model[word] = vector
    print("\n📌 GloVe")
    print("Vocabulary Size:", len(glove_model))
    sample_words = list(glove_model.keys())[:10]
    print("Sample Words:", sample_words, "...")
    if sample_words:
        # Guarded so an empty file reports size 0 rather than raising IndexError.
        print("Vector for first word:\n", glove_model[sample_words[0]])
except FileNotFoundError:
    # The vectors are an optional external download; report and continue.
    print("\n⚠️ GloVe file not found. Please download glove.6B.100d.txt")

# Persist the frame, including the added "*_clean" and "combined_text" columns.
# The feature matrices and embedding models built above are not written to this file.
df.to_csv("saved_dataset.csv", index=False)
print("\n✅ Preprocessing & Feature Extraction Done. Saved to saved_dataset.csv")



📌 ONE HOT ENCODING
Shape: (114000, 114)
Feature Names: ['track_genre_acoustic', 'track_genre_afrobeat', 'track_genre_alt-rock', 'track_genre_alternative', 'track_genre_ambient', 'track_genre_anime', 'track_genre_black-metal', 'track_genre_bluegrass', 'track_genre_blues', 'track_genre_brazil'] ...
   track_genre_acoustic  track_genre_afrobeat  track_genre_alt-rock  \
0                   1.0                   0.0                   0.0   
1                   1.0                   0.0                   0.0   
2                   1.0                   0.0                   0.0   
3                   1.0                   0.0                   0.0   
4                   1.0                   0.0                   0.0   

   track_genre_alternative  track_genre_ambient  track_genre_anime  \
0                      0.0                  0.0                0.0   
1                      0.0                  0.0                0.0   
2                      0.0                  0.0              