# Setup

## Imports

In [1]:
# General
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import os
import sys
# Add the path to the utils folder
sys.path.append(os.path.abspath('../..'))
import importlib
# Custom modules
from utils import memory_usage, save_pickle, load_pickle, save_np, load_np, file_exists, save_parquet, load_parquet
from config import run_config, token_pattern, MODELS_PATH, FEATURES_PATH, PROCESSED_DATA_PATH
importlib.reload(sys.modules['utils'])
importlib.reload(sys.modules['config'])

<module 'config' from 'e:\\College\\4- Senior 2\\Semester 1\\NLP\\Project\\config.py'>

## Config

In [3]:
run_config()

In [4]:
df_train = pd.read_parquet(PROCESSED_DATA_PATH + '/train_preprocessing.parquet')

# TF-IDF

In [6]:
update_feature = False
if update_feature or not file_exists(FEATURES_PATH + "/tfidf_features.npy") or not file_exists(MODELS_PATH + "/tfidf_vectorizer.pkl"):
    # TF-IDF feature extraction
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=token_pattern)
    tfidf_features = vectorizer.fit_transform(df_train["src"])    
    tfidf_embeddings = {word: tfidf_features[:, vectorizer.vocabulary_.get(word, 0)].toarray().reshape(-1) for word in vectorizer.get_feature_names_out()}
    save_pickle(MODELS_PATH + "/tfidf_vectorizer.pkl", vectorizer)
    save_np("tfidf_features.npy", tfidf_features, type="feature")
    # save_pickle(FEATURES_PATH + "/tfidf_embeddings.pkl", tfidf_embeddings)
    save_parquet(FEATURES_PATH + "/tfidf_embeddings.parquet", pd.DataFrame(tfidf_embeddings))
else:
    vectorizer = load_pickle(MODELS_PATH + "/tfidf_vectorizer.pkl")
    tfidf_features = load_np("tfidf_features.npy", type="feature").tolist()
    # tfidf_embeddings = load_pickle(FEATURES_PATH + "tfidf_embeddings.pkl")
    tfidf_embeddings = load_parquet(FEATURES_PATH + "/tfidf_embeddings.parquet").to_dict(orient="list")

In [7]:
tfidf_embeddings

{'1': array([0., 0., 0., ..., 0., 0., 0.]),
 '10': array([0., 0., 0., ..., 0., 0., 0.]),
 '11': array([0., 0., 0., ..., 0., 0., 0.]),
 '12': array([0., 0., 0., ..., 0., 0., 0.]),
 '13': array([0., 0., 0., ..., 0., 0., 0.]),
 '14': array([0., 0., 0., ..., 0., 0., 0.]),
 '15': array([0., 0., 0., ..., 0., 0., 0.]),
 '16': array([0., 0., 0., ..., 0., 0., 0.]),
 '2': array([0., 0., 0., ..., 0., 0., 0.]),
 '20': array([0., 0., 0., ..., 0., 0., 0.]),
 '200': array([0., 0., 0., ..., 0., 0., 0.]),
 '200-milliliter': array([0., 0., 0., ..., 0., 0., 0.]),
 '3': array([0., 0., 0., ..., 0., 0., 0.]),
 '4': array([0., 0., 0., ..., 0., 0., 0.]),
 '5': array([0., 0., 0., ..., 0., 0., 0.]),
 '500': array([0., 0., 0., ..., 0., 0., 0.]),
 '500-milliliter': array([0., 0., 0., ..., 0., 0., 0.]),
 '500-ml': array([0., 0., 0., ..., 0., 0., 0.]),
 '6': array([0., 0., 0., ..., 0., 0., 0.]),
 '7': array([0., 0., 0., ..., 0., 0., 0.]),
 '8': array([0., 0., 0., ..., 0., 0., 0.]),
 '9': array([0., 0., 0., ..., 0.,

In [16]:
tfidf_features.shape

(2100467, 355)

In [17]:
# vocab = vectorizer.get_feature_names_out()
# docterm = pd.DataFrame(tfidf_features.todense(), columns=vocab)

In [18]:
vectorizer.vocabulary_["i'd"]

141

In [19]:
tfidf_features[:, vectorizer.vocabulary_.get("you", 0)].toarray().reshape(-1)

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
memory_usage()

5283.5859375