In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
import os

# Move to parent directory
os.chdir("..")

from data.speeches import Speeches

In [5]:
speeches = Speeches()

In [9]:
def load_models(speech_text: pd.Series):
    all_models = {
        'tfidf': TfidfVectorizer(max_features=1000, stop_words="english", ngram_range=(1, 4)).fit(speech_text),
        'word2vec': spacy.load('en_core_web_sm'),
        'sentence2vec': SentenceTransformer('all-MiniLM-L6-v2'),
        'instructor': INSTRUCTOR('hkunlp/instructor-base')
    }
    return all_models

In [10]:
models = load_models(speeches.speeches_long['text'])

load INSTRUCTOR_Transformer
max_seq_length  512


In [11]:
speech_text = speeches.speeches_long['text']

In [26]:
def get_tfidf_features(speech_text: pd.Series):
    tfidf_features = models['tfidf'].transform(speech_text).toarray()
    tfidf_features = pd.DataFrame(tfidf_features, columns=models['tfidf'].get_feature_names_out())
    return tfidf_features

def get_word2vec_features(speech_text: pd.Series):
    word2vec_features = np.array([models['word2vec'](speech).vector for speech in speech_text])
    print(word2vec_features.shape)
    num_cols = word2vec_features.shape[1]
    word2vec_features = pd.DataFrame(word2vec_features, columns=[f"word2vec_{i}" for i in range(num_cols)])
    return word2vec_features

def get_sentence2vec_features(speech_text: pd.Series):
    sentence2vec_features = models['sentence2vec'].encode(speech_text)
    num_cols = sentence2vec_features.shape[1]
    sentence2vec_features = pd.DataFrame(sentence2vec_features, columns=[f"sentence2vec_{i}" for i in range(num_cols)])
    return sentence2vec_features

def get_instructor_features(speech_text: pd.Series):
    instruction = "Represent the presidential speech:"
    instructor_features = models['instructor'].encode([[instruction, x] for x in speech_text])
    num_cols = instructor_features.shape[1]
    instructor_features = pd.DataFrame(instructor_features, columns=[f"instructor_{i}" for i in range(num_cols)])
    return instructor_features

def get_features(speech_text, long=True, feature_type="tfidf", **kwargs):
    if feature_type == "tfidf":
        features = get_tfidf_features(speech_text)
    elif feature_type == "word2vec":
        features = get_word2vec_features(speech_text)
    elif feature_type == "sentence2vec":
        features = get_sentence2vec_features(speech_text)
    elif feature_type == "instructor":
        features = get_instructor_features(speech_text)
    else:
        raise ValueError("feature_type must be one of tfidf, word2vec, sentence2vec, or instructor")

    return features

In [27]:
get_features(speech_text[:10], feature_type='word2vec')

(10,)


  import sys


IndexError: tuple index out of range