In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
import numpy as np

In [13]:
def save_vectors(train_vectors, dev_vectors, test_vectors, train_name, dev_name, test_name, save_path):
    """Write the train/dev/test arrays to ``save_path`` with ``np.save``.

    Parameters
    ----------
    train_vectors, dev_vectors, test_vectors : array-like
        Data passed to ``np.save`` for the corresponding split.
    train_name, dev_name, test_name : str
        File name of each split, joined onto ``save_path``.
    save_path : str
        Destination directory. It is created (with parents) when missing.
        An empty string is left untouched, so files land in the current
        working directory exactly as before.
    """
    # np.save does not create intermediate directories; without this a fresh
    # checkout that lacks the output folder fails with FileNotFoundError.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for file_name, vectors in (
        (train_name, train_vectors),
        (dev_name, dev_vectors),
        (test_name, test_vectors),
    ):
        np.save(os.path.join(save_path, file_name), vectors)

## Loading Data

In [2]:
# The preprocessed CSVs are expected under <parent of cwd>/Data/PreprocessedData.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Build the relative path from its components instead of the literal
# 'Data\PreprocessedData': '\P' is an invalid escape sequence (a warning on
# recent Python versions) and a backslash is only a separator on Windows.
folder_path = os.path.join('Data', 'PreprocessedData')
net_path = os.path.join(parent_dir, folder_path)

In [3]:
# Read the preprocessed English train / dev / test splits from net_path,
# in that order, into one DataFrame per split.
df_train_english, df_dev_english, df_test_english = (
    pd.read_csv(os.path.join(net_path, csv_name))
    for csv_name in (
        'english_train_preprocess.csv',
        'english_dev_preprocess.csv',
        'english_test_preprocess.csv',
    )
)

In [4]:
# Pull the preprocessed text of each split out as a plain list of strings.
# pd.read_csv parses empty cells as NaN by default, and TfidfVectorizer refuses
# NaN documents, so missing text is replaced with '' (rather than dropped) to
# keep every row aligned with its original position in the DataFrame.
train_english_sentences = df_train_english['preprocessed_text'].fillna('').astype(str).tolist()
dev_english_sentences = df_dev_english['preprocessed_text'].fillna('').astype(str).tolist()
test_english_sentences = df_test_english['preprocessed_text'].fillna('').astype(str).tolist()

### TF-IDF

In [5]:
# Fit the vectorizer on the training sentences only, then reuse that fitted
# vectorizer to transform all three splits.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_english_sentences)

train_english_tfidf, dev_english_tfidf, test_english_tfidf = (
    tfidf_vectorizer.transform(sentences)
    for sentences in (
        train_english_sentences,
        dev_english_sentences,
        test_english_sentences,
    )
)

In [6]:
# Convert the TF-IDF matrices to dense arrays (the PCA cell and np.save below
# consume them under these same names).
# The hasattr guard makes this cell safe to re-run: once a variable already
# holds an ndarray (which has no .toarray()) it is passed through unchanged
# instead of raising AttributeError.
train_english_tfidf, dev_english_tfidf, test_english_tfidf = (
    matrix.toarray() if hasattr(matrix, 'toarray') else matrix
    for matrix in (train_english_tfidf, dev_english_tfidf, test_english_tfidf)
)

### PCA

In [11]:
# PCA
from sklearn.decomposition import PCA

# Reduce the dense TF-IDF features to 1000 principal components.
# random_state is pinned so the saved embeddings are reproducible: depending on
# input size, PCA's default solver selection may use a randomized SVD, whose
# output otherwise varies from run to run.
pca = PCA(n_components=1000, random_state=42)
# Fit on the training split only; dev/test are projected with the same components.
pca.fit(train_english_tfidf)

train_english_tfidf_pca = pca.transform(train_english_tfidf)
dev_english_tfidf_pca = pca.transform(dev_english_tfidf)
test_english_tfidf_pca = pca.transform(test_english_tfidf)

In [14]:
# Output directory for the precomputed embeddings. The path is joined from its
# components rather than written as 'Word_Embeddings\Pre Computed Word Embeddings':
# '\P' is an invalid escape sequence and the backslash only separates on Windows.
save_path = os.path.join(parent_dir, 'Word_Embeddings', 'Pre Computed Word Embeddings')

In [15]:
# Save the PCA-reduced TF-IDF vectors for all three splits.
save_vectors(
    train_vectors=train_english_tfidf_pca,
    dev_vectors=dev_english_tfidf_pca,
    test_vectors=test_english_tfidf_pca,
    train_name='train_english_tfidf_pca.npy',
    dev_name='dev_english_tfidf_pca.npy',
    test_name='test_english_tfidf_pca.npy',
    save_path=save_path,
)

In [16]:
# Save the full (non-reduced) dense TF-IDF vectors for all three splits.
save_vectors(
    train_vectors=train_english_tfidf,
    dev_vectors=dev_english_tfidf,
    test_vectors=test_english_tfidf,
    train_name='train_english_tfidf.npy',
    dev_name='dev_english_tfidf.npy',
    test_name='test_english_tfidf.npy',
    save_path=save_path,
)