# Create Vectorized Text Matrices Files

<div class="alert alert-block alert-danger">
<b>Future Usage of Vectorized Matrices Files Inadvisable</b><br>
Future models should use vectorizers that apply <code>max_features</code> reduction during construction to prevent future overdetermination</div>

In [1]:
# LIBRARIES

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
import scipy.sparse
import spacy

import warnings
warnings.filterwarnings('ignore')

<br><br>

In [33]:
# DATA IMPORT

# Read the expanded MBTI CSV (path is relative to the notebook's working
# directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../data/cleaned/expanded_mbti_df.csv')

In [34]:
# Display the column labels of the loaded DataFrame.
data.columns

Index(['type', 'posts', 'comp_score', 'neg_score', 'neu_score', 'pos_score',
       'post_count', 'avg_word_count', 'posts_cleaned', 'cleaned_comp_score',
       'cleaned_neg_score', 'cleaned_neu_score', 'cleaned_pos_score',
       'post_count_cleaned', 'avg_word_count_cleaned', 'posts_no_digits',
       'post_count_no_digits', 'avg_word_count_no_digits', 'posts_no_punct',
       'no_punct_comp_score', 'no_punct_neg_score', 'no_punct_neu_score',
       'no_punct_pos_score', 'diff_post_count_init-cleaned',
       'diff_word_count_init-cleaned', 'diff_post_count_cleaned-no_digits',
       'diff_word_count_cleaned-no_digits', 'diff_post_count_init-no_digits',
       'diff_word_count_init-no_digits', 'diff_comp_init-clean',
       'diff_comp_clean-no_punct', 'diff_comp_init-no_punct', 'E_I', 'N_S',
       'F_T', 'J_P', 'E_I_code', 'N_S_code', 'F_T_code', 'J_P_code',
       'type_code'],
      dtype='object')

In [36]:
# Print a per-column summary (non-null counts and dtypes) of the DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 41 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   type                               8675 non-null   object 
 1   posts                              8675 non-null   object 
 2   comp_score                         8675 non-null   float64
 3   neg_score                          8675 non-null   float64
 4   neu_score                          8675 non-null   float64
 5   pos_score                          8675 non-null   float64
 6   post_count                         8675 non-null   int64  
 7   avg_word_count                     8675 non-null   int64  
 8   posts_cleaned                      8675 non-null   object 
 9   cleaned_comp_score                 8675 non-null   float64
 10  cleaned_neg_score                  8675 non-null   float64
 11  cleaned_neu_score                  8675 non-null   float

In [30]:
# Build the custom stop-word list: one entry per line of the text file,
# with trailing whitespace (including the newline) stripped.
with open('../data/full_stopwords.txt', 'r') as filehandle:
    custom_stopwords = [line.rstrip() for line in filehandle]

In [74]:
def custom_tokenizer(text):
    """Split *text* into spaCy lemmas, leaving out punctuation tokens.

    The ``en_core_web_lg`` pipeline is loaded on the first call only and is
    reused afterwards. A vectorizer invokes its tokenizer once per document,
    so loading the model inside every call would repeat the same expensive
    setup for each document.

    Parameters
    ----------
    text : str
        Raw document to tokenize.

    Returns
    -------
    list of str
        The ``lemma_`` of every token whose ``is_punct`` flag is false, in
        document order.
    """
    # Cache the loaded pipeline on the function object so repeated calls
    # share one model without introducing another module-level name.
    nlp = getattr(custom_tokenizer, '_nlp', None)
    if nlp is None:
        nlp = spacy.load('en_core_web_lg')
        custom_tokenizer._nlp = nlp
    return [token.lemma_ for token in nlp(text) if not token.is_punct]

<br><br>

### Create CountVectorized matrix without stopwords

In [77]:
# Count vectorizer that tokenizes with the spaCy lemmatizing tokenizer and
# filters tokens against the custom stop-word list.
cv = CountVectorizer(
    tokenizer=custom_tokenizer,
    stop_words=custom_stopwords,
)

In [79]:
# Learn the vocabulary from the 'posts_no_punct' column and build the
# document-term count matrix in one pass.
cv_vect = cv.fit_transform(data['posts_no_punct'])

In [80]:
# Persist the count matrix in SciPy's .npz format; the relative file name
# means it is written to the current working directory.
scipy.sparse.save_npz('countvect_matrix_lemma.npz', cv_vect)

<br><br>

### Create TFIDF-Vectorized matrix without stopwords

In [81]:
# TF-IDF vectorizer configured with the same lemmatizing tokenizer and
# custom stop-word list as the count vectorizer.
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    stop_words=custom_stopwords,
)

In [82]:
# Learn the vocabulary and IDF weights from the 'posts_no_punct' column and
# build the TF-IDF matrix in one pass.
tfidf_vect = tfidf.fit_transform(data['posts_no_punct'])

In [83]:
# Persist the TF-IDF matrix in SciPy's .npz format; the relative file name
# means it is written to the current working directory.
scipy.sparse.save_npz('tfidfvect_matrix_lemma.npz', tfidf_vect)

<br><br>

### Create spaCy Word-Embedding Vectorized matrix without stopwords

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
from spacy.language import Language

In [23]:
def _stopword_pipeline():
    """Return the shared ``en_core_web_lg`` pipeline, loading it on first use."""
    nlp = getattr(_stopword_pipeline, '_nlp', None)
    if nlp is None:
        nlp = spacy.load("en_core_web_lg")
        _stopword_pipeline._nlp = nlp
    return nlp


@Language.component("stopwords_component")
def custom_component(doc):
    """Rebuild *doc* from its lower-cased tokens that are not custom stop words.

    Registered with spaCy under the name ``"stopwords_component"``. The
    surviving tokens are joined with single spaces and parsed again by an
    unmodified ``en_core_web_lg`` pipeline; that new document is returned in
    place of the incoming one.

    The inner pipeline is loaded once and reused, because this component runs
    for every document and reloading the model each time is needlessly slow.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Document handed over by the enclosing pipeline.

    Returns
    -------
    spacy.tokens.Doc
        Freshly parsed document containing only the retained tokens.
    """
    # Build the set per call so later changes to the module-level
    # ``custom_stopwords`` list are still honoured, while each token lookup
    # is constant-time instead of a scan of the whole list.
    stopword_set = set(custom_stopwords)
    lowered_tokens = (token.text.lower() for token in doc)
    kept = [lowered for lowered in lowered_tokens if lowered not in stopword_set]
    return _stopword_pipeline()(" ".join(kept))

In [24]:
class WordVectorTransformer(TransformerMixin, BaseEstimator):
    """Scikit-learn style transformer that maps documents to spaCy vectors.

    Parameters
    ----------
    model : str, default 'en_core_web_lg'
        Name passed to ``spacy.load`` when ``transform`` is called.
    """

    def __init__(self, model='en_core_web_lg'):
        self.model = model

    def fit(self, X, y=None):
        """Do nothing and return this transformer; there is no state to learn."""
        return self

    def transform(self, X):
        """Return one row per document in *X* holding that document's vector.

        The spaCy model is loaded here, and the registered
        ``'stopwords_component'`` is inserted after ``'tok2vec'`` before any
        document is processed.
        """
        pipeline = spacy.load(self.model)
        pipeline.add_pipe('stopwords_component', after='tok2vec')

        # Each document vector becomes a single-row 2-D array so the rows can
        # be stacked along the first axis.
        row_vectors = []
        for document in X:
            row_vectors.append(pipeline(document).vector.reshape(1, -1))
        return np.concatenate(row_vectors)

In [37]:
# Instantiate the transformer with its default model name ('en_core_web_lg').
wvt = WordVectorTransformer()

In [38]:
# NOTE: fit() returns the transformer itself, so after this line wvt_vect
# holds the estimator object, not a matrix of document vectors.
wvt_vect = wvt.fit(data['posts_no_punct'])

In [None]:
# Embed every entry of the 'posts_no_punct' column; the result is a NumPy
# array with one row per document.
wvt_vect = wvt.transform(data['posts_no_punct'])

In [None]:
# transform() returns a dense NumPy array, but save_npz only accepts SciPy
# sparse matrices. Convert to CSR first so the file is written in the same
# .npz format as the other matrices.
scipy.sparse.save_npz('spacyvect_matrix_lemma.npz', scipy.sparse.csr_matrix(wvt_vect))