In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [2]:
# Read the cleaned articles and their lemmatized counterpart from CSV,
# in that order, into two separate DataFrames.
articles, articles_lemma = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/articles_clean.csv', './data/articles_lemmatized.csv')
)

In [3]:
# German stop words taken from the NLTK stopwords corpus; they are handed to
# the TF-IDF vectorizer so these tokens are dropped from the vocabulary.
# TODO: check whether the articles contain text in other languages — only
# German stop words are filtered here.
stopwords_list = stopwords.words('german')

In [4]:
# Word-level TF-IDF over uni-, bi- and tri-grams with the German stop words
# removed. A float min_df is a document-frequency proportion (terms must occur
# in at least 1% of documents); max_df=1.0 applies no upper cut-off; the
# vocabulary is capped at the 5000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords_list,
    analyzer='word',
    ngram_range=(1, 3),
    max_features=5000,
    min_df=0.01,
    max_df=1.0,
)


In [5]:
def vectorize(vectorizer, data):
    """Fit ``vectorizer`` on ``data`` and return the matrix plus its feature names.

    Parameters
    ----------
    vectorizer : object
        Any object exposing ``fit_transform(data)`` and either
        ``get_feature_names_out()`` (scikit-learn >= 1.0) or the legacy
        ``get_feature_names()``. The vectorizer is (re)fitted in place.
    data : iterable
        Documents, passed unchanged to ``vectorizer.fit_transform``.

    Returns
    -------
    tuple
        ``(matrix, feature_names)`` where ``matrix`` is whatever
        ``fit_transform`` returns and ``feature_names`` is a NumPy string
        array of the fitted feature names.
    """
    matrix = vectorizer.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older installations.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names

    # get_feature_names_out() yields an object-dtype array; forcing a string
    # dtype keeps the result identical to the legacy list-based path, so a
    # later np.save does not have to pickle it.
    feature_names = np.array(names_getter(), dtype=str)
    return matrix, feature_names

In [6]:
# vectorization
# Each vectorize() call refits the shared tfidf_vectorizer via fit_transform,
# so the two results have independent vocabularies; after this cell the
# vectorizer object itself holds the fit from the last call (lemmatized text).
matrix, feat_names = vectorize(tfidf_vectorizer, articles['content'])
# Fixed: the lemmatized matrix was previously built from articles['content'],
# making it a duplicate of the non-lemmatized one.
# TODO confirm: assumes articles_lemmatized.csv also stores its text in a
# 'content' column.
matrix_lemma, feat_names_lemma = vectorize(tfidf_vectorizer, articles_lemma['content'])

In [7]:
# Persist both TF-IDF matrices (plain and lemmatized) in SciPy's .npz format.
sparse.save_npz(file="./vectorized/vec_matrix.npz", matrix=matrix)
sparse.save_npz(file="./vectorized/vec_matrix_lemma.npz", matrix=matrix_lemma)

In [8]:
# Persist the feature-name arrays that belong to the two saved matrices.
np.save(file='./vectorized/vec_names.npy', arr=feat_names)
np.save(file='./vectorized/vec_names_lemma.npy', arr=feat_names_lemma)