In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text  import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics  import f1_score,accuracy_score

In [13]:
# Read the train and test CSV files into DataFrames (train first, then test).
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset_train_pp.csv", "dataset_test_pp.csv")
)

In [3]:
# Row counts of the train and test frames, one per line.
print(len(train_set), len(test_set), sep="\n")

50000
5000


In [14]:
# Pull the input column ("Description") and the label column ("Class Index")
# out of each split.
train_x, train_y = train_set["Description"], train_set["Class Index"]
test_x, test_y = test_set["Description"], test_set["Class Index"]

### TF-IDF

In [5]:
# TF-IDF vectorizer from scikit-learn. Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#   min_df       -- ignore terms whose document frequency is lower than this value
#   max_features -- keep only the top `max_features` terms, ordered by term frequency
tfidf = TfidfVectorizer(
    min_df=10,
    max_features=8000,
)

In [6]:
%%time
# Fit the vocabulary / IDF weights on the training text only, then vectorize it.
train_vec = tfidf.fit_transform(raw_documents=train_x)
# Reuse the already-fitted vectorizer on the test text (no re-fitting here).
test_vec = tfidf.transform(raw_documents=test_x)

CPU times: user 5.53 s, sys: 92.8 ms, total: 5.62 s
Wall time: 5.63 s


In [7]:
# Inspect the learned vocabulary: its size, then the first and last ten terms.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` when the installed version provides it. That
# method returns an ndarray; `.tolist()` converts it to a plain list of str so the
# printed output keeps the same format as before.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    # Fallback for scikit-learn < 1.0, where only the old accessor exists.
    feature_names = tfidf.get_feature_names()

# Look the names up once instead of rebuilding the list for every print.
print("No. of features:", len(feature_names))
print(feature_names[:10])
print(feature_names[-10:])

No. of features: 8000
['aa', 'aaron', 'ab', 'abandon', 'abandoned', 'abbas', 'abbey', 'abbott', 'abc', 'abducted']
['zhong', 'zhou', 'zieminski', 'ziff', 'zimbabwe', 'zinc', 'zone', 'zoo', 'zoom', 'zurich']


In [8]:
# Convert the sparse TF-IDF matrices to dense NumPy arrays.
# Caution: this throws a memory error with a large dataset.
tfidf_train_x, tfidf_test_x = train_vec.toarray(), test_vec.toarray()

#### Save TF-IDF matrix to file

In [11]:
# Persist the dense training TF-IDF matrix in NumPy's .npy format.
np.save(file="tfidf_train_x.npy", arr=tfidf_train_x)

In [12]:
# Persist the dense test TF-IDF matrix in NumPy's .npy format.
np.save(file="tfidf_test_x.npy", arr=tfidf_test_x)

In [3]:
# Read the test TF-IDF matrix back from its .npy file.
tfidf_test_x = np.load(file="tfidf_test_x.npy")

In [2]:
# Read the training TF-IDF matrix back from its .npy file.
tfidf_train_x = np.load(file="tfidf_train_x.npy")

In [7]:
# Show the test TF-IDF array as the cell output (NumPy abbreviates the repr with "...").
tfidf_test_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Sanity-check the test matrix: container type, number of rows,
# length of the first row, and the full shape -- one value per line.
print(
    type(tfidf_test_x),
    len(tfidf_test_x),
    len(tfidf_test_x[0]),
    tfidf_test_x.shape,
    sep="\n",
)

<class 'numpy.ndarray'>
5000
8000
(5000, 8000)
