# Applying TfidfVectorizer

### Import data and packages:

In [97]:
#import data frames from previous notebook

%store -r textdata

%store -r titledata

In [98]:
#import packages

import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split


### Apply TfidfVectorizer to text column:

In [99]:
# Corpus: the `text` column, with every entry coerced to str.
x_text = textdata["text"].astype(str)

# Target: the `label` column.
y_text = textdata["label"]

In [100]:
# Hold out 30% of the text rows for testing; the fixed random_state keeps
# the split reproducible between runs.
(x_text_train, x_text_test,
 y_text_train, y_text_test) = train_test_split(
    x_text, y_text, test_size=0.3, random_state=42
)

In [102]:
# Apply vectorizer (parameters to be tuned).
# The vocabulary and IDF weights are fitted on the training split only, so
# the test split does not influence the features.
# min_df=10: per the scikit-learn docs, terms that occur in fewer than 10
# documents are left out of the vocabulary.


vectorizer = TfidfVectorizer(min_df=10)
vectorizer.fit(x_text_train)


In [103]:
# Turn both text splits into TF-IDF matrices using the fitted vectorizer.
# NOTE: this rebinds x_text_train / x_text_test from raw text to matrices,
# so re-run the split cell before re-running the fit or this cell.
x_text_train, x_text_test = (
    vectorizer.transform(split) for split in (x_text_train, x_text_test)
)

In [104]:
# Terms kept in the fitted text vocabulary (used below as column labels
# for the TF-IDF matrix).
feature_names = vectorizer.get_feature_names_out()

In [105]:
# Quick look at the vocabulary; NumPy abbreviates long arrays, so only the
# first and last few terms are printed.
print(feature_names)

['____' '_____' 'aa' ... 'zurich' 'zwei' 'zwischen']


In [121]:
# Document-term table for the training text: one row per document, one
# column per vocabulary term, values are the TF-IDF weights.
# The matrix is kept sparse on purpose: calling .todense() would allocate
# rows x vocabulary floats at once, which can exhaust memory for a corpus
# of this size. A sparse-backed DataFrame displays the same values.
vocab = vectorizer.get_feature_names_out()
docterm = pd.DataFrame.sparse.from_spmatrix(x_text_train, columns=vocab)

In [107]:
# Preview the text document-term table (pandas abbreviates large frames,
# showing only the first/last rows and columns).
docterm

Unnamed: 0,____,_____,aa,aaa,aaplo,aaron,aaronkleinshow,aarp,ab,aba,...,zuckerbergs,zuesse,zulia,zulu,zum,zuma,zur,zurich,zwei,zwischen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Apply TfidfVectorizer to title column:

In [108]:
# Corpus: the `title` column, with every entry coerced to str.
x_title = titledata["title"].astype(str)

# Target: the `label` column.
y_title = titledata["label"]

In [109]:
# Hold out 30% of the title rows for testing; the fixed random_state keeps
# the split reproducible between runs.
(x_title_train, x_title_test,
 y_title_train, y_title_test) = train_test_split(
    x_title, y_title, test_size=0.3, random_state=42
)

In [110]:
# Apply vectorizer to the titles, with the same min_df=10 setting as the
# text vectorizer. It is fitted on the training titles only, so the test
# split does not influence the vocabulary or IDF weights.
vectorizer2 = TfidfVectorizer(min_df=10)
vectorizer2.fit(x_title_train)


In [111]:
# Turn both title splits into TF-IDF matrices using the fitted vectorizer2.
# NOTE: this rebinds x_title_train / x_title_test from raw titles to
# matrices, so re-run the split cell before re-running the fit or this cell.
x_title_train, x_title_test = (
    vectorizer2.transform(split) for split in (x_title_train, x_title_test)
)

In [113]:
# Terms kept in the fitted title vocabulary.
# NOTE: this reuses the name `feature_names`, which held the text
# vocabulary earlier in the notebook.
feature_names = vectorizer2.get_feature_names_out()

In [114]:
# Quick look at the title vocabulary; NumPy abbreviates long arrays, so
# only the first and last few terms are printed.
print(feature_names)

['abadi' 'abandon' 'abbas' ... 'zone' 'zuckerberg' 'zuma']


In [115]:
# Document-term table for the training titles: one row per title, one
# column per vocabulary term, values are the TF-IDF weights.
# The matrix is kept sparse on purpose: calling .todense() would allocate
# rows x vocabulary floats at once, which is wasteful for a matrix that is
# almost entirely zeros. A sparse-backed DataFrame displays the same values.
vocab2 = vectorizer2.get_feature_names_out()
docterm2 = pd.DataFrame.sparse.from_spmatrix(x_title_train, columns=vocab2)

In [116]:
# Preview the first 20 rows of the title document-term table.
docterm2.head(20)

Unnamed: 0,abadi,abandon,abbas,abc,abduct,abdullah,abe,abedin,able,abortion,...,youtube,yr,zealand,zero,zika,zimbabwe,zionist,zone,zuckerberg,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create and store variables for later use:

In [117]:
# Give the text splits TF-IDF-specific names before storing them.
x_text_train_tf = x_text_train
x_text_test_tf = x_text_test
y_text_train_tf = y_text_train
y_text_test_tf = y_text_test

In [118]:
# Persist the TF-IDF text splits with IPython's %store so that another
# notebook can load them back with `%store -r <name>`.
%store x_text_train_tf
%store x_text_test_tf
%store y_text_train_tf
%store y_text_test_tf

Stored 'x_text_train_tf' (csr_matrix)
Stored 'x_text_test_tf' (csr_matrix)
Stored 'y_text_train_tf' (Series)
Stored 'y_text_test_tf' (Series)


In [119]:
# Give the title splits TF-IDF-specific names before storing them.
x_title_train_tf = x_title_train
x_title_test_tf = x_title_test
y_title_train_tf = y_title_train
y_title_test_tf = y_title_test

In [120]:
# Persist the TF-IDF title splits with IPython's %store so that another
# notebook can load them back with `%store -r <name>`.
%store x_title_train_tf
%store x_title_test_tf
%store y_title_train_tf
%store y_title_test_tf

Stored 'x_title_train_tf' (csr_matrix)
Stored 'x_title_test_tf' (csr_matrix)
Stored 'y_title_train_tf' (Series)
Stored 'y_title_test_tf' (Series)
