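# Applying TfidfVectorizer

This notebook loads the `textdata` and `titledata` data frames stored by the previous notebook, splits each into 70% training / 30% test sets, fits a `TfidfVectorizer` on the training split only, transforms both splits, and stores the resulting TF-IDF matrices and labels for later use.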

### Import data and packages:

In [55]:
# Restore the data frames saved by the previous notebook using IPython's
# %store magic. That notebook must already have run its own `%store` commands,
# otherwise these names will not exist.
# (Keep comments on their own lines; text after a magic is passed to the magic.)

# textdata supplies the `text` and `label` columns used below.
%store -r textdata

# titledata supplies the `title` and `label` columns used below.
%store -r titledata

In [56]:
#import packages

import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split


### Apply TfidfVectorizer to text column:

In [57]:
# Corpus: the article body, coerced to str so the vectorizer only receives strings.
# NOTE(review): depending on the pandas version, astype(str) can turn missing
# values into the literal token "nan" -- presumably the previous notebook
# already removed empty rows; confirm.
x_text = textdata["text"].astype(str)

# Target: the label column for the same rows.
y_text = textdata["label"]

In [58]:
# Hold out 30% of the text documents for testing; the fixed seed makes the
# split reproducible across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ slightly between the two splits.
(x_text_train, x_text_test,
 y_text_train, y_text_test) = train_test_split(
    x_text,
    y_text,
    random_state=42,
    test_size=0.3,
)

In [59]:
# Learn the TF-IDF vocabulary and IDF weights from the training text only, so
# nothing from the test documents influences the fitted features.
vectorizer = TfidfVectorizer(
    min_df=5,            # drop terms that occur in fewer than 5 training documents
    max_df=0.8,          # drop terms that occur in more than 80% of training documents
    max_features=15000,  # cap the vocabulary at 15,000 terms
)
vectorizer.fit(x_text_train)


In [60]:
# Turn both raw-text splits into TF-IDF matrices using the fitted vectorizer.
# NOTE(review): this rebinds x_text_train / x_text_test from raw text to
# matrices, so the cell is not idempotent -- re-run the split cell before
# running this one again.
x_text_train, x_text_test = [
    vectorizer.transform(documents)
    for documents in (x_text_train, x_text_test)
]

In [61]:
# Vocabulary terms kept by the vectorizer fitted on the training text.
textfeatures = vectorizer.get_feature_names_out()

In [62]:
# Quick look at the vocabulary; the printed array is elided in the middle, so
# only the first and last few terms appear.
print(textfeatures)

['aa' 'aapl' 'aaron' ... 'zulu' 'zuma' 'zurich']


In [63]:
# Print every vocabulary term on its own line.
# NOTE(review): this writes the whole vocabulary to the cell output; consider
# slicing (e.g. textfeatures[:20]) if only a sample is needed.
for term in textfeatures:
    print(term)

aa
aapl
aaron
aaronkleinshow
ab
ababa
aback
abadi
abandon
abandonment
abate
abbas
abbasi
abbott
abby
abc
abd
abdel
abdeslam
abdicate
abdication
abdomen
abduct
abduction
abdul
abdullah
abe
abedi
abedin
abedini
abet
abfalecbaldwin
abhorrent
abide
abiding
ability
abject
ablaze
able
abnormal
abnormality
aboard
abolish
abolition
abomination
aboriginal
aborigine
abort
abortion
abound
abraham
abramovic
abrams
abroad
abrupt
abruptly
absence
absent
absentee
absentia
absolute
absolutely
absolve
absorb
abstain
abstention
abstract
absurd
absurdity
abu
abuja
abundance
abundant
abundantly
abuse
abuser
abusive
aby
abysmal
ac
aca
academia
academic
academy
accelerate
accent
accept
acceptable
acceptance
access
accessible
accession
accessory
accident
accidental
accidentally
acclaim
accommodate
accommodation
accompany
accomplice
accomplish
accomplishment
accord
accordance
accordingly
account
accountability
accountable
accountant
accredit
accrue
accumulate
accumulation
accuracy
accurate
accurately
accusati

### Apply TfidfVectorizer to title column:

In [64]:
# Corpus: the article title, coerced to str so the vectorizer only receives strings.
# NOTE(review): depending on the pandas version, astype(str) can turn missing
# values into the literal token "nan" -- presumably the previous notebook
# already removed empty rows; confirm.
x_title = titledata["title"].astype(str)

# Target: the label column for the same rows.
y_title = titledata["label"]

In [65]:
# Hold out 30% of the titles for testing; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ slightly between the two splits.
(x_title_train, x_title_test,
 y_title_train, y_title_test) = train_test_split(
    x_title,
    y_title,
    random_state=42,
    test_size=0.3,
)

In [66]:
# Learn the TF-IDF vocabulary and IDF weights from the training titles only, so
# nothing from the test titles influences the fitted features.
vectorizer2 = TfidfVectorizer(
    min_df=2,    # drop terms that occur in fewer than 2 training titles
    max_df=0.6,  # drop terms that occur in more than 60% of training titles
)
vectorizer2.fit(x_title_train)


In [67]:
# Turn both raw-title splits into TF-IDF matrices using the fitted vectorizer.
# NOTE(review): this rebinds x_title_train / x_title_test from raw text to
# matrices, so the cell is not idempotent -- re-run the split cell before
# running this one again.
x_title_train, x_title_test = [
    vectorizer2.transform(documents)
    for documents in (x_title_train, x_title_test)
]

In [68]:
# Vocabulary terms kept by the vectorizer fitted on the training titles.
titlefeatures = vectorizer2.get_feature_names_out()

In [69]:
# Print every title-vocabulary term on its own line.
# NOTE(review): this writes the whole vocabulary to the cell output; consider
# slicing (e.g. titlefeatures[:20]) if only a sample is needed.
for term in titlefeatures:
    print(term)

aaron
abadi
abandon
abbas
abbott
abby
abc
abdeslam
abdicate
abdication
abduct
abduction
abdullah
abe
abedin
abide
ability
able
aboard
abolish
abolition
aboriginal
abort
abortion
abound
abraham
abrams
abroad
abrupt
abruptly
absence
absentee
absentia
absolute
absolutely
abstain
absurd
absurdity
absurdly
abu
abuse
abuser
abusive
aby
aca
academic
academy
accelerate
accent
accept
acceptable
acceptance
access
accession
accident
accidental
accidentally
accompany
accomplice
accomplish
accomplishment
accord
account
accountability
accountable
accountant
accurate
accurately
accusation
accuse
accuser
achieve
achievement
acid
acknowledge
aclu
acosta
acquire
acquisition
acquit
acquittal
acre
across
act
action
activate
active
actively
activism
activist
activity
actor
actress
actual
actually
ad
adam
adamant
adapt
add
addict
addiction
additional
address
adele
adelson
aden
adhd
adjust
adjustment
admin
administration
administrative
administrator
admiral
admire
admission
admit
adopt
adoption
adorable
ador

### Create and store variables for later use:

In [70]:
# Give the TF-IDF text splits and their labels *_tf-suffixed names before storing them.
x_text_train_tf = x_text_train
x_text_test_tf = x_text_test
y_text_train_tf = y_text_train
y_text_test_tf = y_text_test

In [71]:
# Persist the TF-IDF text splits and labels with IPython's %store magic so they
# can be reloaded elsewhere with `%store -r`.
%store x_text_train_tf
%store x_text_test_tf
%store y_text_train_tf
%store y_text_test_tf

Stored 'x_text_train_tf' (csr_matrix)
Stored 'x_text_test_tf' (csr_matrix)
Stored 'y_text_train_tf' (Series)
Stored 'y_text_test_tf' (Series)


In [72]:
# Give the TF-IDF title splits and their labels *_tf-suffixed names before storing them.
x_title_train_tf = x_title_train
x_title_test_tf = x_title_test
y_title_train_tf = y_title_train
y_title_test_tf = y_title_test

In [73]:
# Persist the TF-IDF title splits and labels with IPython's %store magic so they
# can be reloaded elsewhere with `%store -r`.
%store x_title_train_tf
%store x_title_test_tf
%store y_title_train_tf
%store y_title_test_tf

Stored 'x_title_train_tf' (csr_matrix)
Stored 'x_title_test_tf' (csr_matrix)
Stored 'y_title_train_tf' (Series)
Stored 'y_title_test_tf' (Series)
