# Applying TfidfVectorizer

### Import data and packages:

In [45]:
#import data frames from previous notebook

%store -r textdata

%store -r titledata

In [46]:
#import packages

import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split


### Apply TfidfVectorizer to text column:

In [47]:
# Corpus Selection
x_text = textdata.text.astype(str)

# Target Selection
y_text = textdata.label

In [48]:
#split text data into training and test sets 
x_text_train, x_text_test, y_text_train, y_text_test = train_test_split(
    x_text, y_text,
    test_size=0.3,
    random_state=42
)

In [49]:
# Apply vectorizer (parameters to be tuned)


vectorizer = TfidfVectorizer(min_df=10)
vectorizer.fit(x_text_train)


In [50]:
x_text_train = vectorizer.transform(x_text_train)
x_text_test = vectorizer.transform(x_text_test)

In [51]:
textfeatures = vectorizer.get_feature_names_out()

In [52]:
print(textfeatures)

['aa' 'aaa' 'aapl' ... 'zulu' 'zuma' 'zurich']


In [53]:
for i in textfeatures:
    print(i, end = '\n')

aa
aaa
aapl
aaron
aaronkleinshow
aarp
ab
aba
abaaoud
ababa
aback
abadi
abandon
abandonment
abate
abaya
abbas
abbasi
abbe
abbey
abbott
abbreviate
abbreviation
abby
abc
abcpolitics
abd
abdallah
abdel
abdelhamid
abdeslam
abdi
abdicate
abdication
abdirahman
abdomen
abdominal
abduct
abduction
abdul
abdulaziz
abdullah
abdullahi
abdulrahman
abe
abed
abedi
abedin
abedini
abel
abella
abenomics
aberdeen
aberration
abet
abhor
abhorrent
abi
abid
abide
abiding
abidjan
abigail
ability
abject
ablaze
able
abm
abnormal
abnormality
abnormally
aboard
abolish
abolition
abolitionist
abominable
abomination
aboriginal
abort
abortion
abortionist
abortive
abou
abound
about
above
abraham
abramovic
abramowitz
abrams
abrasive
abridge
abroad
abrogate
abrupt
abruptly
abscond
absence
absent
absentee
absentia
absolute
absolutely
absolutist
absolve
absorb
absorbed
absorption
abstain
abstention
abstinence
abstract
abstraction
absurd
absurdity
absurdly
abu
abuja
abul
abundance
abundant
abundantly
abuse
abuser
abusive
ab

In [54]:
docterm = pd.DataFrame(x_text_train.todense(), columns=textfeatures)

In [55]:
docterm

Unnamed: 0,aa,aaa,aapl,aaron,aaronkleinshow,aarp,ab,aba,abaaoud,ababa,...,zoom,zoomph,zor,zucker,zuckerberg,zuesse,zulia,zulu,zuma,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.042498,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
40983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
40984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
40985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


### Apply TfidfVectorizer to title column:

In [56]:
# Corpus Selection
x_title = titledata.title.astype(str)

# Target Selection
y_title = titledata.label

In [57]:
#split title data into training and test sets 
x_title_train, x_title_test, y_title_train, y_title_test = train_test_split(
    x_title, y_title,
    test_size=0.3,
    random_state=42
)

In [58]:
# Apply vectorizer
vectorizer2 = TfidfVectorizer(min_df=10)
vectorizer2.fit(x_title_train)


In [59]:
x_title_train = vectorizer2.transform(x_title_train)
x_title_test = vectorizer2.transform(x_title_test)

In [60]:
titlefeatures = vectorizer2.get_feature_names_out()

In [64]:
for i in titlefeatures:
    print(i, end = '\n')

abadi
abandon
abbas
abc
abduct
abe
abedin
ability
able
abortion
abroad
absolute
absolutely
absurd
abuse
academic
academy
accent
accept
access
accident
accidentally
accord
account
accountable
accusation
accuse
accuser
acknowledge
aclu
acquit
across
act
action
active
activist
activity
actor
actress
actual
actually
ad
adam
add
addict
additional
address
admin
administration
admiral
admission
admit
adopt
adult
advance
advantage
advice
advise
adviser
advisor
advisory
advocate
affair
affect
afford
affordable
afghan
afghanistan
afraid
africa
african
ag
age
agency
agenda
agent
aggression
aggressive
ago
agree
agreement
agriculture
ahead
aid
aide
ail
aim
air
airbnb
aircraft
airline
airport
airstrike
airstrikes
akbar
al
alabama
alarm
alaska
alec
aleppo
alert
alex
ali
alien
alive
all
allah
allahu
allegation
allege
allegedly
allegiance
allen
alliance
allow
ally
almost
alone
along
already
also
alt
alter
alternative
always
amaze
amazon
ambassador
ambush
amend
amendment
america
american
amid
ammon
amne

In [65]:
docterm2 = pd.DataFrame(x_title_train.todense(), columns=titlefeatures)

In [66]:
docterm2.head(n=20)

Unnamed: 0,abadi,abandon,abbas,abc,abduct,abe,abedin,ability,able,abortion,...,zealand,zero,zika,zimbabwe,zimbabwean,zionist,zone,zor,zuckerberg,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create and store variables for later use:

In [67]:
x_text_train_tf, x_text_test_tf, y_text_train_tf, y_text_test_tf = x_text_train, x_text_test, y_text_train, y_text_test

In [68]:
%store x_text_train_tf
%store x_text_test_tf
%store y_text_train_tf
%store y_text_test_tf

Stored 'x_text_train_tf' (csr_matrix)
Stored 'x_text_test_tf' (csr_matrix)
Stored 'y_text_train_tf' (Series)
Stored 'y_text_test_tf' (Series)


In [69]:
x_title_train_tf, x_title_test_tf, y_title_train_tf, y_title_test_tf = x_title_train, x_title_test, y_title_train, y_title_test

In [70]:
%store x_title_train_tf
%store x_title_test_tf
%store y_title_train_tf
%store y_title_test_tf

Stored 'x_title_train_tf' (csr_matrix)
Stored 'x_title_test_tf' (csr_matrix)
Stored 'y_title_train_tf' (Series)
Stored 'y_title_test_tf' (Series)
