# Dataset contains headlines and categories for over 400k news articles

Using different model to classify the headlines.Models are provided in Python by the scikit-learn library.

### Naive Bayes model

In [1]:
import pandas as pd 

# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts and remove stopwords.
from sklearn.feature_extraction.text import TfidfVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder
# function for give recall,precesion,f1-score,support
from sklearn.metrics import classification_report
# function for using chi squared test
from sklearn.feature_selection import chi2,SelectKBest
# function to perform cross validation
from sklearn.model_selection import cross_val_score
#function to use KFold 
from sklearn.model_selection import KFold


In [2]:
data = pd.read_csv("news.csv")

data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027



Removing punctuation and lowercase everything. This will give us a smaller set of words, which will decrease the size of our model, and ensure that words are treated the same even if they occur capitalized at the beginning of the headline or lowercase in the middle.



In [3]:

tf = TfidfVectorizer(stop_words = 'english', 
                     token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')

x = tf.fit_transform(data['TITLE'])


In [26]:
#print(x[1:100])
tf.vocabulary_

txt_fitted = tf.fit(data['TITLE'])

idf = tf.idf_

print(dict(zip(txt_fitted.get_feature_names(), idf)))



In [4]:
#using encoder to encode the label into interger
encoder = LabelEncoder()
y = encoder.fit_transform(data['CATEGORY'])


# split into train and test sets in 80:20 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#print(type(x.shape),set(y))
# take a look at the shape of each of these
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(337935, 51642)
(337935,)
(84484, 51642)
(84484,)


In [5]:
#Making classifer
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
#calucation of the accuracy of the model
nb.score(x_test, y_test)

0.9284361535912126

In [10]:
#taking account of the perfomance of the model using classification report

x_test_pred = nb.predict(x_test)
print(classification_report(y_test, x_test_pred,target_names=encoder.classes_))

              precision    recall  f1-score   support

           b       0.90      0.92      0.91     23159
           e       0.96      0.97      0.96     30430
           m       0.97      0.86      0.91      9219
           t       0.91      0.91      0.91     21676

   micro avg       0.93      0.93      0.93     84484
   macro avg       0.93      0.91      0.92     84484
weighted avg       0.93      0.93      0.93     84484



In [40]:
#Finding most co-related term in each sample feature using chi squared test
ch2 = SelectKBest(chi2,k=1).fit_transform(x, y)


In [41]:
#fucntion to reverse the vectorized word back to its orignal form

def make_reverse_vocabulary(vectorizer):
    revvoc = {}

    vocab = vectorizer.vocabulary_
    for w in vocab:
        #i = vocab[w]
        print(w)
        #revvoc[i] = w

    #return revvoc

make_reverse_vocabulary(tf)

fed
official
says
weak
data
caused
weather
slow
taper
s
charles
plosser
sees
high
bar
change
pace
tapering
open
stocks
fall
hints
accelerated
risks
falling
curve
nasty
curbed
job
growth
accelerate
expects
unemployment
end
jobs
month
hit
president
ecb
unlikely
sterilisation
smp
purchases
traders
sterilization
eu
half
baked
bank
union
work
europe
reaches
crunch
point
banking
focus
stronger
euro
drowns
message
rates
low
aims
deal
tackling
failing
banks
forex
pound
drops
lows
noyer
strong
creates
unwarranted
economic
pressure
week
ahead
march
resolution
transparency
ukraine
member
kinds
tools
anxieties
wane
bunds
treasuries
spain
debt
rallies
economy
bad
loan
triggers
key
feature
test
announcement
sources
china
trade
deficit
structural
worries
main
issues
things
need
know
morning
happy
strength
update
eurozone
sovereign
exposure
hits
new
reveal
hurdles
zone
market
eur
usd
retreats
year
highs
seen
refile
requires
tougher
definition
tests
opening
bell
central
contain
threats
prices
threat
ri

apples
user
experience
hinted
jailbroken
xcode
upgrade
direct
links
images
gallery
ibettercharge
charge
home
discovered
code
chances
appear
slim
previously
vehicle
enable
hdr
5s
include
hyped
breaks
visuals
administrators
applaud
connected
strategy
interface
accessibility
pod2g
faq
extra
unleashes
owners
facetime
handy
immediately
sports
festival
perfect
installed
older
dials
meld
impressed
pandora
diss
maps
product
cycles
impress
wow
criticism
experts
raised
spat
spills
remote
browse
purchased
movies
airplay
environments
bonjour
blocked
hide
unwanted
channels
channel
firmware
11d169b
welcome
icon
clutter
cleaning
paralympic
easy
ability
hides
unwatched
kiwi
tops
nz
charts
fyi
manually
restart
hiding
added
revamped
concert
publishsosimply
publishing
magazine
replica
card
ohio
steady
rhode
island
gallon
aaa
honolulu
outlook
savannah
peoria
expert
refinery
maintenance
cause
spike
continuing
connecticut
montana
massachusetts
jump
climbing
harrisburg
national
rochester
bellingham
dime
musk

takeaways
jefferies
cautions
fy14
climbs
f4q
current
income
row
cautious
spdr
etf
powershares
qqq
wary
disturbing
budget
premarket
broadly
firmer
steadies
rupee
fx
steadier
footing
boj
shaken
disappointment
unable
derail
drift
gain
jitters
evening
lamphier
legs
debating
live5news
charleston
trim
grinds
fmc
yen
health
carney
select
rbs
england
overhauls
regulatory
independent
scotland
quizzed
rigging
libor
spotlight
started
qe
housing
extraordinary
shake
probes
suspended
alarm
appoint
deputy
governor
gbp
loose
destroys
tapes
meetings
extremely
boe
investigations
lender
resort
scots
independence
grilled
trims
remains
keeping
shades
nixon
ftse100
overheating
fixing
knowing
manipulation
lawmaker
handling
encouraging
ftse
bullish
explain
members
election
unreasonable
headquarters
conduct
authority
investigating
distinct
possibility
divided
output
stalls
voters
remarks
tighten
persist
ruthless
allegation
iditarod
mile
viable
relentlessly
maintain
fisher
hunting
wrongdoing
slipped
testified
s

piz
marshmallow
cheat
adulthood
cult
flair
infographic
echolls
silly
scott
pierce
cs
interviews
downfall
leather
feisty
wcw
diehard
satisfied
fandom
flashback
checking
tch
resurrecting
faithful
suspense
clued
gang
reprises
necessarily
roth
cornet
neptune
dirt
bros
originally
topics
explored
reviving
dvd
recording
bedroom
wows
nights
goth
gothic
glam
tortured
haircut
cosies
mandy
grading
backer
newbie
homecoming
tasering
luscious
arch
campbell
nancy
drew
hat
peeing
oscars
stuck
orbit
musicals
fandango
momentous
rave
marshmallows
thriller
wisecracking
recaptures
rogan
screenings
rooney
mara
pan
racial
tiger
lily
forthcoming
fairytale
prequel
jackman
wright
whitest
chick
coudt
whitewashing
lilly
opposite
garrett
hedlund
mcdill
similar
genuine
whines
wages
previous
owe
carla
rodriguez
mama
rundown
divorces
rycroft
disastrous
disasterous
girlfirend
jillian
unsatisfying
cheer
schools
dedicated
cdl
stirs
bullying
glad
hater
bass
forgives
feuding
tabloids
reigned
spectacle
speculate
uploads
ar

elusive
reiniers
wasserman
schultz
davie
shhh
desperation
mounts
webinar
pulse
extending
penalty
hall
sylvia
garcia
hispanics
mesa
column
bumpy
fla
hardship
broward
millennials
dooming
swedish
forefront
wellpoint
penalties
columbus
dispatch
misleadingly
prizes
lowcountry
hertz
equipment
rental
separated
renting
htz
brks
mksi
atec
naples
actiev
amzn
orcl
annex
seperated
separate
structure
traded
sirius
xm
buyback
hounews
spins
6c
separation
pistorius
gorbachev
yemen
aden
wal
mart
sampler
primary
kagan
happiest
herc
divestment
leasing
avis
cube
weakened
bulgaria
registrations
vw
amended
acea
imf
tucking
handle
broadbent
nemat
shafik
setter
appointments
ishares
msci
ewu
governors
appointment
stamps
radical
ceiling
guard
osborne
departure
institution
posen
values
haldane
cv
appointed
iou
imbalance
illusions
neutral
removed
attributed
sina
weibo
apology
spearhead
tue
12am
township
buick
enclave
cadillac
xts
traverse
gmc
czar
string
18m
airbag
boyer
additional
dealing
redoubles
assure
intens

threepenny
banknote
anticounterfeit
isis
worlds
mint
foil
fraudsters
thrupenny
counterfeiting
faking
aluminum
quids
counterfeit
modelled
unseen
shiny
hardest
essex
counterfeited
counterfeiters
fakers
distract
epa
forgery
trolleys
forgeries
inspiration
sounder
introductory
commemorate
bmw
diplomatic
pretax
digit
reithofer
deliveries
brooklyn
relies
lifted
bayerische
motoren
werke
cevallos
tardy
steroids
wagoner
accountable
contrite
dec
flaws
chiefs
dealership
represents
unsafe
afflicted
apologise
mea
culpa
counsel
friedman
talked
apologize
pacific
sightly
meeti
outcome
jpy
restrained
soothing
tokyo
aud
recede
softens
benchmark
nurses
ebbs
awaits
rba
waning
resilient
fxbeat
soothe
deepening
drifts
annexation
suga
bolstered
unanimous
quantative
borrowing
puzzled
progresses
unanimously
constant
fatally
flawed
reorganisation
bofe
rejigs
deck
eventful
reshuffle
brummer
reckless
fixation
brave
implementation
branch
assume
failures
fuses
departments
mckinsey
personnel
deputies
tds
wh
portal
es

orexigen
sovaldi
biotechs
harbinger
gild
growling
stretching
undercut
justify
lawmkers
pols
waxman
sailors
kudos
militants
handed
conventions
uss
elrod
relieves
stout
escorting
contraband
tourist
hogan
shoe
fades
jnj
flirts
stockers
ma
overturns
interchange
prosecutor
unaccounted
885mn
fhfa
885m
litigation
securities
budgetary
complication
us885m
rmbs
destruction
shoddy
settlements
tahoe
cbo
fmic
nationalizes
origination
refi
hamp
irs
preys
taxpayer
oklahomans
bbb
defraud
tigta
scammers
payers
scams
evert
netted
impersonators
unsuspecting
scamming
phony
telephone
bramwell
impostor
conmen
taxmen
criminals
sweeps
chesapeake
lease
soybean
karen
maccannell
bilks
inspector
xi
weaving
corrections
prisons
hoover
sweeping
contacted
thieves
impostors
immigrants
mimicking
turkey
wipe
biased
assassinations
graft
erdogan
turkish
gravely
protested
specter
circumvent
judiciary
ridicule
twittter
authoritarianism
purpose
nationalism
condemned
tweeters
downloaded
servic
turks
cihan
abdullah
gul
twitler

pup
roughed
bitches
cliff
gregg
allman
bronchitis
sidelines
allmans
ciara
shower
slits
lala
babyshower
attended
kris
bffs
houghton
rebhorn
homeland
quintessential
fockers
vinny
cheerleader
obituary
mistakenly
carter
melanoma
caucus
secrecy
cloaks
linn
mudslide
devastation
carlito
mathison
backyard
pool
cleanup
impedes
kotanow
scottsbluff
riverwalk
mickey
probaby
cigarrettes
corden
artpop
puked
rhobh
zeus
gonzo
brandi
glanville
spiky
gandhi
interpreting
layers
topless
axe
enlists
ghandi
yaaas
extravagant
completes
transformation
assisted
metaphor
vicious
hearst
rattled
botch
scarjo
soldier
flatters
johanasson
prelude
satin
basically
première
fab
louboutins
upstages
radiant
aplenty
westfield
ravishes
siren
leching
hayley
atwell
nick
westwood
america2
bj
disguises
belly
kz
vault
willie
hendrix
tupac
jerry
zappa
ew
dodo
deactivated
eventseeker
hillary
clintons
riffs
tags
candidacy
grills
amends
msg
tribeca
niro
50pc
enterprises
landslide
combo
sodium
spices
herbs
salted
adolescents
diets
k

hypocritical
iribe
devouring
predate
floundering
argue
correlation
dimension
proofing
engaging
valkyrie
unconvinced
dice
footsteps
leaf
byte
overlords
moonshot
classrooms
scrapped
adrift
razer
cantillon
detached
multitouch
trackpad
handles
warrants
govts
volumes
methodology
120pc
balloon
requested
readly
checkmark
nickelodeon
crittercism
gingerbread
divide
expenses
desired
penwortham
listings
alight
arson
rotherham
rochdale
moped
crams
retention
upto
apes
phototagging
embedding
inviting
cloudera
apache
kingmaker
concurrent
prairie
downlisting
arroyo
toad
endangered
cornyn
brownback
inhofe
hays
southeastern
plf
downlist
sizes
repeats
varieties
squared
wraparound
similarity
hitters
advertise
ge
puffery
prismtech
standardize
monetizing
restroom
allentown
outskirts
nickname
earths
zeroing
shepherded
huh
extremities
kuiper
eccentric
faraway
flung
orb
inuit
betray
hinting
boundary
inhabit
barren
boundaries
redrawn
fringe
diameter
neighbour
callers
ranged
frantic
wtc
arrests
advertisers
plott

creek
tuscaloosa
chilton
horses
dalton
regains
regained
provoking
breakfasters
jars
mccafe
broadside
vero
mccafé
wendys
quips
namedrops
humurous
advertises
squash
yea
nay
nuff
deutsch
glanz
strangest
quiero
delegate
malley
retroactive
obsolesce
carolinas
x3
lacrosse
galleries
facebookification
maturing
microblog
viability
swaps
yorktown
existed
exomars
sandpit
stevenage
yard
rovers
sandbox
cleaner
mcgill
coupe
douche
dustup
coolant
potholes
erupting
dock
broncos
taxis
unforeseen
hellish
internationa
hatch
administrator
dmitry
lovetsky
kswo
lawton
dealyed
requesting
complied
recived
groping
fivefold
font
typeface
proffers
devised
garamond
fourteen
290m
suvir
mirchandani
240m
400m
roman
typefaces
fonts
370m
brainy
helvetica
136m
educational
covertly
booby
paralyze
hose
swiftkey
vuln
unusable
installing
bricked
officesuite
appstore
reg
tabbed
fortify
inspecting
typo
injunction
seacrests
infringing
plaint
barring
occupy
mdm
finest
chuffed
brags
suiteto
mcsherry
philosopher
metaphysical
wd


zimmer
azalea
ladys
capt
luc
bynes
neon
alpine
arriba
thanking
bangin
cabo
sunny
uprising
solder
surveyed
boffo
mccarthyite
wizard
96m
walleye
mentioned
plexes
apirl
donner
goonies
feldman
yep
circulates
astoria
moonie
robins
sandpoint
faxing
heal
tearfully
blowup
coyote
palbociclib
pfs
letrozole
prolongs
hormone
receptor
pertaining
pfizers
aacr
cdk4
inhibitors
cdk
inhibitor
20k
19k
feasible
kath
liquids
refill
termed
poisonings
fattest
thinnest
fittest
skinniest
loveland
lassa
undertaken
aftrica
wvu
banquet
relaying
sarina
covington
calexico
bridgewater
janine
dolled
realtor
famu
bendigo
biopsy
recurring
prostate
prostrate
radiotherapy
refrains
topix
takeda
audjpy
y612
clamour
slighly
volcker
clo
rancher
grazing
blm
collecting
snipers
rangeland
rounding
bundy
northeastern
seizing
gg
gpn
qcor
tk
mersch
understood
simulates
faz
demolish
capitulate
axa
iggo
floggings
1tn
strictly
swirls
thwarting
parasite
chobani
picket
digg
unabomber
delusional
offspring
zealots
vc
anarchists
flyer
potr

frivolous
fends
filers
extenders
procrastinate
fairest
mat
mats
azodicarbonamide
numbered
phased
bleaching
performin
retrial
clerk
advisories
axton
damp
prevailing
resonates
dxy
licking
krona
dovishness
consolidating
outplaying
quietus
recoups
iac
diller
technolgy
interactivecorp
iaci
barbarian
geologists
seismic
quakes
earthquaked
odnr
hydraulic
fracturing
mahoning
horizontal
48pm
update2
utica
marcellus
sting
sprinkles
dented
sandestin
unca
pullin
27b
perking
profiting
mlps
stagnate
irritates
frightened
kaepernick
agostini
49ers
niners
kap
administered
manziel
misplaced
kaerpernick
innocence
49er
nflers
lockette
alcoa
aflac
topaz
c7
5mm
rfdg
orenburg
marseille
provence
metropole
ccc
voronezh
kaluga
lazio
busto
arsizio
tula
sardinia
centercredit
atf
tgp
idrs
validates
capers
ggg
zappos
mayday
desk
fulfillment
gobble
sustainability
unquote
nosedives
liquidate
liquidation
wausau
coldwatter
holyoke
clothier
cwtr
cwc
liquidating
75mm
altima
sentra
bulbous
mailbox
techcrunch
carousel
organ

blondie
trun
bindi
crochet
revenant
inarritu
fur
trapper
iñárritu
grizzly
renevant
babel
lutheran
aitkin
manteca
mchendry
straightening
ntagalipublish
practicing
hymns
contemplation
elgin
koininia
persecuted
advertiser
manette
crossman
buckhead
daventry
pophal
chrism
camden
ascension
vianney
merriment
breakfasts
bunny
carnett
jasper
dinners
eastermeans
evangelism
methodists
shuler
zombeavers
scheduling
kazan
vines
rosenthal
comingsoon
6secfilms
iverson
sylvio
kavanagh
zombeaver
reprimand
complainant
accidently
tweetvestigation
unruly
planegina
unbelievably
moly
shoved
filmgoing
corum
colton
burpo
depiction
afterlife
earthly
interpret
preaches
humane
brimstone
ambivalent
lumenick
midtown
rescuers
burglary
wan
shehashadit
carters
stacking
chainz
hov
hotspot
ranger
dir
119mins
immortality
jarvis
devolving
anupama
transcending
hairstyles
amex
cardholder
axp
camry
xse
vanilla
grrr
refuel
midsize
injection
restyled
thorough
avalon
grille
cheapens
pedaled
annapolis
leopard
edgier
sportiest
co

complementary
48m
montly
evaporating
anticompetitive
megadeal
valeant
agn
amassed
hedgie
ravenous
calpine
unrivaled
vrx
13d
oberhelman
arraigned
16bn
exchanging
2014gsk
elanco
preference
archiving
dangles
homeat
sparkman
gigawashing
lyrid
lyrids
meteors
showers
coincides
nood
tayo
ng
globalselfie
duckfaces
hawthorne
maiden
touting
environmentally
waxes
eloquent
recycles
polishing
controvercial
neb
summ
khgi
countersuit
linkbait
sdi
dustproof
minyanville
csgp
dyniq
brean
humorous
selina
incompetency
childcare
forger
rothko
pollock
kooning
bergantinos
33m
painter
indicted
33mn
2014three
80m
conned
swindler
seville
videotape
bragged
wwhl
oscars2015
boyle
slumdog
leonarado
gracing
scripted
redfoo
anthems
pimp
liiiiiiive
valentin
bulimia
insecurities
brue
alcohols
snorted
sprinkled
palcohol
madd
powderised
rum
approving
sprinkle
drunkard
newschannel5
chauncey
mahan
intern
roc
fella
blackmail
extorter
blackmailing
rambunctious
careens
gere
padma
lakshmi
eggcitement
salman
rushdie
regicide
in

kodiak
wondered
yougov
soot
particulate
westchester
ulster
troubleshooting
avast
adblock
rce
gpo
grandparents
varsity
cwu
hastens
revealation
fireeye
unpatched
enisa
grandpa
punters
a2
2ghz
pureview
stateside
t2
ntt
docomo
damns
raspberry
piphone
i9152
defocus
lifelog
paisa
wasool
e4
mwc
8mp
rearrange
motorways
almunia
rebut
memos
bods
invokes
flagrantly
injunctions
brazenly
slaughters
cleary
pitting
summaryan
42375us
zips
wald
41k
s205
semester
redesgined
distinguished
trekkies
launche
ps5
kfyrtv
redeemed
reba
holtzclaw
mholtzclaw
dailypress
rixton
sherrie
tassler
upfronts
whcd
mchale
effusive
carpentry
lui
carats
bended
honkin
persistence
punchestown
newcaster
bolling
swoons
geroge
stunningly
peacekeeper
stl
engagment
princeton
luhansk
colleague
tvgn
cheapen
shatter
dolce
gabbana
eighteen
domenico
stefano
stafani
buzzed
naturalism
snarled
freditorial
shazam
evolutions
osborn
revlon
n650
worldoo
tomfoolery
mensxp
terrorizes
calleri
amaze
stinger
dissected
untangling
shalvis
salon
idyl

staas
expressroute
remoteapp
mohoro
singtel
gins
slowpokes
pesticides
colony
neonicotionoids
honeybee
insecticides
neonicotinoids
pesticide
honeybees
colder
imidacloprid
reregulating
neturality
portals
reclassifying
8kbps
redraft
techi
sanity
gridlock
reclassification
nots
toying
fresher
trialling
bebop
14mp
skycontroller
joystick
fisheye
bepop
32nd
pandering
spo
doppelgangers
riotously
terrorize
squashed
51million
hazes
unfriendly
dethrone
powerplay
thursdays
titilate
eventizing
donal
logue
tinkers
scalpel
uta
whatjayzsaidtosolange
violently
viciously
knowls
lunges
interpreter
whiskered
panti
laibach
runaways
ginsberg
pucci
nighties
7x05
alas
threesomes
mechanized
madman
sykes
bandages
pastâ
s4e6
s04e06
gatiss
kangaroo
shae
stannis
freetyrion
thermonuclear
bathtub
yara
theon
davos
braavos
courtrooms
overextended
disheartening
househunting
cakes
bono
cindy
bachelorhood
jumblatt
druze
resorted
goerge
prenupt
alma
prenuptial
miffed
ressler
aww
reconciles
mathers
ruess
enimem
mlive
emosh


clairton
wtrf
omnibus
criminalizing
landowners
koch
inputs
underpinned
retreated
folling
usd1300
usd102
str
wagers
liven
69b
1qfy15
funk
fax
expections
disquieting
intension
outgrow
trex
fq1
fyq1
maxim
rybolovlev
oligarch
costliest
62c
penthouse
fertilizer
billonaire
potash
potentate
toh
elena
rybolovleva
5bln
alimony
7billion
francs
48bn
us4
2014russian
2014corrected
whipsaw
disadvantaged
scrapbook
knit
apk
scrapbooks
nocturnal
clarion
nightsnake
nightsnakes
herpetologists
mulcahy
ucs
jamestown
verde
ultralight
laplet
ergonomic
absurdly
featherweights
conceding
trig
i7
ultrathin
fledged
climategate
waned
unstirred
zane
croak
supercell
fundamentalists
iea
unengaged
wdp
fiddles
wmn
soaking
forbids
eww
retaliates
emphatic
goodell
matsui
whipping
rollback
logjam
gandel
aeroflex
oru
romano
poore
terrace
slayings
honestly
kneecap
volte
tangible
beachhead
gs4
ibmedge
swath
pester
nosier
bewildering
badger
liddick
blaster
vrizzmo
ndreams
eutugxiqhry
skydieving
ception
frail
gaunt
johnathan
re

intercontinentalexchange
perecnt
433k
xhb
veiga
rothwell
preholiday
doomsdayers
tenneco
unimpeded
terrify
harrods
confiscated
wave3
mcconnell
ridesharing
wto
steelworkers
ustr
itn
komodo
slink
exhibited
500e
bardot
consisting
zoomable
mosaics
rack
bings
kairos
jellycar
solitaire
scdca
reders
escalated
hatchette
zola
informational
interlock
2500s
3500s
324m
collusion
325m
meeellion
406m
g4
gtat
scuttlebutt
circumpolar
delsanto
1am
seti
milkyway
reportpublish
astrobiologists
ets
extraterrestrials
shostak
rebadged
jims
counseled
oseary
pissing
doosh
disassembled
afte
helming
exited
aaaarrrrrghhh
wight
inhaling
inhales
chipmunk
squeaky
wormhole
kornel
mundruczo
nouvelle
2014dogs
hungarian
feher
isten
gump
paterson
uma
mush
spaghetti
joblo
dawdles
shakurâ
œf
youâ
unflinching
medici
descendant
cheatsheet
kanyestein
bedsheet
conceive
pilates
ancien
régime
garavani
chiacago
kanya
2014kim
decoy
scottie
pippen
hirst
cadet
aishwarya
amfar
supermodels
strut
magpies
5million
refaeli
beauties
ducky


collars
metta
notoriously
carpetbagger
63rd
gobber
nettles
picassos
kravings
athena
kourteny
kontroller
scooby
calmness
shadoe
wove
ellington
soothed
feuded
casem
dedications
deejay
rermember
yer
prescott
middletons
rik
mayall
hotdog
abstaining
prediabetic
prediabetes
lipid
beeb
cdph
sproutbreaks
teammates
parrsboro
marlow
attleboro
rowlesburg
valparaiso
joliet
relaywood
zeeland
133k
moshannon
dillsburg
tomato
toujeo
mayflies
algae
fertilizers
wmb
midstream
acmp
unitholders
bocomm
zloty
eassurance
normalisation
15t10
15t17
07z
ratestom
normalization
esteemed
recognitions
authorizations
paise
georgie
geyer
mcx
medvedev
belarusian
connivance
oettinger
petro
unconstructive
snips
ukrainy
delude
illarionov
vilnius
prepay
prepays
jihad
prepayments
unsc
lukashenko
minsk
cutoff
m2m
cobra
telematics
fabrizio
bensch
renounce
siptu
rebase
gnp
boro
oilfields
ruehl
shales
usd4
micex
storages
95b
khead
andrii
deshchytsia
tudor
pickering
wpz
2014airbus
launchers
twtc
dara
biosciences
herda
affymax
lv

spectrometers
oco2
encountered
vibrations
munching
caterpillars
beastly
specced
rs37800
maximized
minimalism
spiel
gfx
s8
s10
txt
cramming
chained
overcharging
fraudulently
crammed
broadsides
coeur
alene
hydroplane
appraisals
bilked
tucked
jacking
totalling
aptenodytes
forsteri
depts
itis
inverter
unblemished
bigfoot
unsurprising
sasquatch
yetis
hairs
himalayas
obliterated
chilean
oceanic
forage
garco
feltonville
torresdale
inhospitable
gliese
832c
battlestar
galactica
uninhabitable
cabal
driveless
spaniard
crawly
emoto
furtling
klitzman
ithacavoice
221back
morstan
abbington
broil
cunt
bau
07pm
whisperer
sarchie
oliva
predictability
trots
exorcisms
demonic
édgar
heebie
jeebies
hoariest
demonologist
howler
chesterton
deliever
flannery
jazmine
backlashes
annihilated
kiddnapping
ranchero
kink
lt
winsome
untugged
dkeira
mazursky
notables
eras
unmarried
farceur
admiral
countercultural
tonto
voluntarytreatment
tenenbaum
deviant
grouchiest
unmistakably
babysits
kicky
jaeden
lieberher
racetrac

roving
nasas
offearth
lunokhod
orbiters
repositions
reposition
seawater
acidification
0t
duran
prinze
teamkiefer
unprofessionalism
indytoday
bloodfeud
trailerculture
rhys
pabbie
whaddya
bilbo
baggins
thorin
gandalf
legolas
bowman
smaug
elves
birthed
unto
sdcc
annabelle
glorous
jester
elvish
nerdgasmic
tp
koin
cosplayers
cosplaying
zombiewalk
hawtness
ra
ghul
expos
olutely
leveled
rozanski
louvre
stranding
kosovo
madagascar
degrading
nuttymadam
bowls
batkid
leukemia
beysus
divorcè
stylists
igf
winnipeggers
surfboardt
sweethearts
stepbrother
rubbed
vacuous
micromanages
10x8
milf
smooched
tinted
bomby
istants
neglects
snogged
wifey
controling
neglecting
klips
ziering
snarknado
snuffing
saws
churl
classless
kabc7
richt
embiid
wotv
murin
dorfamn
hy
hitchhiker
rooker
racoon
xd
assortment
faris
braiding
170m
guradians
kwok
hyperdrive
rubdown
comrie
lizzie
fantasizes
daydreams
daydreamin
daydreaming
snacktime
summery
skywriting
confections
jazzed
jennny
handjobs
zeal
rilo
kiley
kimono
phobias


In [42]:
#using the k-fold technique to increase the accuracy of the model

cv = KFold(n_splits = 10,shuffle = True, random_state=0)
result = cross_val_score(nb, x, y, cv=cv)
print(result.mean())

0.9272688990572527


#### <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

### Logistic regression model

#### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [18]:
clf = OneVsRestClassifier(LogisticRegression())

# Fit the classifier to the training data
clf.fit(x_train, y_train)


# Print the accuracy
print("Accuracy: {}".format(clf.score(x_test, y_test)))



Accuracy: 0.9419771791108376


In [19]:
x_test_reg = clf.predict(x_test)

print(classification_report(y_test, x_test_reg,target_names=encoder.classes_))

              precision    recall  f1-score   support

           b       0.92      0.92      0.92     23047
           e       0.96      0.98      0.97     30287
           m       0.96      0.91      0.93      9262
           t       0.93      0.92      0.93     21888

   micro avg       0.94      0.94      0.94     84484
   macro avg       0.94      0.93      0.94     84484
weighted avg       0.94      0.94      0.94     84484



In [20]:
#result = cross_val_score(clf, x, y, cv=cv)
#print(result.mean())

#### <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

### Random Forest

#### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

In [21]:
from sklearn.ensemble import RandomForestClassifier



In [22]:
clf_rand = RandomForestClassifier()
# Fit the classifier to the training data
clf_rand.fit(x_train, y_train)

# Print the accuracy
print("Accuracy: {}".format(clf_rand.score(x_test, y_test)))



Accuracy: 0.9276431040196961


In [23]:
x_test_rand = clf_rand.predict(x_test)

print(classification_report(y_test, x_test_rand,target_names=encoder.classes_))

              precision    recall  f1-score   support

           b       0.90      0.92      0.91     23047
           e       0.94      0.97      0.96     30287
           m       0.94      0.88      0.91      9262
           t       0.94      0.90      0.92     21888

   micro avg       0.93      0.93      0.93     84484
   macro avg       0.93      0.92      0.92     84484
weighted avg       0.93      0.93      0.93     84484



#### <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

### Decision Trees

#### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

In [24]:
from sklearn import tree


In [25]:
clf_tree = tree.DecisionTreeClassifier()

clf_tree.fit(x_train, y_train)

# Print the accuracy
print("Accuracy: {}".format(clf_tree.score(x_test, y_test)))

Accuracy: 0.914587377491596


In [26]:
x_test_tree = clf_tree.predict(x_test)

print(classification_report(y_test, x_test_tree,target_names=encoder.classes_))

              precision    recall  f1-score   support

           b       0.89      0.90      0.89     23047
           e       0.94      0.95      0.95     30287
           m       0.90      0.88      0.89      9262
           t       0.91      0.89      0.90     21888

   micro avg       0.91      0.91      0.91     84484
   macro avg       0.91      0.91      0.91     84484
weighted avg       0.91      0.91      0.91     84484

