### Import Required Libraries

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Load and Understand the Dataset

In [3]:
# Load the labelled BBC news articles from a CSV file in the working directory.
data = pd.read_csv("BBC News Train.csv")

In [4]:
# Preview the frame; pandas elides the middle rows of long frames in the display.
data

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [6]:
# Distinct class labels present in the target column.
set(data["Category"])

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [7]:
# (rows, columns) of the loaded frame.
data.shape

(1490, 3)

### Feature Extraction

In [9]:
# TF-IDF vectorizer: English stop words are dropped and the vocabulary is
# capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
)

In [10]:
# Fit the vectorizer on every article and build the sparse TF-IDF matrix.
# NOTE(review): the fit runs on the full dataset before the train/test split
# performed later, so held-out articles influence the learned features;
# consider fitting on the training partition only.
X = tfidf.fit_transform(data['Text'])

In [13]:
# Read the shape straight from the sparse matrix; converting to a dense array
# first would allocate the full documents x terms array just to inspect it.
X.shape

(1490, 5000)

In [14]:
# Target vector: the category label of each article.
y = data["Category"]

### Train/Test Split

In [16]:
# Hold out 20% of the articles for evaluation.
# random_state pins the shuffle so the split (and every accuracy reported
# below) is reproducible across kernel restarts; stratify=y keeps the class
# proportions the same in the training and test partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# Size of the training partition (documents, features).
xtrain.shape

(1192, 5000)

In [18]:
# Size of the held-out test partition (documents, features).
xtest.shape

(298, 5000)

### Training the Model

In [23]:
# Linear-kernel SVM: fit on the training partition, score on the held-out one.
svc = SVC(kernel="linear")
svc.fit(xtrain, ytrain)
ypred = svc.predict(xtest)
# accuracy_score expects (y_true, y_pred); accuracy itself is symmetric, but
# keeping the documented order avoids silent errors if the metric is swapped
# for an asymmetric one (precision, recall, ...).
accuracy_score(ytest, ypred)

0.9765100671140939

In [24]:
# RBF-kernel SVM evaluated on the same split for comparison.
svc = SVC(kernel="rbf")
svc.fit(xtrain, ytrain)
ypred = svc.predict(xtest)
# Arguments in the documented (y_true, y_pred) order.
accuracy_score(ytest, ypred)

0.9731543624161074

In [25]:
# Polynomial-kernel SVM evaluated on the same split for comparison.
svc = SVC(kernel="poly")
svc.fit(xtrain, ytrain)
ypred = svc.predict(xtest)
# Arguments in the documented (y_true, y_pred) order.
accuracy_score(ytest, ypred)

0.7718120805369127

In [26]:
# Sigmoid-kernel SVM evaluated on the same split for comparison.
svc = SVC(kernel="sigmoid")
svc.fit(xtrain, ytrain)
ypred = svc.predict(xtest)
# Arguments in the documented (y_true, y_pred) order.
accuracy_score(ytest, ypred)

0.9731543624161074

### Best Model

In [27]:
# Refit the linear kernel so that `svc` (overwritten by the sigmoid run in the
# previous cell) is the classifier used by the prediction cells that follow.
svc = SVC(kernel="linear")
svc.fit(xtrain, ytrain)
ypred = svc.predict(xtest)
# Arguments in the documented (y_true, y_pred) order.
accuracy_score(ytest, ypred)

0.9765100671140939

### Text Classification

In [28]:
# A one-element list: the vectorizer's transform() is given a list of documents.
headline = ["Elon musk plans to buy twitter for 44 Billion Dollars"]

In [29]:
# Use transform(), not fit_transform(), so the headline is encoded with the
# vectorizer state that was fitted earlier rather than refitting it.
transformed_headline = tfidf.transform(headline)

In [30]:
# Display the sparse matrix summary (shape and number of stored values).
transformed_headline

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [31]:
# Predict the category of the encoded headline with the currently fitted `svc`.
svc.predict(transformed_headline)

array(['business'], dtype=object)

In [32]:
# Second ad-hoc example: encode with the fitted vectorizer, then classify.
# (The bare `transformed_headline` expression that used to sit mid-cell was
# removed: only a cell's last expression is displayed, so it did nothing.)
headline = ["Tom cruise is planning to work on next mission impossible movie"]
transformed_headline = tfidf.transform(headline)
svc.predict(transformed_headline)

array(['entertainment'], dtype=object)

In [50]:
headline = ["howard  truanted to play snooker  conservative leader michael howard has admitted he used to play truant to spend time with his school friends at a snooker hall.  mr howard said his time at jack s snooker hall in llanelli in the 1950s had not done him  any lasting damage . but he told the times educational supplement that truancy was  very bad  and said  firm action  was needed. mr howard also called for a return to o-levels and more classroom discipline.  mr howard eventually left llanelli grammar school - and the snooker hall - to go to cambridge university. he said:  i don t think it s done me any lasting damage. nor has it made me a snooker world champion.  there might have been some occasions when we left early of an afternoon.   i m just being honest. i think truancy is a very bad thing and that firm action should be taken to deal with it.  another player who has failed to win snooker s world championship - jimmy  the whirlwind   white - has previously admitted missing lessons  instead spending his days in smoky halls.  tony meo [another player] and me used to spend all of our spare time there   mr white said   we loved the game and the atmosphere.  school went out of the window. i went for a while and then started taking time off.  mr howard s fellow welshman ray reardon - known by his fellow professionals as  dracula  - won the snooker world championship six times  having left school at 14 to work as a miner. and terry griffiths  like mr howard from llanelli  won the tournament in 1979. it is not known whether the two of them ever clashed cues at jack s."]
# NOTE: the text above is copied from the labelled dataset itself (the next
# cell looks the same article up in `data`), so this prediction is a sanity
# check rather than evidence of out-of-sample performance.
# The mid-cell bare `transformed_headline` expression was dropped: only the
# last expression of a cell is displayed, so it had no effect.
transformed_headline = tfidf.transform(headline)
svc.predict(transformed_headline)

array(['politics'], dtype=object)

In [51]:
# Fetch the article with index label 5 from the politics rows.
# A single .loc selection replaces the chained [mask]['Text'][5] lookup, and
# the trailing .loc[5] makes it explicit that 5 is an index label, not a
# position within the filtered rows.
data.loc[data['Category'] == 'politics', 'Text'].loc[5]

'howard  truanted to play snooker  conservative leader michael howard has admitted he used to play truant to spend time with his school friends at a snooker hall.  mr howard said his time at jack s snooker hall in llanelli in the 1950s had not done him  any lasting damage . but he told the times educational supplement that truancy was  very bad  and said  firm action  was needed. mr howard also called for a return to o-levels and more classroom discipline.  mr howard eventually left llanelli grammar school - and the snooker hall - to go to cambridge university. he said:  i don t think it s done me any lasting damage. nor has it made me a snooker world champion.  there might have been some occasions when we left early of an afternoon.   i m just being honest. i think truancy is a very bad thing and that firm action should be taken to deal with it.  another player who has failed to win snooker s world championship - jimmy  the whirlwind   white - has previously admitted missing lessons  