In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("bbc-text.csv")

In [3]:
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
data["category"].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [5]:
import re
import nltk

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
corpus = []
for i in range(0, len(data)):
    review = re.sub('[^a-zA-Z0-9]', ' ', data['text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus

['tv futur hand viewer home theatr system plasma high definit tv digit video record move live room way peopl watch tv radic differ five year time accord expert panel gather annual consum electron show la vega discuss new technolog impact one favourit pastim us lead trend programm content deliv viewer via home network cabl satellit telecom compani broadband servic provid front room portabl devic one talk technolog ce digit person video record dvr pvr set top box like us tivo uk sky system allow peopl record store play paus forward wind tv programm want essenti technolog allow much personalis tv also built high definit tv set big busi japan us slower take europ lack high definit program peopl forward wind advert also forget abid network channel schedul put togeth la cart entertain us network cabl satellit compani worri mean term advertis revenu well brand ident viewer loyalti channel although us lead technolog moment also concern rais europ particularli grow uptak servic like sky happen 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
x=tv.fit_transform(corpus).toarray()

In [11]:
y=data["category"]

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)

In [13]:
from sklearn.ensemble import RandomForestClassifier
rfg=RandomForestClassifier(random_state=0)

In [14]:
rfg.fit(x_train,y_train)

In [15]:
y_pred=rfg.predict(x_test)

In [16]:
y_test.head()

384     politics
1983       sport
985        sport
1386       sport
1294       sport
Name: category, dtype: object

In [17]:
y_pred[0:5]

array(['politics', 'sport', 'sport', 'sport', 'sport'], dtype=object)

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9658886894075404

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

     business       0.98      0.97      0.97       126
entertainment       0.97      0.95      0.96        95
     politics       0.95      0.96      0.95        94
        sport       0.97      1.00      0.99       147
         tech       0.96      0.94      0.95        95

     accuracy                           0.97       557
    macro avg       0.96      0.96      0.96       557
 weighted avg       0.97      0.97      0.97       557



In [26]:
input="security warning over  fbi virus  the us federal bureau of investigation is warning that a computer virus is being spread via e-mails that purport to be from the fbi.  the e-mails show that they have come from an fbi.gov address and tell recipients that they have accessed illegal websites. the messages warn that their internet use has been monitored by the fbi s internet fraud complaint center. an attachment in the e-mail contains the virus  the fbi said. the message asks recipients to click on the attachment and answer some questions about their internet use. but rather than being a questionnaire  the attachment contains a virus that infects the recipient s computer  according to the agency. it is not clear what the virus does once it has infected a computer. users are warned never to open attachment from unsolicited e-mails or from people they do not know.   recipients of this or similar solicitations should know that the fbi does not engage in the practice of sending unsolicited e-mails to the public in this manner   the fbi said in a statement. the bureau is investigating the phoney e-mails. the agency earlier this month shut down fbi.gov accounts  used to communicate with the public  because of a security breach. a spokeswoman said the two incidents appear to be unrelated."

In [27]:
input_df=tv.transform([input])

In [28]:
Prediction=rfg.predict(input_df)

In [29]:
print(Prediction[0])

tech


In [24]:
# Serialization (the process of converting a Python object (like your trained model) into a file)
import joblib
joblib.dump(tv,"vectorizer.pkl")
joblib.dump(rfg,"model.pkl")

['model.pkl']

In [None]:
# Kohli goes down the ground! Kohli goes out of the ground
# The highly anticipated sequel to the blockbuster sci-fi film is set to hit cinemas this summer, the studio confirmed today. The movie's director promised fans an even more spectacular visual experience, with many of the original cast members reprising their roles. A full-length trailer is expected to be released next month, sparking huge excitement on social media.
# Shares in the retail giant surged by 15% after it reported record profits for the fourth quarter, far exceeding market expectations. The company attributed its strong performance to a significant increase in online sales and successful cost-cutting measures. Investors are now watching closely to see if this growth can be sustained amid growing competition and rising inflation.
# blair prepares to name poll date tony blair is likely to name 5 may as election day when parliament returns from its easter break  the bbc s political editor has learned.  andrew marr says mr blair will ask the queen on 4 or 5 april to dissolve parliament at the end of that week. mr blair has so far resisted calls for him to name the day but all parties have stepped up campaigning recently. downing street would not be drawn on the claim  saying election timing was a matter for the prime minister.  a number 10 spokeswoman would only say:  he will announce an election when he wants to announce an election.  the move will signal a frantic week at westminster as the government is likely to try to get key legislation through parliament. the government needs its finance bill  covering the budget plans  to be passed before the commons closes for business at the end of the session on 7 april.  but it will also seek to push through its serious and organised crime bill and id cards bill. mr marr said on wednesday s today programme:  there s almost nobody at a senior level inside the government or in parliament itself who doesn t expect the election to be called on 4 or 5 april.  as soon as the commons is back after the short easter recess  tony blair whips up to the palace  asks the queen to dissolve parliament ... and we re going.  the labour government officially has until june 2006 to hold general election  but in recent years governments have favoured four-year terms.