In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('news.xls')

In [3]:
df_comment = df.drop(columns=['No'])

In [4]:
df_comment.head(20)

Unnamed: 0,News Title,Category
0,Google+ rolls out 'Stories' for tricked out ph...,Technology
1,Dov Charney's Redeeming Quality,Business
2,White God adds Un Certain Regard to the Palm Dog,Entertainment
3,"Google shows off Androids for wearables, cars,...",Technology
4,China May new bank loans at 870.8 bln yuan,Business
5,Firefox Windows 8 Metro Browser Development Ca...,Technology
6,Destiny Beta Kicks Off In July,Technology
7,Apple & Google's Motorola end legal battle,Technology
8,UPDATE 2-Facebook Q1 revenue grows 72 percent ...,Business
9,"Selena Gomez, Justin Bieber Spotted at the Sam...",Entertainment


In [5]:
df_comment.shape

(65535, 2)

In [24]:
X = df_comment['News Title']
y = df_comment['Category']

# Naive Bayes

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
predictions = nb.predict(X_test)

In [12]:
from sklearn.metrics import confusion_matrix,classification_report

In [25]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[4717  123   91  388]
 [ 122 6909   64  148]
 [ 116  114 1821   57]
 [ 367  142   59 4423]]


               precision    recall  f1-score   support

     Business       0.89      0.89      0.89      5319
Entertainment       0.95      0.95      0.95      7243
      Medical       0.89      0.86      0.88      2108
   Technology       0.88      0.89      0.88      4991

  avg / total       0.91      0.91      0.91     19661



# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [29]:
lg = LogisticRegression()
lg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
lg_pred = lg.predict(X_test)

In [31]:
print(confusion_matrix(y_test,lg_pred))
print('\n')
print(classification_report(y_test,lg_pred))

[[4798  131   55  335]
 [ 113 7043   32   55]
 [ 124  100 1836   48]
 [ 316  159   30 4486]]


               precision    recall  f1-score   support

     Business       0.90      0.90      0.90      5319
Entertainment       0.95      0.97      0.96      7243
      Medical       0.94      0.87      0.90      2108
   Technology       0.91      0.90      0.90      4991

  avg / total       0.92      0.92      0.92     19661



# Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
rf = RandomForestClassifier(n_estimators=300)

In [42]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
rf_pred = rf.predict(X_test)

In [44]:
print(confusion_matrix(y_test,rf_pred))
print('\n')
print(classification_report(y_test,rf_pred))

[[4476  490   59  294]
 [ 185 6952   47   59]
 [ 178  321 1545   64]
 [ 355  491   50 4095]]


               precision    recall  f1-score   support

     Business       0.86      0.84      0.85      5319
Entertainment       0.84      0.96      0.90      7243
      Medical       0.91      0.73      0.81      2108
   Technology       0.91      0.82      0.86      4991

  avg / total       0.87      0.87      0.87     19661



## Test

In [14]:
def pred_news(news):
    news = [news]
    news = cv.transform(news)
    return nb.predict(news)

In [15]:
pred_news('Google launches new version of android')

array(['Technology'], dtype='<U13')

In [17]:
pred_news('Twice released new full album with title eyes wide open')

array(['Entertainment'], dtype='<U13')

In [19]:
pred_news('Covid 19 in indonesia rises, World Health Organization warn indonesia government')

array(['Medical'], dtype='<U13')

In [23]:
pred_news('Massive $27b tax break for 99pc of businesses')

array(['Business'], dtype='<U13')