In [38]:
import pandas as pd

In [39]:
df = pd.read_excel('news.xls')

In [40]:
df_comment = df.drop(columns=['No'])

In [41]:
df_comment.head()

Unnamed: 0,News Title,Category
0,Google+ rolls out 'Stories' for tricked out ph...,Technology
1,Dov Charney's Redeeming Quality,Business
2,White God adds Un Certain Regard to the Palm Dog,Entertainment
3,"Google shows off Androids for wearables, cars,...",Technology
4,China May new bank loans at 870.8 bln yuan,Business


In [42]:
df_comment.shape

(65535, 2)

In [43]:
X = df_comment['News Title']
y = df_comment['Category']

# Naive Bayes

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [46]:
from sklearn.naive_bayes import MultinomialNB

In [47]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
predictions = nb.predict(X_test)

In [49]:
from sklearn.metrics import confusion_matrix,classification_report

In [50]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[4687  159   77  420]
 [ 101 6925   61  154]
 [ 140  101 1839   58]
 [ 353  117   54 4415]]


               precision    recall  f1-score   support

     Business       0.89      0.88      0.88      5343
Entertainment       0.95      0.96      0.95      7241
      Medical       0.91      0.86      0.88      2138
   Technology       0.87      0.89      0.88      4939

  avg / total       0.91      0.91      0.91     19661



# Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [54]:
lg = LogisticRegression()
lg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
lg_pred = lg.predict(X_test)

In [56]:
print(confusion_matrix(y_test,lg_pred))
print('\n')
print(classification_report(y_test,lg_pred))

[[4816  139   51  337]
 [ 103 7045   33   60]
 [ 139  104 1845   50]
 [ 313  138   25 4463]]


               precision    recall  f1-score   support

     Business       0.90      0.90      0.90      5343
Entertainment       0.95      0.97      0.96      7241
      Medical       0.94      0.86      0.90      2138
   Technology       0.91      0.90      0.91      4939

  avg / total       0.92      0.92      0.92     19661



## Test

In [78]:
def pred_news(news):
    news = [news]
    news = cv.transform(news)
    return nb.predict(news)

In [79]:
pred_news('Google launches new version of android')

array(['Technology'], dtype='<U13')

In [80]:
pred_news('Twice released new full album with title eyes wide open')

array(['Entertainment'], dtype='<U13')

In [81]:
pred_news('Covid 19 in indonesia rises, World Health Organization warn indonesia government')

array(['Medical'], dtype='<U13')

In [82]:
pred_news('Massive $27b tax break for 99pc of businesses')

array(['Business'], dtype='<U13')