<a href="https://colab.research.google.com/github/ahmedm-g/NewsClassifier/blob/main/news_classifier_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support as score
# Fetch the NLTK resources used further down in this notebook: the stop-word
# list (stopwords.words), the Punkt tokenizer data (word_tokenize) and the
# WordNet corpus (WordNetLemmatizer).
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt'
# for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Fetch the BBC news training articles
def get_data():
  """Read the BBC news training CSV from its GitHub raw URL into a DataFrame."""
  csv_location = 'https://raw.githubusercontent.com/ahmedm-g/NewsClassifier/main/data_train_news_bbc.csv'
  articles = pd.read_csv(csv_location)
  return articles


data = get_data()

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich
# DataFrame rendering instead of the plain-text repr that print() produces.
data.head()

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business


In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1490, 3)

In [None]:
def lower(text):
  """Return `text` converted to lower case via its own `.lower()` method."""
  lowered = text.lower()
  return lowered


# Normalise every article body to lower case before tokenizing.
data['Text'] = data['Text'].apply(lower)

In [None]:
import string
from nltk.corpus import stopwords

# Tokens to discard: NLTK's English stop words followed by each character of
# string.punctuation as its own entry.
english_stops = [*stopwords.words('english'), *string.punctuation]

In [None]:
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
  """Tokenize `text` and return the tokens not in `english_stops`, joined by single spaces."""
  kept = []
  for word in word_tokenize(text):
    if word in english_stops:
      continue
    kept.append(word)
  return ' '.join(kept)


# Strip stop words and punctuation tokens from every article.
data['Text'] = data['Text'].apply(remove_stopwords)
data['Text'].head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens maj...
3    lifestyle governs mobile choice faster better ...
4    enron bosses 168m payout eighteen former enron...
Name: Text, dtype: object

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
  """Tokenize `text`, lemmatize each token with WordNet, and re-join with single spaces."""
  wnl = WordNetLemmatizer()
  return ' '.join(map(wnl.lemmatize, word_tokenize(text)))


# Reduce every token of every article to its WordNet lemma.
data['Text'] = data['Text'].apply(lemmatize_text)
data['Text'].head()

0    worldcom ex-boss launch defence lawyer defendi...
1    german business confidence slide german busine...
2    bbc poll indicates economic gloom citizen majo...
3    lifestyle governs mobile choice faster better ...
4    enron boss 168m payout eighteen former enron d...
Name: Text, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: token counts limited to 5000 vocabulary entries,
# densified with .toarray() so every downstream estimator receives an ndarray.
count_vectroizer = CountVectorizer(max_features=5000)
x = count_vectroizer.fit_transform(data['Text']).toarray()

# Target labels: the Category column copied into a NumPy array.
y = np.array(data['Category'].values)

print("X.shape = ", x.shape)
print("y.shape = ", y.shape)

NameError: ignored

In [None]:
from sklearn.model_selection import train_test_split
# Shuffle and hold out 30% of the rows for testing; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)
# Report the number of rows in each partition, one per line.
print(len(x_train), len(x_test), sep='\n')

1043
447


In [None]:
# Create list of model and accuracy dicts
perform_list = [ ]


def _round_metric(value, ndigits=2):
    """Round a metric that is either a single number or a per-class array.

    Returns a plain float for a scalar input and a list of floats otherwise,
    so the results table holds ordinary Python values in both cases.
    """
    if np.ndim(value) == 0:
        return round(float(value), ndigits)
    return [round(float(v), ndigits) for v in value]


def run_model(model_name, est_c, est_pnlty, average='micro'):
    """Fit one classifier one-vs-rest, print its test metrics and record them.

    The estimator is trained on the module-level `x_train` / `y_train` and
    scored on `x_test` / `y_test`. One summary dict per model is kept in the
    module-level `perform_list`; calling again with the same `model_name`
    replaces that model's entry instead of adding a duplicate row.

    Parameters
    ----------
    model_name : str
        One of 'Logistic Regression', 'Random Forest',
        'Multinomial Naive Bayes', 'Support Vector Classifer' (spelling kept
        as existing callers pass it), 'Decision Tree Classifier',
        'K Nearest Neighbour', 'Gaussian Naive Bayes'.
    est_c, est_pnlty
        Accepted so existing calls keep working; this function does not read
        them.
    average : str or None, default 'micro'
        Forwarded to `precision_recall_fscore_support`. A string such as
        'micro' yields one number per metric; `None` yields one value per
        class.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """
    # Factories are lambdas so an estimator is only constructed for the
    # model that was actually requested.
    factories = {
        'Logistic Regression': lambda: LogisticRegression(),
        'Random Forest': lambda: RandomForestClassifier(
            n_estimators=100, criterion='entropy', random_state=0),
        'Multinomial Naive Bayes': lambda: MultinomialNB(alpha=1.0, fit_prior=True),
        'Support Vector Classifer': lambda: SVC(),
        'Decision Tree Classifier': lambda: DecisionTreeClassifier(),
        'K Nearest Neighbour': lambda: KNeighborsClassifier(
            n_neighbors=10, metric='minkowski', p=4),
        'Gaussian Naive Bayes': lambda: GaussianNB(),
    }
    if model_name not in factories:
        # Fail early with a clear message instead of wrapping a placeholder
        # string in OneVsRestClassifier and failing obscurely inside fit().
        raise ValueError(
            f'Unknown model_name {model_name!r}; expected one of {sorted(factories)}'
        )
    mdl = factories[model_name]()

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(x_train, y_train)
    y_pred = oneVsRest.predict(x_test)

    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)

    # Get precision, recall, f1 scores (support is not used here)
    precision, recall, f1score, _ = score(y_test, y_pred, average=average)
    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')

    # Round all three metrics the same way, whether scalar or per-class.
    record = {
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Precision': _round_metric(precision),
        'Recall': _round_metric(recall),
        'F1': _round_metric(f1score),
    }

    # Replace an existing row for this model so re-running a cell does not
    # duplicate it in the results table; otherwise append a new row.
    for idx, existing in enumerate(perform_list):
        if existing['Model'] == model_name:
            perform_list[idx] = record
            break
    else:
        perform_list.append(record)

In [None]:
# Logistic regression built with default constructor arguments; est_c and
# est_pnlty are passed as None and are not read by run_model.
run_model('Logistic Regression', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Logistic Regression: % 96.64
Precision : [0.97222222 0.96296296 0.96385542 0.95283019 0.98550725]
Recall : [0.97222222 0.98734177 0.93023256 1.         0.93150685]
F1-score : [0.97222222 0.975      0.94674556 0.97584541 0.95774648]


In [None]:
# Multinomial naive Bayes (alpha=1.0, fit_prior=True inside run_model).
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Multinomial Naive Bayes: % 97.99
Precision : 0.9798657718120806
Recall : 0.9798657718120806
F1-score : 0.9798657718120806


In [None]:
# Random forest: 100 entropy-criterion trees with random_state=0 (set inside run_model).
run_model('Random Forest', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Random Forest: % 97.99
Precision : 0.9798657718120806
Recall : 0.9798657718120806
F1-score : 0.9798657718120806


In [None]:
# SVC with default constructor arguments. The 'Classifer' spelling must stay
# as-is: it is the exact key run_model compares against.
run_model('Support Vector Classifer', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Support Vector Classifer: % 95.97
Precision : [0.99047619 0.97435897 0.96296296 0.97087379 0.8875    ]
Recall : [0.96296296 0.96202532 0.90697674 0.99009901 0.97260274]
F1-score : [0.97652582 0.96815287 0.93413174 0.98039216 0.92810458]


In [None]:
# Decision tree built with default constructor arguments (no random_state is set in run_model).
run_model('Decision Tree Classifier', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Decision Tree Classifier: % 80.09
Precision : [0.94444444 0.94915254 0.8255814  0.93137255 0.53125   ]
Recall : [0.62962963 0.70886076 0.8255814  0.94059406 0.93150685]
F1-score : [0.75555556 0.8115942  0.8255814  0.93596059 0.67661692]


In [None]:
# k-nearest neighbours: n_neighbors=10, Minkowski metric with p=4 (set inside run_model).
run_model('K Nearest Neighbour', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic K Nearest Neighbour: % 70.02
Precision : [0.77777778 0.83870968 0.81818182 0.52879581 0.94736842]
Recall : [0.64814815 0.65822785 0.62790698 1.         0.49315068]
F1-score : [0.70707071 0.73758865 0.71052632 0.69178082 0.64864865]


In [None]:
# Gaussian naive Bayes built with default constructor arguments.
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Gaussian Naive Bayes: % 77.18
Precision : [0.94666667 0.95833333 1.         1.         0.4251497 ]
Recall : [0.65740741 0.58227848 0.69767442 0.96039604 0.97260274]
F1-score : [0.77595628 0.72440945 0.82191781 0.97979798 0.59166667]


In [None]:
# Turn the collected per-model result dicts into a table with a fixed column order.
column_order = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
model_performance = pd.DataFrame(data=perform_list)[column_order]
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Logistic Regression,96.64,"[0.97, 0.96, 0.96, 0.95, 0.99]","[0.97, 0.99, 0.93, 1.0, 0.93]","[0.97, 0.98, 0.95, 0.98, 0.96]"
1,Logistic Regression,96.64,"[0.97, 0.96, 0.96, 0.95, 0.99]","[0.97, 0.99, 0.93, 1.0, 0.93]","[0.97, 0.98, 0.95, 0.98, 0.96]"
2,Multinomial Naive Bayes,97.99,"[0.99, 1.0, 0.95, 0.99, 0.96]","[0.95, 0.99, 0.98, 1.0, 0.99]","[0.97, 0.99, 0.97, 1.0, 0.97]"
3,Random Forest,97.99,"[0.98, 0.99, 0.98, 1.0, 0.95]","[0.97, 1.0, 0.97, 1.0, 0.96]","[0.98, 0.99, 0.97, 1.0, 0.95]"
4,Support Vector Classifer,95.97,"[0.99, 0.97, 0.96, 0.97, 0.89]","[0.96, 0.96, 0.91, 0.99, 0.97]","[0.98, 0.97, 0.93, 0.98, 0.93]"
5,Decision Tree Classifier,80.09,"[0.94, 0.95, 0.83, 0.93, 0.53]","[0.63, 0.71, 0.83, 0.94, 0.93]","[0.76, 0.81, 0.83, 0.94, 0.68]"
6,K Nearest Neighbour,70.02,"[0.78, 0.84, 0.82, 0.53, 0.95]","[0.65, 0.66, 0.63, 1.0, 0.49]","[0.71, 0.74, 0.71, 0.69, 0.65]"
7,Gaussian Naive Bayes,77.18,"[0.95, 0.96, 1.0, 1.0, 0.43]","[0.66, 0.58, 0.7, 0.96, 0.97]","[0.78, 0.72, 0.82, 0.98, 0.59]"


In [None]:
# Determining best model based on accuracy
model = model_performance["Model"]
max_value = model_performance["Test Accuracy"].max()
# Derive the winner(s) from the table rather than hard-coding a name: several
# models can share the top accuracy, and a model may appear on more than one
# row, so collect each distinct name sitting at the maximum.
best_models = model[model_performance["Test Accuracy"] == max_value].drop_duplicates().tolist()
print("The best accuracy of model is", max_value, "from", ", ".join(best_models))

The best accuracy of model is 97.99 from Random


In [None]:
# Refit the random forest on its own and predict the held-out rows.
# NOTE(review): unlike run_model, this estimator is fitted directly rather
# than wrapped in OneVsRestClassifier.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)