# News Tweets Classifier
References:
- https://iq.opengenus.org/text-classification-using-k-nearest-neighbors/
- https://www.geeksforgeeks.org/saving-a-machine-learning-model/

Mount drive

In [30]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import libraries

In [31]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Import and preprocess the dataset

In [32]:
# Path of the labeled tweets CSV under the mounted Drive folder.
FILENAME = "/content/drive/My Drive/News Tweets Classifier/news_tweets_labeled.csv"

# Read the CSV and keep only the tweet text and its category label.
columns_to_keep = ["text", "category"]
df = pd.read_csv(FILENAME)[columns_to_keep]

In [33]:
# English stop words from NLTK. The frozenset copy gives O(1) membership tests
# in the token filter below (the list would be scanned once for every token).
sw = stopwords.words('english')
stop_words = frozenset(sw)

wnl = WordNetLemmatizer()


def clean_text(text):
  """Normalise one tweet for the bag-of-words model.

  Every character that is not an ASCII letter is replaced by a space, the
  result is lower-cased and split on whitespace, stop words are removed and
  each remaining token is lemmatised with WordNet.

  Args:
    text: Raw tweet text (str).

  Returns:
    The cleaned tokens joined by single spaces.
  """
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(wnl.lemmatize(token) for token in tokens if token not in stop_words)


# Clean the whole column in one pass. Unlike a `df.loc[i, 'text']` loop over
# range(df.shape[0]), this does not assume the index labels are exactly 0..n-1
# and avoids one scalar assignment per row.
df['text'] = df['text'].map(clean_text)

In [34]:
# Keep only the first 3000 rows and show how many tweets each category has.
df = df.head(3000)
print(df.groupby('category').count())

# Random 80/20 split with a fixed seed: the test set is every row that was
# not sampled into the training set.
train_data = df.sample(frac=0.8, random_state=200)
is_train_row = df.index.isin(train_data.index)
test_data = df[~is_train_row]

               text
category           
business        290
entertainment   250
health          283
lifestyle       326
others          808
politics        596
science         126
sport           216
technology       52
weather          53


Build dictionary and transform

In [35]:
train_texts = train_data['text']

# CountVectorizer learns the vocabulary from the training tweets and encodes
# every tweet as a row of token counts.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_texts)

# TfidfTransformer rescales the count matrix into a normalized tf-idf representation.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

Train the model and classify some new tweets

In [36]:
# 10-nearest-neighbour classifier whose votes are weighted by distance.
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

# Fit on the tf-idf features. The targets are the category names stored in
# train_data['category'] (strings such as 'business' or 'health'), not numeric codes.
knn.fit(x_train_tfidf, train_data['category'])
clf = knn  # fit() returns the estimator itself, so clf and knn are the same object

# Unseen example tweets to classify into the known categories
tweets_new = [
  "Seven people have been arrested following a search of two vehicles in Hayle Cornwall police have said",
  "RT MoneyTelegraph Your gas and electricity bills are set to increase by 36pc over the next 10 years",
  "The most common symptom of Covid19 is now a headache say experts as they warned people to get tested even if they think they are not suffering from the illness",
]

# Encode the new tweets with the vocabulary learned from the training data
x_new_counts = count_vect.transform(tweets_new)
# Only transform() here: the tf-idf weights were already fitted on the training data
x_new_tfidf = tfidf_transformer.transform(x_new_counts)

In [37]:
# Predict a category label (for example 'business') for each new tweet
predicted = clf.predict(x_new_tfidf)

# Show every tweet next to its predicted category
for tweet, category in zip(tweets_new, predicted):
  print(f'{tweet!r} => {category!s}')

'Seven people have been arrested following a search of two vehicles in Hayle Cornwall police have said' => others
'RT MoneyTelegraph Your gas and electricity bills are set to increase by 36pc over the next 10 years' => business
'The most common symptom of Covid19 is now a headache say experts as they warned people to get tested even if they think they are not suffering from the illness' => health


Test the model

In [38]:
# Chain vectorizer -> tf-idf transformer -> classifier into one compound
# estimator that works directly on the (preprocessed) tweet text.
# The classifier step is a fresh, unfitted KNN built from knn's
# hyper-parameters: passing `knn` itself would make Pipeline.fit refit that
# shared instance in place (`clf` refers to the same object).
tweet_clf = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', KNeighborsClassifier(**knn.get_params())),
])

# Fit all three steps on the training split
tweet_clf.fit(train_data['text'], train_data['category'])

# Held-out tweets
docs_test = test_data['text']

# Predict the held-out tweets and report the share of exact label matches
predicted = tweet_clf.predict(docs_test)
print('We got an accuracy of', np.mean(predicted == test_data['category']) * 100, '% over the test data.')

We got an accuracy of 75.5 % over the test data.


In [39]:
# Collect the test rows whose predicted label differs from the true category.
# `predicted` was produced from test_data['text'], so it lines up with
# test_data by position; hence the iloc lookups.
false_pred = [
  test_data.iloc[pos]
  for pos, pred in enumerate(predicted)
  if str(pred) != str(test_data.iloc[pos]['category'])
]

In [40]:
# Turn the collected misclassified rows into a DataFrame and display it.
false_pred = pd.DataFrame(data=false_pred)
false_pred

Unnamed: 0,text,category
124,north korean leader kim jong un bigger problem...,politics
162,another lawsuit challenging affordable care ac...,politics
174,want know happened january th join drew griffi...,politics
184,much emphasis placed onworries able get back s...,others
185,civil complaint alleges pornhub parent company...,business
...,...,...
2902,rt bbcnews british woman overjoyed becoming gr...,lifestyle
2913,airbnb reportedly pay tourist rape,business
2943,e nintendo show zelda breath wild,entertainment
2965,irish scientist identify covid patient develop...,health


In [41]:
# Index labels of test rows that are left out of the evaluation that follows.
# NOTE(review): every label visible in the misclassified-tweets table (124, 162,
# 174, 184, 185, ..., 2902, 2913, 2943, 2965) is in this list, so it looks like a
# hand-picked subset of misclassified rows. Removing them raises the reported
# metrics; confirm this is intended (e.g. the rows are mislabelled).
# The labels also depend on the random_state=200 split: drop() raises KeyError
# if any of them is not in test_data.
idx = [124, 162, 174, 184, 185, 292, 460, 464, 521, 527, 574, 588, 625, 705, 763, 842, 854, 901, 902, 1006, 1043, 1050, 1102, 1274, 1326, 1381, 1396, 1404, 1409, 1619, 1656, 1867, 1882, 1884, 2025, 2100, 2266, 2277, 2311, 2329, 2365, 2441, 2655, 2742, 2811, 2902, 2913, 2943, 2965, 2977]
print(len(idx))

# drop() returns a new frame, so test_data itself stays untouched.
test_data_temp = test_data.drop(index=idx)

50


In [42]:
# Score the pipeline on the reduced test set
y_true = test_data_temp['category']
predicted_temp = tweet_clf.predict(test_data_temp['text'])

# Macro-averaged precision/recall/F1 (the fourth return value is not used) plus plain accuracy
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, predicted_temp, average='macro')
accuracy = accuracy_score(y_true, predicted_temp)

# Print every score as a percentage
for label, score in (
  ("Precision: ", precision),
  ("Recall: ", recall),
  ("F1 Score: ", f1_score),
  ("Accuracy: ", accuracy),
):
  print(label, score * 100, "%")

Precision:  85.68908733318426 %
Recall:  77.86976691554624 %
F1 Score:  80.17576209745275 %
Accuracy:  82.36363636363636 %


Save the model

In [44]:
# Persist the fitted pipeline to model.pkl. The context manager closes the
# file handle even if pickling fails, which the bare open(...) call did not.
with open("model.pkl", "wb") as model_file:
  pickle.dump(tweet_clf, model_file)