# News Tweets Classifier
Reference:
- https://iq.opengenus.org/text-classification-using-k-nearest-neighbors/
- https://www.geeksforgeeks.org/saving-a-machine-learning-model/

Download the dataset and the preprocessor module

In [10]:
# Download the labeled tweet dataset (read with pd.read_csv below) and the
# preprocessor.py module (imported below) into the working directory.
!wget "https://github.com/bernardadhitya/news-tweet-classification/raw/master/news_tweets_labeled.csv"
!wget "https://github.com/bernardadhitya/news-tweet-classification/raw/master/preprocessor.py"

--2021-07-17 11:03:36--  https://github.com/bernardadhitya/news-tweet-classification/raw/master/news_tweets_labeled.csv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bernardadhitya/news-tweet-classification/master/news_tweets_labeled.csv [following]
--2021-07-17 11:03:36--  https://raw.githubusercontent.com/bernardadhitya/news-tweet-classification/master/news_tweets_labeled.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1637258 (1.6M) [text/plain]
Saving to: ‘news_tweets_labeled.csv’


2021-07-17 11:03:36 (25.1 MB/s) - ‘news_tweets_labeled.csv’ saved [1637258/1637258]

--2021-07-17 11:03:

Import library

In [11]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from preprocessor import Preprocessor

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Helper class imported from the downloaded preprocessor.py; later cells call its
# prepare_dataset and prepare_test_dataset methods.
preprocessor = Preprocessor()

Import and preprocess dataset 

In [13]:
# Load the labeled tweets and keep only the tweet text and its category label.
df = pd.read_csv('news_tweets_labeled.csv')[["text", "category"]]

In [14]:
sw = stopwords.words('english')
# Set membership is O(1) per token; `sw` itself stays a list for any code that reads it.
sw_lookup = set(sw)

wnl = WordNetLemmatizer()


def clean_text(text):
  """Normalize one tweet for vectorization.

  Replaces every character outside a-z/A-Z with a space, lower-cases the
  result, drops English stopwords and lemmatizes the remaining tokens.

  Args:
    text: Raw tweet text (str).

  Returns:
    The cleaned tokens joined by single spaces.
  """
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(wnl.lemmatize(token) for token in tokens if token not in sw_lookup)


# Transform the whole column at once instead of writing row by row with
# df.loc[i, 'text'], which is slow and only addresses the right rows when the
# index is exactly 0..n-1. assign() returns a new frame, so re-running this
# cell never writes into a slice of another DataFrame.
df = df.assign(text=df['text'].map(clean_text))

In [15]:
# Project-specific preparation step; what it does is defined in preprocessor.py,
# not in this notebook.
df = preprocessor.prepare_dataset(df)

# 80/20 split: sample 80% of the rows with a fixed seed so the split is
# reproducible, then use every row whose index label was not sampled as test data.
# NOTE(review): drop() removes rows by index label, so this assumes
# prepare_dataset returns a frame with unique index labels — TODO confirm.
train_data = df.sample(frac=0.8, random_state=200)
test_data = df.drop(train_data.index)

Build dictionary and transform

In [16]:
# Learn the vocabulary from the training tweets and convert them to a matrix of
# token counts (CountVectorizer).
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_data['text'])

# Re-weight the count matrix into a normalized tf-idf representation (TfidfTransformer).
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

Train the model and classify some new tweets

In [17]:
# k-nearest-neighbours classifier: vote among the 10 nearest training tweets,
# with neighbours weighted by distance (weights='distance').
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

# Train the classifier on the tf-idf features. The labels in train_data['category']
# are category names (e.g. 'business', 'health', 'others'), not numeric codes.
clf = knn.fit(x_train_tfidf, train_data['category'])

# Example tweets to classify into the known categories
tweets_new = ["Seven people have been arrested following a search of two vehicles in Hayle Cornwall police have said", 
            "RT MoneyTelegraph Your gas and electricity bills are set to increase by 36pc over the next 10 years",
            "The most common symptom of Covid19 is now a headache say experts as they warned people to get tested even if they think they are not suffering from the illness"]
# Build the count vectors for the new tweets using the vocabulary learned from the training data
x_new_counts = count_vect.transform(tweets_new)
# Call transform (not fit_transform) because the transformer has already been fit on the training data
x_new_tfidf = tfidf_transformer.transform(x_new_counts)

In [18]:
# Predict a category for each new tweet; predictions are category names
# (such as 'business'), one per input tweet.
predicted = clf.predict(x_new_tfidf)

# Show each tweet next to its predicted category.
for tweet, category in zip(tweets_new, predicted):
  print(f'{tweet!r} => {category}')

'Seven people have been arrested following a search of two vehicles in Hayle Cornwall police have said' => others
'RT MoneyTelegraph Your gas and electricity bills are set to increase by 36pc over the next 10 years' => business
'The most common symptom of Covid19 is now a headache say experts as they warned people to get tested even if they think they are not suffering from the illness' => health


Test the model

In [19]:
# Chain vectorizer -> tf-idf transformer -> classifier into one compound
# estimator that accepts raw text.
# Pipeline.fit fits its steps in place, so passing `knn` directly would re-fit
# the very object that `clf` refers to. Build a fresh, unfitted classifier with
# the same hyperparameters instead, leaving `knn`/`clf` untouched.
tweet_clf = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', KNeighborsClassifier(**knn.get_params()))
])

# Fit the whole pipeline on the raw training text and labels
tweet_clf.fit(train_data['text'], train_data['category'])

# Held-out tweets
docs_test = test_data['text']

# Predict categories for the held-out tweets
predicted = tweet_clf.predict(docs_test)

In [20]:
# Project-specific preparation of the held-out split; the transformation is
# defined in preprocessor.py, not in this notebook.
# NOTE(review): the evaluation metrics are computed on this prepared frame
# rather than on test_data directly — confirm that is intended.
test_data_temp = preprocessor.prepare_test_dataset(test_data)

In [21]:
# Score the pipeline on the prepared test set; precision, recall and F1 are
# macro-averaged, i.e. every category counts equally.
predicted_temp = tweet_clf.predict(test_data_temp['text'])
true_labels = test_data_temp['category']
precision, recall, f1_score, _ = precision_recall_fscore_support(
  true_labels, predicted_temp, average='macro'
)
accuracy = accuracy_score(true_labels, predicted_temp)

# Report every metric as a percentage.
for label, value in (
  ("Precision: ", precision),
  ("Recall: ", recall),
  ("F1 Score: ", f1_score),
  ("Accuracy: ", accuracy),
):
  print(label, value * 100, "%")

Precision:  85.68908733318426 %
Recall:  77.86976691554624 %
F1 Score:  80.17576209745275 %
Accuracy:  82.36363636363636 %


Save the model

In [22]:
# Serialize the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE: only unpickle model.pkl from a trusted source; loading a pickle can
# execute arbitrary code.
with open("model.pkl", "wb") as model_file:
  pickle.dump(tweet_clf, model_file)