In [1]:
import pandas as pd
# Read the labelled news articles. The path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='./data/training.csv')
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,article,category
0,24644,Global emissions targets will lead to 4C tempe...,CLIMATE
1,115540,Climate sceptics place bets on world cooling d...,SCIENCE
2,113058,This brain parasite can increase your chances ...,SOCIETY
3,79582,In pictures: RSPB's Big Garden Birdwatch - wha...,CLIMATE
4,76416,Duma wants convincing arguments for Kyoto rati...,BUSINESS


In [2]:
# Column names of the loaded frame as a plain Python list.
df.columns.tolist()

['Unnamed: 0', 'article', 'category']

In [3]:
# Remove the 'Unnamed: 0' column; only 'article' and 'category' are kept.
df = df.drop(labels=['Unnamed: 0'], axis='columns')
df.head()

Unnamed: 0,article,category
0,Global emissions targets will lead to 4C tempe...,CLIMATE
1,Climate sceptics place bets on world cooling d...,SCIENCE
2,This brain parasite can increase your chances ...,SOCIETY
3,In pictures: RSPB's Big Garden Birdwatch - wha...,CLIMATE
4,Duma wants convincing arguments for Kyoto rati...,BUSINESS


In [4]:
# Keep only rows that have article text. The explicit .copy() makes `df` an
# independent frame, so the column assignment below does not write into a
# slice of the previous frame (which triggers pandas' SettingWithCopyWarning).
df = df[pd.notnull(df['article'])].copy()
# Encode each category label as an integer id, numbered in order of first
# appearance.
df['category_id'] = df['category'].factorize()[0]
# One row per distinct (category, category_id) pair, ordered by id.
category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
# Lookup tables in both directions: label -> id and id -> label.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
df.head()

Unnamed: 0,article,category,category_id
0,Global emissions targets will lead to 4C tempe...,CLIMATE,0
1,Climate sceptics place bets on world cooling d...,SCIENCE,1
2,This brain parasite can increase your chances ...,SOCIETY,2
3,In pictures: RSPB's Big Garden Birdwatch - wha...,CLIMATE,0
4,Duma wants convincing arguments for Kyoto rati...,BUSINESS,3


In [5]:
# Persist the cleaned, id-encoded frame. index=True is the pandas default and
# is spelled out here: the row index is written as the first, unnamed column.
df.to_csv('./data/train.csv', index=True)

In [7]:
# Number of articles per category id, most frequent first.
counts = df['category_id'].value_counts()
print(counts)

5    22318
3    21567
4    21240
0    14770
2    13873
1    11293
8     8292
6     7364
7     6410
Name: category_id, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. Inputs are the article text and
# targets are the integer category ids; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.article, df.category_id, random_state=1)

In [12]:
# Check the dtype of the training inputs (for a Series, `.dtypes` is the
# same value as `.dtype`).
print(X_train.dtypes)

object


In [13]:
# Report how many articles ended up in each side of the split.
print("Training dataset: ", len(X_train))
print("Test dataset: ", len(X_test))

Training dataset:  95345
Test dataset:  31782


In [14]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Word-count features with English stop words removed.
count_vector = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training articles only; the test
# articles are then encoded against that same vocabulary.
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [15]:
from imblearn.over_sampling import SMOTE

# Seed SMOTE so the synthetic minority samples are reproducible between runs.
sm = SMOTE(random_state=1)

# imbalanced-learn 0.4 renamed `fit_sample` to `fit_resample` and later
# removed the old name; use whichever method this installation provides.
_smote_fit = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train_res, y_train_res = _smote_fit(training_data, y_train)
# Oversampling with SMOTE gave no notable improvement, so the resampled data
# is not used to train the model below.

In [18]:
# Class sizes after oversampling.
pd.Series(y_train_res).value_counts()

8    16803
7    16803
6    16803
5    16803
4    16803
3    16803
2    16803
1    16803
0    16803
dtype: int64

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the bag-of-words counts of the original
# (not SMOTE-resampled) training split.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=training_data, y=y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Predicted category id for every held-out article.
predictions = naive_bayes.predict(X=testing_data)
predictions

array([7, 5, 2, ..., 8, 1, 7])

In [26]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Overall accuracy, then the class-frequency-weighted averages of the
# per-class recall, precision and F1.
print("Accuracy score: ", accuracy_score(y_test, predictions))
for _label, _metric in (
    ("Recall score: ", recall_score),
    ("Precision score: ", precision_score),
    ("F1 score: ", f1_score),
):
    print(_label, _metric(y_test, predictions, average='weighted'))

Accuracy score:  0.898307217922
Recall score:  0.898307217922
Precision score:  0.902011064172
F1 score:  0.89778400195


In [27]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics: each returned array has one entry per category id.
precision, recall, fscore, support = score(y_true=y_test, y_pred=predictions)

# Only precision and recall are displayed here.
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))

precision: [ 0.81588032  0.78906014  0.88527948  0.92250701  0.90823845  0.9854307
  0.91671145  0.87507003  0.95906711]
recall: [ 0.9519334   0.90691212  0.87301587  0.85299019  0.76494923  0.98114234
  0.9793341   0.94494858  0.96642686]


In [28]:
import pickle

# Serialise the trained classifier. The `with` block closes the file handle,
# so the pickle is fully flushed to disk as soon as the cell finishes.
filename = "./news-classification/model.pkl"
with open(filename, 'wb') as model_pkl:
    pickle.dump(naive_bayes, model_pkl)

In [29]:
import numpy as np
# Store the id -> label mapping next to the model. np.save wraps the dict in
# a 0-d object array and pickles it, so loading needs `.item()` to get the
# dict back.
np.save(file='./news-classification/id_to_category.npy', arr=id_to_category)

In [30]:
# Show the id -> category label mapping that was just saved.
print(id_to_category)

{0: 'CLIMATE', 1: 'SCIENCE', 2: 'SOCIETY', 3: 'BUSINESS', 4: 'SCI-TECH', 5: 'SPORTS', 6: 'ENTERTAINMENT', 7: 'POLITICS', 8: 'EDUCATION'}


In [31]:
# Smoke-test the pipeline on one article. Take the first remaining row by
# position: after the null-article rows were filtered out, index label 0 is
# not guaranteed to exist, so a label lookup (`df.article[0]`) could raise.
# The vectorizer expects an iterable of documents, hence the one-element list.
data = [df['article'].iloc[0]]
vect = count_vector.transform(data)
my_pred = naive_bayes.predict(vect)

In [32]:
print(my_pred)
# Convert to a list of plain Python ints. Going through np.asarray keeps the
# cell re-runnable: on a second run `my_pred` is already a list, which has no
# `.tolist()` method.
my_pred = np.asarray(my_pred).tolist()

[0]


In [35]:
# The mapping was saved as a pickled 0-d object array, so it must be loaded
# with allow_pickle=True (NumPy >= 1.16.3 defaults to False and raises
# ValueError otherwise). `.item()` unwraps the array back into the dict.
# NOTE: unpickling executes code; only load files this notebook produced.
id_to_category_load = np.load('./news-classification/id_to_category.npy', allow_pickle=True).item()
print(id_to_category_load[my_pred[0]])

CLIMATE


In [36]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back to the bundled
# copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted vocabulary (term -> column index) so the same encoding
# can be rebuilt outside this notebook.
joblib.dump(count_vector.vocabulary_, './news-classification/count_vector')

['./news-classification/count_vector']