In [1]:
## import statements ##
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, f1_score, roc_auc_score

import warnings; warnings.simplefilter('ignore')

**Import Data from XLS file**

Using pandas to import the dataset and transform into a Dataframe.

In [2]:
data_files = 'News Title.xls'

data = pd.read_excel(data_files)
train_data = data[['News Title', 'Category']]
train_data.head()

Unnamed: 0,News Title,Category
0,Google+ rolls out 'Stories' for tricked out ph...,Technology
1,Dov Charney's Redeeming Quality,Business
2,White God adds Un Certain Regard to the Palm Dog,Entertainment
3,"Google shows off Androids for wearables, cars,...",Technology
4,China May new bank loans at 870.8 bln yuan,Business


**Data Cleansing Step**

1. Remove any symbols in the comments
2. Convert to lowercase

In [3]:
def process_content(content):
    return " ".join(re.findall("[A-Za-z]+",content.lower()))

In [4]:
train_data['processed_title'] = train_data['News Title'].apply(process_content)

In [5]:
train_data.head()

Unnamed: 0,News Title,Category,processed_title
0,Google+ rolls out 'Stories' for tricked out ph...,Technology,google rolls out stories for tricked out photo...
1,Dov Charney's Redeeming Quality,Business,dov charney s redeeming quality
2,White God adds Un Certain Regard to the Palm Dog,Entertainment,white god adds un certain regard to the palm dog
3,"Google shows off Androids for wearables, cars,...",Technology,google shows off androids for wearables cars tvs
4,China May new bank loans at 870.8 bln yuan,Business,china may new bank loans at bln yuan


In [6]:
# encoder = LabelEncoder()
# y = encoder.fit_transform(train_data['Category'])
# train_data['n_category'] = y
# print(y[:5])

**Data Distribution**

In [7]:
categories = train_data['Category']
titles = train_data['processed_title']
N = len(titles)
print('Number of news',N)

Number of news 65535


In [8]:
labels = list(set(categories))
n_classes = len(labels)
print('possible categories',labels)

possible categories ['Technology', 'Medical', 'Business', 'Entertainment']


In [9]:
for l in labels:
    print('number of ',l,' news',len(train_data.loc[train_data['Category'] == l]))

number of  Technology  news 16776
number of  Medical  news 7091
number of  Business  news 17707
number of  Entertainment  news 23961


**Split the data**

Split the data for data training of 80% and data testing of 20% with random pick of 57.
- Data Training : `X_train & y_train`
- Data Testing : `X_test & y_test`

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_title'],train_data['Category'],test_size=0.2,random_state=57)

**Data Pipeline**

1. Data Cleansing using `CountVectorize` with stop_word='english' to remove the stop word
2. Data transforming using `TF-IDF`
3. Model Training using `LogisticRegression`

In [11]:
model = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),
                     ])

**Fitting the model**

In [12]:
text_clf = model.fit(X_train, y_train)

**Predict the data testing**

In [13]:
predicted = model.predict(X_test)

**Confusion Matrix**

In [14]:
confusion_matrix(y_test,predicted)

array([[3114,  120,   34,  213],
       [  62, 4668,   16,   48],
       [ 110,  114, 1165,   41],
       [ 218,  141,   12, 3031]])

**Accuracy Score & Classification Report**

In [15]:
print('accuracy_score',accuracy_score(y_test,predicted))
print('Reporting...')

accuracy_score 0.9138628213931487
Reporting...


In [16]:
print(classification_report(y_test, predicted, target_names=labels))

               precision    recall  f1-score   support

   Technology       0.89      0.89      0.89      3481
      Medical       0.93      0.97      0.95      4794
     Business       0.95      0.81      0.88      1430
Entertainment       0.91      0.89      0.90      3402

  avg / total       0.91      0.91      0.91     13107



**Model Evaluation using Cross Validation**

On the data training

In [17]:
cross_val_score(model, X_train, y_train, cv=5)

array([0.90779939, 0.90987984, 0.90624702, 0.90948975, 0.91024418])

On the data testing

In [18]:
cross_val_score(model, X_test, y_test, cv=5)

array([0.86275257, 0.86384439, 0.85959557, 0.86760778, 0.86259542])

**`predict_title` function**

In [19]:
def predict_title(model, new_data):
    test_data = pd.DataFrame(new_data, columns=['News Title'])
    test_data['processed_title'] = test_data['News Title'].apply(process_content)
    
    X_test = test_data['processed_title']
    predictions = model.predict(X_test)
    
    return predictions

**New Data Sample Test**

In [20]:
t1 = ['Rupiah is the best in Asia today.']
news_title = pd.DataFrame(t1, columns=['News Title'])

In [21]:
predict_title(model, t1)

array(['Business'], dtype=object)