In [1]:
# Necessary Imports
# Code idea from Kaggle Submissions to https://www.kaggle.com/c/learn-ai-bbc/data
# Modeled on https://www.kaggle.com/rmisra/news-category-dataset

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Read the newline-delimited JSON file (one record per line) into a DataFrame.
data = pd.read_json(path_or_buf="News_Category_Dataset_v2.json", lines=True)

# Mirror the downloaded data to CSV so it can be checked by hand.
data.to_csv(path_or_buf="news_category_train.csv")

In [3]:
# Choosing select entries to keep training data balanced, relevant and small


def _first_rows_of_category(frame, category, limit):
    """Return at most `limit` leading rows of `frame` whose `category` column equals `category`.

    Rows keep their original order and index labels. If fewer than `limit`
    rows match, all matching rows are returned.
    """
    return frame[frame.category == category].iloc[:limit]


politics = _first_rows_of_category(data, "POLITICS", 5000)

sports = _first_rows_of_category(data, "SPORTS", 5000)

# "HEALTH" is a merged class: up to 2000 WELLNESS rows followed by up to
# 3000 HEALTHY LIVING rows, relabelled so both share a single category.
wellness = _first_rows_of_category(data, "WELLNESS", 2000)
healthy = _first_rows_of_category(data, "HEALTHY LIVING", 3000)
health = pd.concat([wellness, healthy])
health["category"] = "HEALTH"

entertainment = _first_rows_of_category(data, "ENTERTAINMENT", 5000)

business = _first_rows_of_category(data, "BUSINESS", 5000)

In [4]:
# Building the comprehensive headline dataset- cleaning
# Stack the per-category samples, add lower-cased copies of headline/category
# under the names `text`/`cat`, then drop the original and unused columns.

final_data = (
    pd.concat([health, business, politics, sports, entertainment])
    .assign(
        text=lambda frame: frame["headline"].str.lower(),
        cat=lambda frame: frame["category"].str.lower(),
    )
    .drop(
        columns=[
            "authors",
            "link",
            "short_description",
            "date",
            "category",
            "headline",
        ]
    )
)

In [8]:
# Persist the cleaned (text, cat) table as CSV.
# The file name now has a proper ".csv" extension instead of a stray ")".
final_data.to_csv("News_Category.csv")

Unnamed: 0,text,cat
124989,why overeating doesn't make you fat,health
124990,14 habits of people with a healthy relationshi...,health
124993,5 things that could be stealing your joy,health
124994,moments make a life,health
124996,fat facts,health
...,...,...
38977,vanessa and nick lachey welcome christmas eve ...,entertainment
38979,remembering george michael with 21 of his grea...,entertainment
38998,the first 'alien: covenant' trailer is a terri...,entertainment
39008,'rogue one' dominates the holiday box office,entertainment


In [9]:
# Factorizing Categorical Values
# Each distinct `cat` string gets an integer id, numbered in order of first appearance.
final_data["cid"] = pd.factorize(final_data["cat"])[0]

In [11]:
# Preparing data for logit modeling with TF-IDF
# Parameters obtained from leading Kaggle submissions tackling same task
# TODO: Modify some param values to check if that changes accuracy (unlikely)

tfidf = TfidfVectorizer(
    encoding="latin-1",
    stop_words="english",
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=5,            # ignore terms seen in fewer than 5 headlines
    sublinear_tf=True,
    norm="l2",
)

# Fit the vocabulary on every headline and materialise a dense feature matrix;
# the integer class ids are the targets.
X = tfidf.fit_transform(final_data["text"]).toarray()
y = final_data["cid"]

In [13]:
# Logit Model- Multiclass
# Train Test split

# Hold out 20% of the rows. The DataFrame index is split alongside X and y so
# each train/test row can be traced back to final_data.
(
    X_train, X_test,
    y_train, y_test,
    indices_train, indices_test,
) = train_test_split(X, y, final_data.index, test_size=0.2, random_state=0)

model = LogisticRegression(solver="sag", max_iter=500, random_state=0)

In [20]:
# Checking for cross_val accuracy
# 3-fold cross-validated accuracy computed over the full feature matrix X, y.

cross_val_score(estimator=model, X=X, y=y, cv=3, scoring="accuracy")

array([0.74358047, 0.77154913, 0.76042923])

In [14]:
# Fit on training data - can fit on all data as well since we test on unseen data
# Only the 80% training partition is used here.

model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=500, random_state=0, solver='sag')

In [16]:
# Checking number label corresponds to which category
# One row per distinct (cat, cid) pair.
temp = final_data.loc[:, ["cat", "cid"]].drop_duplicates()
temp

Unnamed: 0,cat,cid
124989,health,0
87,business,1
13,politics,2
80,sports,3
1,entertainment,4


In [19]:
# News sample is sample data with text of tweets from news orgs

import preprocessor as p

news = pd.read_csv(filepath_or_buffer="news_sample.csv")

# Run every tweet through the preprocessor's clean() before vectorizing,
# then keep the cleaned strings as a plain Python list.
news["text"] = news["text"].apply(p.clean)
texts = news["text"].tolist()

# Outputs a csv file with predicted labels for the texts supplied

def return_class(i):
    """Translate a numeric class id into its category name.

    Args:
        i: Integer class id in the range 0-4.

    Returns:
        The lower-case category name for that id.

    Raises:
        KeyError: If `i` is not one of the five known ids.
    """
    label_by_id = {
        0: "health",
        1: "business",
        2: "politics",
        3: "sports",
        4: "entertainment",
    }
    return label_by_id[i]

# Vectorize the cleaned tweets with the already-fitted TF-IDF vocabulary,
# predict a class id for each, and store the readable category name.
transformed_text = tfidf.transform(texts)
predictions = model.predict(transformed_text)
news["predictions"] = [return_class(class_id) for class_id in predictions]

# Write the tweets together with their predicted category.
news.to_csv(path_or_buf="out_news_category.csv")