In [2]:
# Necessary Imports
# Code idea from Kaggle Submissions to https://www.kaggle.com/c/learn-ai-bbc/data
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [9]:
# Labeled BBC news data from the Kaggle "learn-ai-bbc" competition:
# https://www.kaggle.com/c/learn-ai-bbc/data
reg_data = pd.read_csv(filepath_or_buffer="BBC News Train.csv")  # regular (non-health) data

# The BBC set has no "health" category, so it is supplemented with tweets
# from news organisations' Twitter handles that cover health stories.
health_data = pd.read_csv(filepath_or_buffer="health_headlines.csv")  # health data

In [10]:
# Build one usable training frame from the two sources loaded above.
#
# "Unnamed: 0" is removed from the health frame (presumably a stray index
# column written by an earlier to_csv -- confirm against the CSV).
# The result is rebound instead of dropped in place, and errors="ignore" is
# passed, so re-running this cell after the column is already gone does not
# raise KeyError.
health_data = health_data.drop(columns="Unnamed: 0", errors="ignore")

# Single combined DataFrame: health rows first, then the BBC rows.
# concat keeps each frame's own row labels, so index values may repeat.
data = pd.concat([health_data, reg_data])

In [11]:
# Inspect the combined frame (displayed as the cell's last expression)
data

Unnamed: 0,Text,Category
0,Mayor Bill de Blasio has proposed an ambitious...,health
1,Troubling. (SARS-CoV-2 was created in a lab. I...,health
2,Warnings from top health officialsFauci says v...,health
3,Wouldn't it make sense for the world to priori...,health
4,When Oregon public health workers got stuck in...,health
...,...,...
1521,There are still too many families who are just...,health
1522,This is the last GDP report from Trumps tenure...,health
1523,"Paid time off to get vaccinated, featuring and...",health
1524,According to researchers from the Centers for ...,health


In [14]:
# Normalise the text columns and derive an integer label per category.
# Lower-casing happens first so categories differing only in case share one id.
data["Category"] = data["Category"].str.lower()
data["Text"] = data["Text"].str.lower()

# factorize numbers categories in order of first appearance.
data["Cat_id"] = pd.factorize(data["Category"])[0]

# Finally discard every row that has a missing value in any column.
data.dropna(inplace=True)

In [15]:
# Sanity check: rows per integer label (compare with the per-category counts)
data["Cat_id"].value_counts()

4    346
1    336
3    274
5    273
2    261
0    163
Name: Cat_id, dtype: int64

In [16]:
# Sanity check: rows per category name (compare with the per-label counts)
data["Category"].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
health           163
Name: Category, dtype: int64

In [None]:
# Reference only (intentionally left commented out): Cat_id -> Category mapping for the labels
# classes = {4: "sport", 1: "business", 3: "politics", 5: "entertainment", 2:"tech", 0: "health"}

In [17]:
# TF-IDF features for the logit model.
# Parameter values are taken from leading Kaggle submissions on the same task.
# TODO: Modify some param values to check if that changes accuracy (unlikely)
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=5,            # ignore terms seen in fewer than 5 documents
    sublinear_tf=True,
    norm='l2',
)

# Integer class labels and the dense document-term matrix
y = data["Cat_id"]
X = tfidf.fit_transform(data["Text"]).toarray()

In [18]:
# Multiclass logistic regression; random_state is pinned for repeatable fits
model = LogisticRegression(solver="saga", random_state=0)

In [19]:
# 5-fold cross-validated accuracy of the logit model on the TF-IDF features.
# NOTE(review): X was vectorized on the whole corpus before splitting, so the
# fold scores may be slightly optimistic -- confirm whether that matters here.
cross_val_score(estimator=model, X=X, y=y, cv=5, scoring="accuracy")

array([0.97583082, 0.95770393, 0.98489426, 0.96969697, 0.97878788])

In [21]:
# Final fit on all labeled rows; nothing is held out here because the model
# is applied afterwards to separate, unseen news text.
model.fit(X=X, y=y)

LogisticRegression(random_state=0, solver='saga')

In [22]:
# Sample of tweet texts from news organisations to be labeled by the model

news = pd.read_csv(filepath_or_buffer="news_sample.csv")

In [23]:
# Clean the tweet text (handles, links, etc.) with the `preprocessor` package

import preprocessor as p
news["text"] = news["text"].apply(p.clean)

In [25]:
# Writes a CSV file with the predicted label for each supplied text

def return_class(i):
    """Translate an integer class id into its category name.

    Args:
        i: Class id in the range 0-5, as produced by the label encoding.

    Returns:
        The category name for ``i``. Note that id 4 maps to "sports" here,
        whereas the training data spells that category "sport".

    Raises:
        KeyError: If ``i`` is not one of the six known ids.
    """
    # Position in the tuple is the class id.
    ordered_labels = ("health", "business", "tech", "politics", "sports", "entertainment")
    label_by_id = dict(enumerate(ordered_labels))
    return label_by_id[i]

# Vectorize the cleaned tweet text with the already-fitted TF-IDF vocabulary.
# The original passed `texts`, a name never defined in this notebook, which
# raises NameError on a fresh kernel; the cleaned `text` column is the input.
transformed_text = tfidf.transform(news["text"])

# Predict integer class ids, then replace them with readable category names.
predictions = model.predict(transformed_text)
news["predictions"] = predictions
news["predictions"] = news["predictions"].apply(return_class)

# Persist the texts together with their predicted labels.
news.to_csv("out_bbc.csv")