# News Category Prediction

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pickle

In [2]:
# Load the training data into a DataFrame.
df = pd.read_csv("Data_Train.csv")

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [4]:
# Count how many distinct labels the SECTION column holds.
df["SECTION"].nunique()

4

In [5]:
# List the distinct SECTION label values.
df["SECTION"].unique()

array([3, 0, 1, 2])

In [6]:
# (rows, columns) of the loaded data.
df.shape

(7628, 2)

In [7]:
# Cleaning the text present in the feature column

def story_cleaned(text):
    """Normalise one news story into lowercase, letters-only text.

    Steps, in order:
      1. lowercase, and map typographic apostrophes to the ASCII apostrophe;
      2. blank out escaped newline sequences (a backslash followed by "n")
         and URLs;
      3. expand common English contractions ("won't" -> "will not", ...);
      4. replace every character that is not a-z with a space (this also
         drops digits and all remaining punctuation);
      5. remove isolated single letters and collapse runs of whitespace.

    Steps 2 and 3 have to run before step 4 because they rely on the
    punctuation that step 4 removes. Running the strip first would leave
    fragments such as "won" or "doesn" behind as tokens.

    Args:
        text: Raw story. Any object is accepted and converted with str().

    Returns:
        str: The cleaned text. It is not stripped, so a single leading or
        trailing space may remain.
    """
    text = str(text).lower()

    # Curly apostrophes are treated like "'" so one set of contraction
    # patterns covers both spellings.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    # Escaped newlines (literal backslash + n) must go before the letter
    # strip, otherwise the stray "n" gets glued onto the following word.
    text = re.sub(r"\\n", " ", text)

    # URLs are only recognisable while "://" and "." are still present.
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)

    # Contractions. Specific irregular forms first, then the generic
    # suffixes. The \b anchors stop rewrites inside longer words
    # (e.g. "dont" inside "orthodontist").
    text = re.sub(r"\bwon't\b", "will not", text)
    text = re.sub(r"\bcan't\b", "can not", text)
    text = re.sub(r"\bdon'?t\b", "do not", text)
    text = re.sub(r"n't\b", " not", text)
    text = re.sub(r"'re\b", " are", text)
    text = re.sub(r"'s\b", " is", text)
    text = re.sub(r"'d\b", " would", text)
    text = re.sub(r"'ll\b", " will", text)
    text = re.sub(r"'ve\b", " have", text)
    text = re.sub(r"'m\b", " am", text)

    # Keep lowercase ASCII letters only; digits, punctuation, quotes and
    # line breaks all become spaces.
    text = re.sub(r"[^a-z]", " ", text)

    # Drop stand-alone single letters. \b is zero-width, so consecutive
    # single letters ("x y z") are all removed, including one at the start.
    text = re.sub(r"\b[a-z]\b", " ", text)

    # Collapse the whitespace runs left behind by the substitutions above.
    text = re.sub(r"\s+", " ", text)
    return text

# Clean every story. This overwrites the STORY column in place, so the raw
# text is no longer available in df after this cell runs.
df["STORY"] = df["STORY"].apply(story_cleaned)

In [8]:
# Preview the first rows again; STORY now holds the cleaned text.
df.head()

Unnamed: 0,STORY,SECTION
0,but the most painful was the huge reversal in ...,3
1,how formidable is the opposition alliance amon...,0
2,most asian currencies were trading lower today...,3
3,if you want to answer any question click on an...,1
4,in global markets gold prices edged up today a...,3


In [9]:
# Target: the SECTION labels as an array.
y = df["SECTION"].values
# Features: every column except the label.
X = df.drop(columns=["SECTION"])

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score , f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [11]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [12]:
# TF-IDF over unigrams and bigrams, with English stop words removed.
# A token is a run of at least three word characters.
tf1 = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern=r'\w{3,}',
)

# Fit on the training stories only, then reuse that fitted vectorizer to
# transform the held-out stories.
tfidf_train_story = tf1.fit_transform(X_train["STORY"])
tfidf_test_story = tf1.transform(X_test["STORY"])

In [13]:
# Generating pickle file for tf-idf.
# The context manager flushes and closes the file even if pickling fails;
# the bare open(...) previously left the handle open.
with open('tfidf-transform.pkl', 'wb') as tfidf_file:
    pickle.dump(tf1, tfidf_file)

In [14]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC

In [15]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training matrix,
# using alpha=0.5.
best_params = {'alpha': 0.5}
nb = MultinomialNB(**best_params)
nb.fit(tfidf_train_story, y_train)


MultinomialNB(alpha=0.5)

In [16]:
# Creating a pickle file for the Multinomial Naive Bayes model.
# The context manager flushes and closes the file even if pickling fails;
# the bare open(...) previously left the handle open.
filename = 'news-category-mnb-model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(nb, model_file)