In [1]:
import nltk
import numpy as np
import operator
import pandas as pd
import pickle
import pydotplus
import re
import seaborn as sns
import sys
import time
import warnings
import zipfile

from collections import (Counter)
from imblearn.under_sampling \
    import (RandomUnderSampler,
            ClusterCentroids,
            TomekLinks)
from imblearn.over_sampling \
    import (RandomOverSampler,
            SMOTE)
from io import (StringIO)
from IPython.display import (Image)
from matplotlib import (pyplot as plt)
from mlxtend.classifier import (StackingClassifier)
from nltk.corpus \
    import (stopwords)
from nltk.tokenize \
    import (word_tokenize,
            sent_tokenize)
from nltk.stem \
    import (WordNetLemmatizer)
from scipy import (stats)
from scipy.cluster import (hierarchy as sch)
from sklearn.cluster \
    import (KMeans,
            AgglomerativeClustering)
from sklearn.datasets import (make_moons)
from sklearn.decomposition import (PCA) 
from sklearn.ensemble \
    import (AdaBoostClassifier,
            BaggingClassifier,
            ExtraTreesClassifier,
            GradientBoostingClassifier,
            RandomForestClassifier,
            VotingClassifier)
from sklearn.feature_extraction.text \
    import (CountVectorizer,
            TfidfVectorizer)
from sklearn.feature_selection \
    import (chi2,
            f_regression,
            f_classif,
            RFE,
            SelectFromModel,
            SelectKBest,
            SelectPercentile)
from sklearn.linear_model \
    import (Lasso,
            LogisticRegression,
            LinearRegression,
            Ridge)
from sklearn.metrics \
    import (accuracy_score,
            auc,
            classification_report,
            confusion_matrix,
            f1_score,
            precision_score,
            precision_recall_fscore_support as error_metric,
            r2_score,
            recall_score,
            roc_auc_score,
            roc_curve)
from sklearn.model_selection \
    import (GridSearchCV,
            RandomizedSearchCV,
            train_test_split)
from sklearn.multiclass import (OneVsRestClassifier)
from sklearn.naive_bayes import (MultinomialNB)
from sklearn.pipeline import (make_pipeline)
from sklearn.preprocessing \
    import (LabelEncoder,
            MinMaxScaler,
            StandardScaler)
from sklearn.svm \
    import (LinearSVC,
            LinearSVC,
            SVC)
from sklearn.tree \
    import (DecisionTreeClassifier,
            DecisionTreeRegressor)
from string import (punctuation)
from xgboost import (XGBClassifier)

# Suppress warnings of every category for the rest of the session
# (no category/module filter is given, so nothing is exempt).
warnings.filterwarnings(action="ignore")

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)



In [2]:
# Fetch the NLTK resources requested by this notebook.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Kept as the cell's final bare expression so its return value is still displayed.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Switch the working directory to the project folder on the mounted drive,
# so that relative paths used later (e.g. PATH_DATA) resolve against it.
%cd '/content/drive/MyDrive/DS_Work/News Clf Proper'

/content/drive/MyDrive/DS_Work/News Clf Proper


In [5]:
# Relative path of the CSV file that is loaded into `data`;
# it is resolved against the current working directory.
PATH_DATA = "Data/newsCorpora_edited.csv"

In [6]:
# The corpus file is tab-delimited even though it carries a .csv extension.
data = pd.read_csv(
    filepath_or_buffer=PATH_DATA,
    sep='\t',
)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [21]:
# Summary of `data`: column names, non-null counts, dtypes and memory usage.
# NOTE(review): the saved output below lists CATEGORY as int64, so this cell
# appears to have been executed after the CATEGORY mapping cell further down;
# on a fresh top-to-bottom run the dtype shown here will be the raw one — confirm.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422419 entries, 0 to 422418
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ID         422419 non-null  int64 
 1   TITLE      422419 non-null  object
 2   URL        422419 non-null  object
 3   PUBLISHER  422417 non-null  object
 4   CATEGORY   422419 non-null  int64 
 5   STORY      422419 non-null  object
 6   HOSTNAME   422419 non-null  object
 7   TIMESTAMP  422419 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 25.8+ MB


In [7]:
# Meaning of the raw CATEGORY codes:
#   b: 'business'
#   t: 'science and tech'
#   e: 'entertainment'
#   m: 'health'
# The targets are mapped to integers for the classifiers.
CATEGORY_TO_ID = {'b': 0, 't': 1, 'e': 2, 'm': 3}

# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (on the already-mapped integers) would wipe
# out all labels. Only map while the column still holds the raw letter codes.
if not pd.api.types.is_numeric_dtype(data['CATEGORY']):
    data['CATEGORY'] = data['CATEGORY'].map(CATEGORY_TO_ID)

In [8]:
# Normalise titles: lower-case them, then drop every character that is not
# a-z or a space (punctuation, symbols and digits are all removed).
non_letter_pattern = re.compile("[^a-z ]")


def clean_title(raw_title):
    """Return the lower-cased title with all characters outside a-z/space removed."""
    lowered = raw_title.lower()
    return non_letter_pattern.sub("", lowered)


data['TITLE'] = data['TITLE'].apply(clean_title)

In [9]:
# Drop English stop words from every title.
# NOTE(review): the titles have already lost their apostrophes in the cleaning
# cell above, so stop-word entries that contain an apostrophe presumably can
# never match here — confirm whether that is intended.
stop = set(stopwords.words('english'))


def drop_stop_words(sentence):
    """Return the sentence rebuilt from its whitespace-separated words that are not in `stop`."""
    kept_words = []
    for word in sentence.split():
        if word not in stop:
            kept_words.append(word)
    return ' '.join(kept_words)


data['TITLE'] = data['TITLE'].apply(drop_stop_words)

In [10]:
# Split off a holdout set: a seeded 70 % random sample of the rows is used for
# model building, every row not drawn into that sample is held out.
data_train = data.sample(frac=0.7, random_state=42).copy()

in_train_sample = data.index.isin(data_train.index)
data_holdout = data.loc[~in_train_sample].copy()

In [12]:
# Timing scaffold: nothing runs between the two clock reads, so the printed
# value is only the overhead of the measurement itself.
start_time = time.time()
elapsed_seconds = time.time() - start_time

print('Time Taken: {0} seconds'.format(elapsed_seconds))

Time Taken: 4.839897155761719e-05 seconds


In [13]:
# Split the modelling sample 80/20 into train and test parts, stratified on
# the category so both parts keep the same class proportions.
modelling_titles = data_train['TITLE']
modelling_labels = data_train['CATEGORY']

X_train, X_test, y_train, y_test = train_test_split(
    modelling_titles,
    modelling_labels,
    test_size=0.2,
    stratify=modelling_labels,
    random_state=42,
)

In [14]:
# Baseline 1: multinomial Naive Bayes on raw token counts.
count_vectorizer = CountVectorizer()

# The vocabulary is learned from the training titles only and then reused
# unchanged for the test titles. (The vectorizer ignores a target argument,
# so none is passed.)
X_train_countvec = count_vectorizer.fit_transform(X_train)
X_test_countvec = count_vectorizer.transform(X_test)

nb_countvec = MultinomialNB()

# Time only the model fit, not the vectorisation.
start_time = time.time()
nb_countvec.fit(X_train_countvec, y_train)
print('Time Taken: {0} seconds'.format(time.time() - start_time))

nb_countvec_pred = nb_countvec.predict(X_test_countvec)

# accuracy_score expects (y_true, y_pred); the order now matches the
# documented signature and the holdout evaluation cells.
acc_countvec_nb = accuracy_score(y_test, nb_countvec_pred)
print(acc_countvec_nb)

Time Taken: 0.08130812644958496 seconds
0.9260048360641877


In [15]:
# Holdout check for the count-vector Naive Bayes model: vectorise the holdout
# titles with the already-fitted vocabulary, predict, and display the accuracy.
pred_req = count_vectorizer.transform(data_holdout['TITLE'])
nb_countvec_holdout_pred = nb_countvec.predict(pred_req)

accuracy_score(data_holdout['CATEGORY'], nb_countvec_holdout_pred)

0.925374429872323

In [16]:
# Baseline 2: multinomial Naive Bayes on TF-IDF weights of uni-, bi- and trigrams.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Vocabulary and IDF weights are learned from the training titles only.
# (The vectorizer ignores a target argument, so none is passed.)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb_tfidf = MultinomialNB()

# Time only the model fit, not the vectorisation.
start_time = time.time()
nb_tfidf.fit(X_train_tfidf, y_train)
print('Time Taken: {0} seconds'.format(time.time() - start_time))

nb_tfidf_pred = nb_tfidf.predict(X_test_tfidf)

# accuracy_score expects (y_true, y_pred); the order now matches the
# documented signature and the holdout evaluation cells.
acc_tfidf_nb = accuracy_score(y_test, nb_tfidf_pred)
print(acc_tfidf_nb)

Time Taken: 0.338397741317749 seconds
0.9238742623311182


In [17]:
# Holdout check for the TF-IDF Naive Bayes model: vectorise the holdout titles
# with the already-fitted TF-IDF vectorizer, predict, and display the accuracy.
pred_req = tfidf_vectorizer.transform(data_holdout['TITLE'])
nb_tfidf_holdout_pred = nb_tfidf.predict(pred_req)

accuracy_score(data_holdout['CATEGORY'], nb_tfidf_holdout_pred)

0.9234411249467355

In [18]:
# Model 3: one-vs-rest logistic regression on the count-vector features.
# NOTE(review): warnings are silenced globally at the top of the notebook, so a
# solver convergence warning (if any) would not be visible here — worth checking.
lr_countvec = OneVsRestClassifier(LogisticRegression(random_state=10))

# Time only the model fit.
start_time = time.time()
lr_countvec.fit(X_train_countvec, y_train)
print('Time Taken: {0} seconds'.format(time.time() - start_time))

lr_countvec_pred = lr_countvec.predict(X_test_countvec)

# accuracy_score expects (y_true, y_pred); the order now matches the
# documented signature and the holdout evaluation cells.
acc_lr_countvec = accuracy_score(y_test, lr_countvec_pred)

print(acc_lr_countvec)

Time Taken: 20.64923357963562 seconds
0.9418995924855003


In [19]:
# Holdout check for the count-vector logistic regression: vectorise the holdout
# titles with the already-fitted vocabulary, predict, and display the accuracy.
pred_req = count_vectorizer.transform(data_holdout['TITLE'])
lr_countvec_holdout_pred = lr_countvec.predict(pred_req)

accuracy_score(data_holdout['CATEGORY'], lr_countvec_holdout_pred)

0.9422770386503164

In [20]:
# Model 4: one-vs-rest logistic regression on the TF-IDF n-gram features.
# NOTE(review): warnings are silenced globally at the top of the notebook, so a
# solver convergence warning (if any) would not be visible here — worth checking.
lr_tfidf = OneVsRestClassifier(LogisticRegression(random_state=10))

# Time only the model fit.
start_time = time.time()
lr_tfidf.fit(X_train_tfidf, y_train)
print('Time Taken: {0} seconds'.format(time.time() - start_time))

lr_tfidf_pred = lr_tfidf.predict(X_test_tfidf)

# accuracy_score expects (y_true, y_pred); the order now matches the
# documented signature and the holdout evaluation cells.
acc_lr_tfidf = accuracy_score(y_test, lr_tfidf_pred)
print(acc_lr_tfidf)

Time Taken: 184.48530793190002 seconds
0.9346962241498842


In [None]:
# Holdout check for the TF-IDF logistic regression: vectorise the holdout titles
# with the already-fitted TF-IDF vectorizer, predict, and display the accuracy.
pred_req = tfidf_vectorizer.transform(data_holdout['TITLE'])
lr_tfidf_holdout_pred = lr_tfidf.predict(pred_req)

accuracy_score(data_holdout['CATEGORY'], lr_tfidf_holdout_pred)

0.9346779666366807

---