**Importing the Necessary Libraries**

In [5]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


**Reading Dataset as Dataframe**

In [6]:
# Display settings: show the full content of every cell and all columns
# whenever a frame is printed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Load the news data and encode the category labels as integers
# (Sport -> 1, Sci/Tech -> 0).
df = pd.read_csv("News.csv").assign(
    Category=lambda frame: frame['Category'].map({'Sport': 1, 'Sci/Tech': 0})
)

**Cleaning News Summaries**

In [7]:
# Built once and reused by cleaner(): the stop-word set and the lemmatizer do
# not depend on the summary being cleaned, so there is no reason to recreate
# them for every row.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


# Creating function to clean summaries
def cleaner(summary):
    """Turn a raw news summary into a list of lower-case, lemmatized tokens.

    Steps, in order: strip HTML with BeautifulSoup, blank out hashtags,
    @mentions and URLs, replace every run of non-alphabetic characters with a
    space, tokenize, lower-case, drop English stop words, lemmatize.

    Parameters
    ----------
    summary : str
        Raw summary text; may contain HTML markup or entities.

    Returns
    -------
    list of str
        The cleaned tokens. The list is empty when nothing survives filtering.
    """
    # 'lxml' is the HTML parser backend and must be installed ('pip install lxml').
    # get_text() leaves plain text, resolving entities such as &amp;, &quot;, &gt;.
    souped = BeautifulSoup(summary, 'lxml').get_text()

    # substituting hashtags, @mentions, urls, etc with whitespace
    without_refs = re.sub(r"(#|@|http://|https://|www)\S*", " ", souped)
    # substituting any run of one or more non-alphabetic characters with whitespace
    letters_only = re.sub("[^A-Za-z]+", " ", without_refs)

    # Lower-casing happens before the stop-word check because the stop-word
    # list is lower-case.
    lower_case = [t.lower() for t in nltk.word_tokenize(letters_only)]

    return [_LEMMATIZER.lemmatize(t) for t in lower_case if t not in _STOP_WORDS]

# Applying cleaner function to summaries
df['cleaned_summary'] = df.Summary.apply(cleaner)

# Removing rows with cleaned summaries of length 0.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below no longer raises pandas'
# SettingWithCopyWarning.
df = df[df['cleaned_summary'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['Summary','cleaned_summary']].head())

# Joining tokens to create strings. TfidfVectorizer does not accept tokens as input
df['cleaned_summary'] = df['cleaned_summary'].map(" ".join)
data = df['cleaned_summary']
Y = df['Category'] # target column

# A float min_df is a proportion of the documents passed to fit: each ngram
# (unigram, bigram, & trigram) must appear in at least 0.0005 * len(data)
# documents to be kept as a feature (e.g. 30 documents for 60,000 rows). The
# cutoff is computed on the rows that remain after the filtering above.
# NOTE(review): the vocabulary/IDF weights are learned from all rows, including
# rows that later serve as cross-validation test folds; for a strictly
# leak-free estimate the vectorizer would have to be fitted inside each fold.
tfidf = TfidfVectorizer(min_df=.0005, ngram_range=(1,3))

# learn vocabulary of entire data and create the tfidf values in one pass
data_tfidf = tfidf.fit_transform(data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

print("The created tokens: \n", feature_names)
print("Shape of tfidf matrix: ", data_tfidf.shape)


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                                                                                                                                                                                                                                                  Summary  \
0  Reuters - A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation.   
1                                                                                     Reuters - America Online on Thursday said it\plans to sell a low-priced PC targeting low-income and minority\households who agree to sign up for a year of dialup Internet\service.   
2                     Reuters - A group of consumer electronics\makers said on Wednesday they approved the format for

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The created tokens: 
Shape of tfidf matrix:  (47006, 7184)




**Implementing Support Vector Classifier**

In [8]:
print("Implementing SVC.....")
# Linear support vector classifier with the default C = 1. LinearSVC's solver
# uses a pseudo-random number generator, so random_state is fixed to make the
# fold accuracies reproducible from run to run.
svc_clf = LinearSVC(random_state=1)

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    # data_tfidf is indexed by row position, hence .iloc for the matching labels
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC (each fit starts from scratch)
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)


Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.9868113167411189
Iteration  2
Cross-validation accuracy:  0.9859604339502234
Iteration  3
Cross-validation accuracy:  0.9863858753456711
Iteration  4
Cross-validation accuracy:  0.9874494788342906
Iteration  5
Cross-validation accuracy:  0.9855349925547756
Iteration  6
Cross-validation accuracy:  0.9868113167411189
Iteration  7
Cross-validation accuracy:  0.9825531914893617
Iteration  8
Cross-validation accuracy:  0.9859574468085106
Iteration  9
Cross-validation accuracy:  0.9874468085106383
Iteration  10
Cross-validation accuracy:  0.9863829787234043
Mean cross-validation accuracy:  0.9861293839699113


**Implementing Naive Bayes Classifier**

In [9]:
print("Implementing NBC.....")
nbc_clf = MultinomialNB()

# Stratified 10-fold cross-validation: shuffle with a fixed seed, then score
# the classifier on each held-out fold.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)

    # Features are sliced by row position; labels use .iloc to stay aligned.
    X_train, X_test = data_tfidf[train_index], data_tfidf[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit on the training folds, then predict the held-out fold.
    Y_pred = nbc_clf.fit(X_train, Y_train).predict(X_test)

    score = metrics.accuracy_score(Y_test, Y_pred)
    print("Cross-validation accuracy: ", score)
    scores.append(score)  # one accuracy value per fold

nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)


Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.9857477132524994
Iteration  2
Cross-validation accuracy:  0.9821314613911933
Iteration  3
Cross-validation accuracy:  0.9802169751116784
Iteration  4
Cross-validation accuracy:  0.9834077855775367
Iteration  5
Cross-validation accuracy:  0.9831950648798128
Iteration  6
Cross-validation accuracy:  0.9825569027866411
Iteration  7
Cross-validation accuracy:  0.9814893617021276
Iteration  8
Cross-validation accuracy:  0.9829787234042553
Iteration  9
Cross-validation accuracy:  0.985531914893617
Iteration  10
Cross-validation accuracy:  0.9842553191489362
Mean cross-validation accuracy:  0.9831511222148297
