In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Signal that the import cell finished without errors.
print("Setup Complete")

Setup Complete


In [2]:
# Load the news CSV, using the file's `id` column as the row index.
data = pd.read_csv('data/news.csv', index_col='id')

# Keep only the two columns the rest of the notebook works with.
news_data = pd.DataFrame(
    data=data,
    columns=['text', 'label'],
)

# Quick look: (rows, columns), then the first few records.
print(news_data.shape)
news_data.head()

(20800, 2)


Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


## Data Preprocessing

In [3]:
# Count the missing values in each column before any cleaning.
news_data.isna().sum()

text     39
label     0
dtype: int64

In [4]:
# Remove every row that contains a missing value. The name is rebound to the
# new frame returned by dropna() rather than mutating the existing frame with
# inplace=True, so no object is modified in place across cells.
news_data = news_data.dropna()

# Confirm that no missing values remain in either column.
news_data.isnull().sum()

text     0
label    0
dtype: int64

In [5]:
# Target vector: the `label` column of the cleaned frame.
labels = news_data['label']
labels.head()

id
0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [6]:
# Hold out 20% of the articles for testing; the fixed seed makes the split
# repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    news_data['text'],
    labels,
    test_size=0.2,
    random_state=1,
)

## TF-IDF Vectorizer

In [7]:
# Build TF-IDF features, dropping English stop words and any term that occurs
# in more than 80% of the training documents (max_df = 0.8).
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.8)

# Fit the vocabulary and IDF weights on the training text only, then apply the
# fitted vectorizer to the test text so held-out articles do not shape the
# features.
train_data = vectorizer.fit_transform(x_train)
test_data = vectorizer.transform(x_test)

## Random Forest Classifier

In [8]:
# Random forest classifier: 500 trees, each capped at depth 20, with a fixed
# seed so the fitted forest is reproducible.
model = RandomForestClassifier(random_state = 1, n_estimators = 500, max_depth = 20)
model.fit(train_data, y_train)

# Predict labels for the held-out TF-IDF features.
y_predicted = model.predict(test_data)

In [9]:
# Score the random forest predictions against the held-out labels.
# The confusion matrix is laid out with label 1 first, then label 0
# (rows: true labels, columns: predicted labels).
acc_score = accuracy_score(y_test, y_predicted)
mat = confusion_matrix(y_test, y_predicted, labels = [1, 0])
f1Score = f1_score(y_test, y_predicted)

# Report accuracy, confusion matrix, and F1 score, in that order.
print(f'Accuracy: {round(acc_score * 100, 2)}%')
print(mat)
print(f'F1 Score: {round(f1Score * 100, 2)}%')

Accuracy: 89.89%
[[1842  196]
 [ 224 1891]]
F1 Score: 89.77%


## Linear SVM

In [10]:
# Linear support vector classifier trained on the same TF-IDF features; the
# seed is fixed for reproducibility.
linear_svc = LinearSVC(random_state = 1)
linear_svc.fit(train_data, y_train)

# Predict labels for the held-out TF-IDF features.
y_prd = linear_svc.predict(test_data)

In [11]:
# Score the linear SVM predictions against the held-out labels.
# The confusion matrix is laid out with label 1 first, then label 0
# (rows: true labels, columns: predicted labels).
acc_score = accuracy_score(y_test, y_prd)
mat = confusion_matrix(y_test, y_prd, labels = [1, 0])
f1Score = f1_score(y_test, y_prd)

# Report accuracy, confusion matrix, and F1 score, in that order.
print(f'Accuracy: {round(acc_score * 100, 2)}%')
print(mat)
print(f'F1 Score: {round(f1Score * 100, 2)}%')

Accuracy: 96.08%
[[1972   66]
 [  97 2018]]
F1 Score: 96.03%


## Passive Aggressive Classifier

In [12]:
# Applying Passive Aggressive Classifier to classify news
from sklearn.linear_model import PassiveAggressiveClassifier
classifier = PassiveAggressiveClassifier(max_iter = 100, random_state = 1)

classifier.fit(train_data, y_train)
y_pred = classifier.predict(test_data)

In [13]:
# Accuracy: share of held-out articles whose predicted label matches the true one.
acc_score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(acc_score * 100, 2)}%')

# Confusion matrix, laid out with label 1 first, then label 0
# (rows: true labels, columns: predicted labels).
mat = confusion_matrix(y_test, y_pred, labels = [1, 0])
print(mat)

# F1 score. f1_score is already imported in the notebook's first cell, so the
# duplicate mid-cell import has been removed.
f1Score = f1_score(y_test, y_pred)

print(f'F1 Score: {round(f1Score * 100, 2)}%')

Accuracy: 96.41%
[[1974   64]
 [  85 2030]]
F1 Score: 96.36%
