In [27]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
fake_news = pd.read_csv("data/news.csv")
fake_news = fake_news.loc[:, ~fake_news.columns.str.contains("^Unnamed")]

In [12]:
# shows that the classes are balanced
fake_news["label"].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [13]:
X = fake_news[["title", "text"]]
y = fake_news["label"]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X["text"], y, test_size=0.2, random_state=1)

In [24]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)

4541    October 31, 2016 - Fort Russ -  Aleksandr Khro...
2490    October 28, 2016 112 While the Western press c...
4928    While investigators try to piece together what...
5627    Hillary Clinton appears to have scared away mu...
757     Embarrassing and infuriating: the NBC anchor’s...
Name: text, dtype: object

### Transform the X_test with the fitted Tf_Idf weights

In [26]:
X_test_vectorized = tfidf_vectorizer.transform(X_test)

### Train a Passive Aggressive Classifier

In [45]:
pac = PassiveAggressiveClassifier(max_iter=50, tol=1e-3)
pac.fit(X_train_vectorized, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=50, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
y_pred = pac.predict(X_test_vectorized)
score = accuracy_score(y_test, y_pred)
print("Accuracy:", round(score*100, 2))

Accuracy: 94.63
