# EECS 487 Final Project
## Ideology Detection

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pickle

import pandas as pd
from sklearn.model_selection import train_test_split

from ffnn import *
from naive_bayes import *

## Load the Data

In [2]:
# Load the dataset through the project helper and store the `bias` label
# column as a 16-bit integer in a single chained expression.
df = load_json().astype({'bias': 'int16'})

In [3]:
# Fixed seed so the three-way split (and every metric derived from it) is
# reproducible under Restart Kernel -> Run All. The original calls were
# unseeded, so the outputs recorded further down came from a different random
# split; re-run the notebook to refresh them.
SEED = 487

# Hold out 10% of the full data as `validation`, then 20% of the remainder as
# `test`; what is left is `train`.
train, validation = train_test_split(df, test_size=0.1, random_state=SEED)
train, test = train_test_split(train, test_size=0.2, random_state=SEED)

# Replace the shuffled original index of each split with a fresh 0..n-1 index.
train, validation, test = (
    split.reset_index(drop=True) for split in (train, validation, test)
)

In [4]:
# Summarise the training split: per the recorded output this reports the mean
# and standard deviation of token counts and the article count per category.
get_article_bias_stats(train)

Average number of token: 1097.0009246246025
Standard deviation: 758.9292906502244
Number of articles in each category: {0: 9402, 1: 7784, 2: 9852}


In [5]:
# Same summary for the validation split, to compare its token-length and
# category distribution against the training split.
get_article_bias_stats(validation)

Average number of token: 1084.0034611288604
Standard deviation: 704.5247763784831
Number of articles in each category: {0: 1291, 1: 1080, 2: 1385}


## Naive Bayes

In [6]:
# Fit the project's Naive Bayes model on the training split, then report the
# two fitted attributes (one entry per category) on separate lines.
naive_bayes = NaiveBayes()
naive_bayes.fit(train)
print(
    f"Probability for each category: {naive_bayes.category_prob}",
    f"Total count for each category: {naive_bayes.total_count}",
    sep="\n",
)

Probability for each category: [0.3477328204748872, 0.28789111620682, 0.36437606331829275]
Total count for each category: [17179311, 12701196, 13401076]


In [22]:
# Score the fitted Naive Bayes model on the held-out test split.
# `alpha` is forwarded to predict(); presumably the smoothing constant --
# TODO confirm against naive_bayes.py.
alpha = 0.5
predictions = naive_bayes.predict(test["text"].tolist(), alpha)
labels = test["bias"].tolist()

# evaluate() returns accuracy, macro-F1 and micro-F1, in that order.
accuracy, mac_f1, mic_f1 = evaluate(predictions, labels)
print(
    f"Accuracy: {accuracy}",
    f"Macro f1: {mac_f1}",
    f"Micro f1: {mic_f1}",
    sep="\n",
)

Accuracy: 0.6655325443786982
Macro f1: 0.6627561809953031
Micro f1: 0.6655325443786982


## Feed Forward Neural Network

In [54]:
# Create the feed-forward network wrapper in 'tfidf' mode.
ffnn = FFNN('tfidf')

# Build the TF-IDF feature matrix for the training split (presumably this also
# fits the vectorizer stored on `ffnn` -- TODO confirm in ffnn.py) and show its
# (n_documents, n_features) shape.
trn_tfidf = ffnn.fit_tfidf_feature(train)
print(trn_tfidf.shape)

(27038, 68433)


In [57]:
# Hyperparameter search on the TF-IDF features. Per the recorded output the
# result is a dict with 'layer_sizes', 'learning_rate' and 'alpha', and the
# recorded run took about 1h35m -- expect a long wait when re-running.
tfidf_hyperparameter = ffnn.cross_validation(trn_tfidf, train.bias)
print(tfidf_hyperparameter)

100%|██████████| 3/3 [1:35:40<00:00, 1913.53s/it]

{'layer_sizes': (100, 200), 'learning_rate': 0.01, 'alpha': 0.001}





In [58]:
# Train on the full training split with the hyperparameters selected above.
ffnn.fit(trn_tfidf, train.bias, tfidf_hyperparameter)
# Bare last expression: displays the fitted estimator and its settings.
ffnn.clf

MLPClassifier(alpha=0.001, hidden_layer_sizes=(100, 200),
              learning_rate_init=0.01, tol=0.1)

In [61]:
# Evaluate the TF-IDF network on the raw test split; test_performance()
# returns the accuracy and an F1 score, which are printed one per line.
accuracy, f1 = ffnn.test_performance(test)
for label, value in (("Accuracy:", accuracy), ("f1:", f1)):
    print(label, value)

Accuracy: 0.6880177514792899
f1: 0.6850134393221209


In [62]:
# NOTE(review): this rebinds `ffnn`, so the TF-IDF model from the earlier cells
# is no longer reachable; re-running the TF-IDF evaluation cell after this
# point would score the word2vec model instead.
ffnn = FFNN('word2vec')

# Build the word2vec feature matrix for the training split. The recorded output
# shows a ~1.6 GB download whose progress messages trip the IOPub rate limit.
trn_word2vec = ffnn.get_word2vec_feature(train)
# Peek at the first 10 components of the first document's vector.
print(trn_word2vec[0, :10])
# Bare last expression: displays the (n_documents, n_dimensions) shape.
trn_word2vec.shape

[====----------------------------------------------] 9.4% 156.3/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[ 0.03956318  0.02092802  0.02915509  0.08028916 -0.05218758 -0.06083317
  0.00092557 -0.07721347  0.08180851  0.05255482]


(27038, 300)

In [64]:
# Hyperparameter search on the word2vec features; the result is a dict with
# the same keys as the TF-IDF search (see the recorded output).
word2vec_hyperparameter = ffnn.cross_validation(trn_word2vec, train.bias)
print(word2vec_hyperparameter)

100%|██████████| 3/3 [00:44<00:00, 14.87s/it]

{'layer_sizes': (100, 150), 'learning_rate': 0.001, 'alpha': 0.01}





In [66]:
# Train the word2vec network with the selected hyperparameters and show the
# fitted estimator before scoring it.
ffnn.fit(trn_word2vec, train.bias, word2vec_hyperparameter)
print(ffnn.clf)

# Score on the raw test split and print accuracy and F1, one per line.
accuracy, f1 = ffnn.test_performance(test)
for label, value in (("Accuracy:", accuracy), ("f1:", f1)):
    print(label, value)

MLPClassifier(alpha=0.01, hidden_layer_sizes=(100, 150), tol=0.1)
Accuracy: 0.5701183431952662
f1: 0.5702056626468497
