In [1]:
import numpy as np
import pandas as pd
import csv

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder stored on Drive can be copied in by the following cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Copy the project directory from the mounted Drive into the Colab workspace.
# NOTE(review): the source path is relative, so this assumes the working
# directory is /content. If /content/smart-news-website already exists, cp -r
# is expected to nest the copy inside it rather than overwrite it — confirm
# before re-running this cell.
!cp -r drive/MyDrive/smart-news-website/ /content/smart-news-website

In [4]:
# Switch the working directory to the copied project; later cells use relative
# paths and import the project-local `data` module.
%cd smart-news-website/

/content/smart-news-website


In [None]:
# Fetch the NLTK stopword corpus. Nothing visible in this notebook reads it
# directly; presumably the project's PreProcessor (imported from `data` in a
# later cell) depends on it — confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the raw fake-news and true-news tables from the copied project folder.
fake_news, true_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/smart-news-website/FakeNews/Fake.csv',
        '/content/smart-news-website/FakeNews/True.csv',
    )
)

In [None]:
# Keep only the `text` column of each table (each name now refers to a Series).
fake_news, true_news = fake_news['text'], true_news['text']

# Display the fake-news texts as a quick look at the data.
fake_news

0        Donald Trump just couldn t wish all Americans ...
1        House Intelligence Committee Chairman Devin Nu...
2        On Friday, it was revealed that former Milwauk...
3        On Christmas day, Donald Trump announced that ...
4        Pope Francis used his annual Christmas Day mes...
                               ...                        
23476    21st Century Wire says As 21WIRE reported earl...
23477    21st Century Wire says It s a familiar theme. ...
23478    Patrick Henningsen  21st Century WireRemember ...
23479    21st Century Wire says Al Jazeera America will...
23480    21st Century Wire says As 21WIRE predicted in ...
Name: text, Length: 23481, dtype: object

In [None]:
# Run every article through the project's text preprocessor.
# NOTE(review): judging by the output shown below, the preprocessor lowercases
# the text, drops stopwords and lemmatizes (pronouns show up as "-PRON-"). The
# implementation lives in the project-local `data` module — confirm details there.
from data import PreProcessor

preproc = PreProcessor()

# Series.apply calls preproc.forward once per article.
fake_news = fake_news.apply(preproc.forward)
true_news = true_news.apply(preproc.forward)

fake_news.head()

0    donald trump wish americans happy new year lea...
1    house intelligence committee chairman devin nu...
2    friday -PRON- reveal former milwaukee sheriff ...
3    christmas day donald trump announce -PRON- wou...
4    pope francis use -PRON- annual christmas day m...
Name: text, dtype: object

In [None]:
# Wrap each preprocessed text Series in a DataFrame and attach its class label:
# is_fake = 1 for fake articles, 0 for true ones.
# A scalar passed to assign() is broadcast to every row, so there is no need to
# build a Python list as long as the frame. pd.DataFrame(...) is kept (rather
# than Series.to_frame()) so the cell still works if it is re-run after the
# names already refer to DataFrames.
fake_news = pd.DataFrame(fake_news).assign(is_fake=1)
true_news = pd.DataFrame(true_news).assign(is_fake=0)

In [None]:
# Sanity check: the frame should now hold the preprocessed text plus the is_fake label.
fake_news.head()

Unnamed: 0,text,is_fake
0,donald trump wish americans happy new year lea...,1
1,house intelligence committee chairman devin nu...,1
2,friday -PRON- reveal former milwaukee sheriff ...,1
3,christmas day donald trump announce -PRON- wou...,1
4,pope francis use -PRON- annual christmas day m...,1


In [None]:
# Checkpoint the preprocessed frames so preprocessing does not have to be
# repeated. The paths are relative to the current working directory.
# index=False keeps the row index out of the files; otherwise it is written as
# an unnamed first column and comes back as a spurious "Unnamed: 0" column when
# the CSVs are read again with pd.read_csv.
fake_news.to_csv('Fake_news_preprocessed.csv', index=False)
true_news.to_csv('True_news_preprocessed.csv', index=False)

## Checkpoint: preprocessing done

In [5]:
# Reload the preprocessed checkpoints from disk.
# NOTE(review): pandas is imported again here, presumably so this cell can act
# as a restart point without re-running the preprocessing cells — confirm.
import pandas as pd
fake_news = pd.read_csv('/content/smart-news-website/Fake_news_preprocessed.csv')
true_news = pd.read_csv('/content/smart-news-website/True_news_preprocessed.csv')

In [6]:
# Discard every row that contains a missing value.
# NOTE(review): presumably this targets articles whose text became empty during
# preprocessing and was read back from the CSV as NaN — confirm.
fake_news, true_news = fake_news.dropna(), true_news.dropna()

In [7]:
from sklearn.model_selection import train_test_split

# Each class is split on its own: 20% is held out first, then the hold-out is
# halved, giving roughly 80% train / 10% test / 10% validation per class.
# The same random_state is used for every call so the splits are repeatable.

# --- fake articles ---
train_fake_X, t_fake_X, train_fake_y, t_fake_y = train_test_split(
    fake_news.drop(columns='is_fake'),
    fake_news['is_fake'],
    test_size=0.2,
    random_state=101,
)
test_fake_X, val_fake_X, test_fake_y, val_fake_y = train_test_split(
    t_fake_X,
    t_fake_y,
    test_size=0.5,
    random_state=101,
)

# --- true articles ---
train_true_X, t_true_X, train_true_y, t_true_y = train_test_split(
    true_news.drop(columns='is_fake'),
    true_news['is_fake'],
    test_size=0.2,
    random_state=101,
)
test_true_X, val_true_X, test_true_y, val_true_y = train_test_split(
    t_true_X,
    t_true_y,
    test_size=0.5,
    random_state=101,
)

In [8]:
# Merge the per-class splits into combined train / validation / test sets.
# The rows are stacked, not shuffled: all fake rows come first, then all true
# rows, and each part keeps the index labels it had before the merge.

# Features
train_X = pd.concat([train_fake_X, train_true_X])
val_X = pd.concat([val_fake_X, val_true_X])
test_X = pd.concat([test_fake_X, test_true_X])

# Labels, stacked in the same order and stored as single-column DataFrames
train_y = pd.concat([train_fake_y, train_true_y]).to_frame()
val_y = pd.concat([val_fake_y, val_true_y]).to_frame()
test_y = pd.concat([test_fake_y, test_true_y]).to_frame()

In [9]:
# Reduce each feature frame to its `text` column, leaving a Series of documents.
train_X, val_X, test_X = (
    split_frame['text'] for split_frame in (train_X, val_X, test_X)
)

In [10]:
# Number of training documents after merging both classes.
train_X.shape

(35412,)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 50,000 features.
# NOTE(review): lowercase=False presumably relies on the text having been
# lowercased during preprocessing — confirm.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)

# The vocabulary and IDF weights are learned from the training split only;
# validation and test are transformed with that fitted vectorizer.
train_X_vec = vectorizer.fit_transform(train_X)
val_X_vec, test_X_vec = (
    vectorizer.transform(documents) for documents in (val_X, test_X)
)

In [12]:
#import joblib

#joblib.dump(vectorizer, 'tfidf_for_fakenews.pkl')

In [23]:
# Peek at the dense form of the first 2048 TF-IDF rows.
# NOTE(review): .toarray() materializes a dense 2048 x n_features array (up to
# 50,000 columns given max_features above), which is memory-heavy for a quick look.
train_X_vec[:2048].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# Batch counter start value and batch size (presumably for mini-batch training).
batch_idx, batch_size = 0, 1024


In [13]:
# Print the current working directory.
!pwd

/content/smart-news-website


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Feed-forward classifier: three ReLU hidden layers and a linear output layer.

    Args:
        input_features: Length of each input vector (default 50000).
        hidden1: Width of the first hidden layer (default 512).
        hidden2: Width of the second hidden layer (default 128).
        hidden3: Width of the third hidden layer (default 32).
        output_features: Number of output scores per sample (default 2).
    """

    def __init__(self, input_features=50000, hidden1=512, hidden2=128, hidden3=32, output_features=2):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay fixed.
        self.fc1 = nn.Linear(input_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, hidden3)
        self.out = nn.Linear(hidden3, output_features)

    def forward(self, x):
        """Return the raw output scores for ``x``; no softmax is applied here."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        return self.out(x)

In [18]:
# Instantiate the classifier with its default layer sizes.
# NOTE(review): the training cell further down refers to `model`, while this
# cell binds `fake_model`; nothing visible defines `model` — confirm which name
# is intended.
fake_model = Model()
# `parameters` is referenced without being called, so the cell shows the bound
# method's repr (which embeds the module summary) rather than the weights.
fake_model.parameters

<bound method Module.parameters of Model(
  (fc1): Linear(in_features=50000, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=2, bias=True)
)>

In [None]:
epochs = 50
learning_rate = 1e-3
batch_idx = 0
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
batch_size = 1024

final_losses = []

for i in range(epochs):

    batch_X = 

    while True:
        
        y_pred = model.forward(batch_X)