In [1]:
# !nvidia-smi

### **Mount Drive**

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### **Classification Metrics**
The **Matthews correlation coefficient (MCC)** is a reliable statistical rate that produces a high score only if the prediction obtained good results in all four confusion matrix categories (true positives, false negatives, true negatives, and false positives), proportionally both to the size of positive elements and the size of negative elements in the dataset.

In [3]:
!pip install joblib


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import joblib


In [5]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


About the Dataset:

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: a label that marks whether the news article is real or fake:
           1: Fake news
           0: Real news





In [6]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/drive/MyDrive/Syntax Error 2023/train.csv')

In [10]:
news_dataset.shape

(20800, 5)

In [11]:
# print the first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [12]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [13]:
# Discard the two columns the model does not use, mutating the frame in place.
news_dataset.drop(columns=['id', 'author'], inplace=True)

In [14]:
news_dataset

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1


In [15]:
X = news_dataset['text']
Y = news_dataset['label'].values

In [16]:
X

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October 29, ...
3        Videos 15 Civilians Killed In Single US Airstr...
4        Print \nAn Iranian woman has been sentenced to...
                               ...                        
20795    Rapper T. I. unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy’s of today grew from the union of sev...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author, activist, journa...
Name: text, Length: 20800, dtype: object

Stemming:

Stemming is the process of reducing a word to its root form. (Note: the cells below reduce words with the WordNet lemmatizer rather than a stemmer.)

example:
actor, actress, acting --> act

In [17]:

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize




In [18]:
# Fetch the NLTK resources that the preprocessing cells in this notebook rely on.
nltk.download('stopwords')  # stop-word lists read through stopwords.words('english')
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')  # presumably needed alongside wordnet by this NLTK version -- TODO confirm
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.tag.pos_tag

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [19]:
#Lemmitization process

# Step 1 - tokenizing: split every article in 'text' into a list of tokens.
news_dataset['tokenized'] = news_dataset['text'].apply(word_tokenize)
# Step 2 - tagging: turn each token list into a list of (token, POS tag) pairs.
news_dataset['pos_tags'] = news_dataset['tokenized'].apply(nltk.tag.pos_tag)
#3 is simplifying the tagging
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected: 'J' selects adjective,
    'V' verb and 'R' adverb. Every other tag (including noun tags and an
    empty string) falls back to noun.
    """
    initial = tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # Noun tags and anything unrecognised share the same default.
    return wordnet.NOUN
# Re-label every (token, POS tag) pair with the WordNet POS constant.
news_dataset['wordnet_pos'] = news_dataset['pos_tags'].apply(
    lambda tagged: [(token, get_wordnet_pos(treebank_tag)) for (token, treebank_tag) in tagged]
)
#4 calling lemmatizer
wnl = WordNetLemmatizer()
news_dataset['lemmatized'] = news_dataset['wordnet_pos'].apply(
    lambda pairs: [wnl.lemmatize(token, pos) for token, pos in pairs]
)
#5 joining the lemmatized tokens back into a single space-separated string
news_dataset['Article'] = news_dataset['lemmatized'].apply(
    lambda lemmas: ' '.join(map(str, lemmas))
)
news_dataset.head()

Unnamed: 0,title,text,label,tokenized,pos_tags,wordnet_pos,lemmatized,Article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[House, Dem, Aide, :, We, Didn, ’, t, Even, Se...","[(House, NNP), (Dem, NNP), (Aide, NNP), (:, :)...","[(House, n), (Dem, n), (Aide, n), (:, n), (We,...","[House, Dem, Aide, :, We, Didn, ’, t, Even, Se...",House Dem Aide : We Didn ’ t Even See Comey ’ ...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,"[Ever, get, the, feeling, your, life, circles,...","[(Ever, RB), (get, VB), (the, DT), (feeling, N...","[(Ever, r), (get, v), (the, n), (feeling, n), ...","[Ever, get, the, feeling, your, life, circle, ...",Ever get the feeling your life circle the roun...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,"[Why, the, Truth, Might, Get, You, Fired, Octo...","[(Why, WRB), (the, DT), (Truth, NN), (Might, N...","[(Why, n), (the, n), (Truth, n), (Might, n), (...","[Why, the, Truth, Might, Get, You, Fired, Octo...","Why the Truth Might Get You Fired October 29 ,..."
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,"[Videos, 15, Civilians, Killed, In, Single, US...","[(Videos, NNP), (15, CD), (Civilians, NNPS), (...","[(Videos, n), (15, n), (Civilians, n), (Killed...","[Videos, 15, Civilians, Killed, In, Single, US...",Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,"[Print, An, Iranian, woman, has, been, sentenc...","[(Print, NNP), (An, DT), (Iranian, JJ), (woman...","[(Print, n), (An, n), (Iranian, a), (woman, n)...","[Print, An, Iranian, woman, have, be, sentence...",Print An Iranian woman have be sentence to six...


In [20]:
# The intermediate lemmatization columns are not needed once 'Article' exists.
intermediate_columns = ['pos_tags', 'wordnet_pos', 'lemmatized', 'tokenized']
news_dataset.drop(columns=intermediate_columns, inplace=True)
news_dataset.head()

Unnamed: 0,title,text,label,Article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide : We Didn ’ t Even See Comey ’ ...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circle the roun...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,"Why the Truth Might Get You Fired October 29 ,..."
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,Print An Iranian woman have be sentence to six...


In [21]:
import string
def review_cleaning(text):
    """Normalise one article string before stop-word removal and vectorisation.

    The steps run in this order:
      1. convert the value to ``str`` and lowercase it;
      2. drop anything enclosed in square brackets;
      3. drop ``http(s)://`` and ``www.`` links;
      4. drop ``<...>`` markup;
      5. drop every character in ``string.punctuation`` (ASCII punctuation
         only, so typographic quotes and dashes are left in place);
      6. turn newlines into spaces so words on adjacent lines stay separate;
      7. drop every word that contains a digit.

    Whitespace left behind by the removals is not collapsed here.

    Args:
        text: The value to clean; it is passed through ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) away from Python's
    # own string-escape processing, which warns about unknown escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # This pass already removes apostrophes, so no separate step is needed.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words; deleting it outright would fuse them.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run the cleaning function over every lemmatized article.
news_dataset['Article'] = news_dataset['Article'].apply(review_cleaning)
news_dataset.head()

Unnamed: 0,title,text,label,Article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide we didn ’ t even see comey ’ s...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,ever get the feeling your life circle the roun...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired october t...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,videos civilians killed in single us airstrik...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,print an iranian woman have be sentence to six...


In [22]:
news_dataset['Article'][0]

'house dem aide  we didn ’ t even see comey ’ s letter until jason chaffetz tweeted it by darrell lucus on october    subscribe jason chaffetz on the stump in american fork  utah  image courtesy michael jolley  available under a creative commonsby license  with apology to keith olbermann  there be no doubt who the worst person in the world be this week–fbi director james comey  but accord to a house democratic aide  it look like we also know who the secondworst person be as well  it turn out that when comey send his nowinfamous letter announce that the fbi be look into email that may be relate to hillary clinton ’ s email server  the ranking democrats on the relevant committee didn ’ t hear about it from comey  they find out via a tweet from one of the republican committee chairman  as we now know  comey notify the republican chairman and democratic rank member of the house intelligence  judiciary  and oversight committee that his agency be review email it have recently discover in ord

In [23]:
#Removing English stopwords
stop_words = stopwords.words('english')
# Membership is tested against a set: looking each token up in the list would
# rescan the whole stop-word list for every word of every article.
stop_word_set = set(stop_words)

news_dataset['Article'] = news_dataset['Article'].apply(
    lambda article: ' '.join(word for word in article.split() if word not in stop_word_set)
)
news_dataset.head()

Unnamed: 0,title,text,label,Article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide ’ even see comey ’ letter jason...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,ever get feeling life circle roundabout rather...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october tension intellig...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,videos civilians killed single us airstrike id...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,print iranian woman sentence six year prison i...


In [24]:
# Squeeze every run of whitespace in 'Article' down to a single space.
news_dataset['Article'] = news_dataset['Article'].str.replace(r'\s+', ' ', regex=True)
news_dataset.head()

Unnamed: 0,title,text,label,Article
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide ’ even see comey ’ letter jason...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,ever get feeling life circle roundabout rather...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october tension intellig...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,videos civilians killed single us airstrike id...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,print iranian woman sentence six year prison i...


In [25]:
X = news_dataset['Article'].values
Y = news_dataset['label'].values

In [26]:
print(X)

['house dem aide ’ even see comey ’ letter jason chaffetz tweeted darrell lucus october subscribe jason chaffetz stump american fork utah image courtesy michael jolley available creative commonsby license apology keith olbermann doubt worst person world week–fbi director james comey accord house democratic aide look like also know secondworst person well turn comey send nowinfamous letter announce fbi look email may relate hillary clinton ’ email server ranking democrats relevant committee ’ hear comey find via tweet one republican committee chairman know comey notify republican chairman democratic rank member house intelligence judiciary oversight committee agency review email recently discover order see contain classified information long letter go oversight committee chairman jason chaffetz set political world ablaze tweet fbi dir inform fbi learn existence email appear pertinent investigation case reopen — jason chaffetz jasoninthehouse october course know case comey actually say r

In [27]:
print(Y)

[1 0 1 ... 0 1 1]


In [28]:
Y.shape

(20800,)

In [29]:
# converting the textual data to numerical data
# NOTE(review): the vocabulary and IDF weights are learned from every article,
# including the ones train_test_split later holds out, so any held-out score is
# slightly optimistic. X is also rebound from raw text to the TF-IDF matrix
# here, so this cell cannot be re-run without first rebuilding X.
vectorizer = TfidfVectorizer(max_features=1500)
X = vectorizer.fit_transform(X)

In [30]:
print(X)

  (0, 1491)	0.017688251468466577
  (0, 1486)	0.08029569671810706
  (0, 1484)	0.06869702021866626
  (0, 1482)	0.020775183033757076
  (0, 1463)	0.022358081761708996
  (0, 1461)	0.05295823158839572
  (0, 1459)	0.023433044861520216
  (0, 1454)	0.021454408481328693
  (0, 1441)	0.043771561434803825
  (0, 1439)	0.028954845379811692
  (0, 1425)	0.039196567310925794
  (0, 1411)	0.03162246917829162
  (0, 1403)	0.02032667815880359
  (0, 1402)	0.08011145526983521
  (0, 1401)	0.16664393265207916
  (0, 1399)	0.13103744449583407
  (0, 1365)	0.03601492414102577
  (0, 1352)	0.02173499118735097
  (0, 1351)	0.024390808601564885
  (0, 1341)	0.03260900123573793
  (0, 1338)	0.04264633100201892
  (0, 1329)	0.02763389958161895
  (0, 1322)	0.04364387866965886
  (0, 1314)	0.024838575162532156
  (0, 1309)	0.09300918386204753
  :	:
  (20799, 190)	0.03684496726209292
  (20799, 187)	0.02729955583559614
  (20799, 170)	0.0406064363318531
  (20799, 165)	0.0932257620775265
  (20799, 159)	0.03972533935689815
  (20799, 1

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [32]:
model = LogisticRegression()

In [33]:
model.fit(X_train, Y_train)

LogisticRegression()

In [34]:
# accuracy score on the training data
# accuracy_score is documented as (y_true, y_pred), so the labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

# accuracy score on the held-out test data: the training score alone cannot
# show how the model behaves on articles it was not fitted on.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

In [35]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9524639423076923


In [36]:
X_test

<4160x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 592696 stored elements in Compressed Sparse Row format>

In [37]:
# Classify one held-out row (index 3 of the test matrix) and report the verdict.
X_new = X_test[3]
prediction = model.predict(X_new)
print(prediction)

# Label 0 marks real news; any other predicted label is reported as fake.
verdict = 'The news is Real' if prediction[0] == 0 else 'The news is Fake'
print(verdict)

[1]
The news is Fake


In [38]:
# Fit a fresh classifier on the complete TF-IDF matrix and all labels (no
# hold-out); this is the model object that is written to disk afterwards.
final_model = LogisticRegression()
final_model.fit(X,Y)

LogisticRegression()

In [41]:
joblib.dump(final_model, 'finalized_model.sav')

['finalized_model.sav']

In [42]:
joblib.dump(vectorizer, 'Tfidf_vectorizer.sav')

['Tfidf_vectorizer.sav']

In [None]:
# pickle.dump(final_model, open('/content/drive/MyDrive/Colab Notebooks/fake_news_fulltrained_logistic_2','wb'))

In [None]:
# pickle.load(open('/content/drive/MyDrive/Colab Notebooks/fake_news_trained', 'rb'))

<simpletransformers.classification.classification_model.ClassificationModel at 0x7f5ab7117430>

In [None]:
# pickle.dump(vectorizer, open("tfidf.pickle", "wb"))