<a href="https://colab.research.google.com/github/andresvc21/TFIDFVectorizer/blob/main/Fake_News_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Detector

Data:

id: unique id for a news article

title: the title of a news article

author: author of the news article

text: the text of the article; could be incomplete

label: a label that marks the article as potentially unreliable
        1: unreliable
        0: reliable

## Preview

In [2]:
#Import Packages

import numpy as np
import pandas as pd
import itertools
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Raise pandas' display limit to 500 columns so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 500)

In [3]:
# Load the data
# Both CSV files are expected in the current working directory.
train = pd.read_csv('train.csv')

# The test frame is keyed by its 'id' column; the column is moved into the
# index rather than kept as a regular column.
test = pd.read_csv('test.csv').set_index('id', drop=True)

## Exploratory Analysis and Preparation

In [4]:
# Separator line printed between the sections of this overview.
divider = '---------------------------------------------------------'

# Size of the training data as (rows, columns).
print(divider)
print('Shape of Training Data: ', train.shape)
print(divider)

# First rows of each frame, to see the column names and typical values.
print(divider)
print('\n \n TRAIN \n', train.head())
print(divider)
print('\n \n TEST \n', test.head())
print(divider)

# Count of missing values in the 'text' column of each frame.
train_text_nulls = train['text'].isna().sum()
test_text_nulls = test['text'].isna().sum()
print(divider)
print('\n \nNumber of Null values in Train Set: ', train_text_nulls)
print(divider)
print('Number of Null values in Test Set: ', test_text_nulls)
print(divider)



---------------------------------------------------------
Shape of Training Data:  (20800, 5)
---------------------------------------------------------
---------------------------------------------------------

 
 TRAIN 
    id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr

In [5]:
# Remove every training row whose 'text' value is missing.
# Only `subset` is passed: newer pandas releases raise a TypeError when `how`
# and `thresh` are both supplied. The result is reassigned instead of using
# inplace=True so the cell stays safe to re-run.
train = train.dropna(subset=['text'])

# In the test set, replace every missing value (in any column) with a single
# space so that no row has to be dropped.
test = test.fillna(' ')

In [6]:
# Length of each article in characters. Each value is passed through str()
# first, so a non-string entry is measured by its string form.
length = [len(str(text)) for text in train['text']]
train['length'] = length

# Summary of the character counts; the average is rounded to a whole number.
print('Minimum Length: ', train['length'].min(), '\nMaximum Length: ', train['length'].max(), '\nAverage Length: ', round(train['length'].mean()))

Minimum Length:  1 
Maximum Length:  142961 
Average Length:  4553


In [7]:
# A minimum length of 1 looks like an outlier, so inspect the shortest articles.
# NOTE: 'length' is a character count (len(str(text))), so the threshold below
# is 50 characters, not 50 words.
short_mask = train['length'] < 50
print('Number of articles with less than 50 characters: ', len(train[short_mask]))

# Skim the short texts to confirm they carry no real content.
print(train.loc[short_mask, 'text'])

Number of articles with less than 50 words:  207
82                                                   
169                                                  
173                                   Guest   Guest  
196            They got the heater turned up on high.
295                                                  
                             ...                     
20350                         I hope nobody got hurt!
20418                                 Guest   Guest  
20431    \nOctober 28, 2016 The Mothers by stclair by
20513                                                
20636                              Trump all the way!
Name: text, Length: 207, dtype: object


In [8]:
# The 207 very short entries are noise or empty statements, so remove them
# from the training data by their index labels.
too_short = train['length'] < 50
train = train.drop(index=train.loc[too_short].index)

# Recompute the length summary on the filtered data.
shortest = min(train['length'])
longest = max(train['length'])
average = round(sum(train['length']) / len(train['length']))
print('Minimum Length: ', shortest, '\nMaximum Length: ', longest, '\nAverage Length: ', average)

Minimum Length:  50 
Maximum Length:  142961 
Average Length:  4598


In [9]:
# Keep the target column ('label') in its own Series for supervised learning.
train_labels = train['label']

# Hold out 20% of the articles for evaluation; the fixed random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train['text'],
    train_labels,
    test_size=0.2,
    random_state=420,
)

## TfidfVectorizer

In [10]:
# Term Frequency - Inverse Document Frequency vectorizer: English stop words
# are removed and terms found in more than 70% of the documents are ignored.
tfidf = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training split only, then
# reuse the fitted vectorizer for the validation split and the test file.
tfidf_train = tfidf.fit_transform(x_train)
tfidf_test, tfidf_test_final = (
    tfidf.transform(documents) for documents in (x_test, test['text'])
)



['00',
 '000',
 '0000',
 '000035',
 '0001',
 '000202',
 '000205',
 '0002062',
 '000billion',
 '000c',
 '000emails',
 '000ft',
 '000k',
 '000km',
 '000lb',
 '000m',
 '000s',
 '000th',
 '000â',
 '001',
 '0011',
 '0012',
 '0013',
 '002',
 '0020143',
 '0023z',
 '0024',
 '003',
 '004',
 '004s',
 '005',
 '0054z',
 '005s',
 '006',
 '00684',
 '006s',
 '007',
 '007s',
 '008',
 '008s',
 '009',
 '0099',
 '00am',
 '00o',
 '00p',
 '00pm',
 '00μg',
 '01',
 '010',
 '0101',
 '0102',
 '0107y',
 '011s',
 '012',
 '0128',
 '013',
 '0134',
 '0135y',
 '013c2812c9',
 '015',
 '016e5d9ff252f2444790d05269f4ed90',
 '016s',
 '017',
 '018',
 '019',
 '01915',
 '01am',
 '01pm',
 '01s',
 '02',
 '020',
 '0200gmt',
 '021',
 '023',
 '0235',
 '024',
 '025',
 '026',
 '027',
 '02714',
 '028',
 '02863',
 '02870',
 '029',
 '02pm',
 '02welcome',
 '03',
 '030',
 '031',
 '032',
 '0325',
 '033',
 '0331',
 '033s',
 '034',
 '035',
 '036',
 '03747',
 '039',
 '03am',
 '03eb',
 '03pm',
 '04',
 '040',
 '041',
 '042',
 '043',
 '045',
 