# Import Libraries

In [None]:
import itertools

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Read the data. Malformed rows are skipped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its replacement is
# `on_bad_lines`. Older pandas rejects the new keyword with a TypeError, so fall back
# to the legacy spelling there.
try:
    df = pd.read_csv("news.csv", engine="python", on_bad_lines="warn")
except TypeError:
    df = pd.read_csv("news.csv", engine="python", error_bad_lines=False)

# Show the shape, then the last rows. `tail` has to be *called*: a bare `df.tail`
# only displays the bound-method object, and a bare `df.shape` that is not the
# last expression of the cell is never displayed at all.
print(df.shape)
df.tail()

<bound method NDFrame.tail of       Unnamed: 0  ... label
0           8476  ...  FAKE
1          10294  ...  FAKE
2           3608  ...  REAL
3          10142  ...  FAKE
4            875  ...  REAL
...          ...  ...   ...
6330        4490  ...  REAL
6331        8062  ...  FAKE
6332        8622  ...  FAKE
6333        4021  ...  REAL
6334        4330  ...  REAL

[6335 rows x 4 columns]>

# Analyzing Dataset

## Maximum, minimum, and average number of characters per text instance

In [None]:
# Character length of each article's text, computed in one vectorized pass.
# The earlier hand-written loop stored the minimum under a misspelled name
# (`min_wchar_count`) and chained the maximum check with `elif`, so neither
# `min_char_count` nor `max_char_count` was ever tracked correctly.
char_counts = df["text"].str.len()

# Texts of length 0 or 1 are treated as empty and excluded from the minimum only;
# the maximum and the average are taken over every row.
min_char_count = int(char_counts[char_counts > 1].min())
max_char_count = int(char_counts.max())
total = int(char_counts.sum())

print("minimum char count is " + str(min_char_count) + "\n",
      "maximum char count is " + str(max_char_count) + "\n",
      "avg char count is " + str(int(total / len(df))))

minimum char count is 99999999
 maximum char count is 1
 avg char count is 5062


## Maximum, minimum, and average number of words per text instance

In [None]:
# Number of whitespace-separated words in each article's text.
count = df['text'].str.split().str.len()

# Minimum and maximum are computed independently. The previous `if / elif` chain
# skipped the minimum check for every row that set a new running maximum, so the
# first row (always a new maximum) could never be reported as the minimum.
# Rows with zero words are excluded from the minimum only.
min_word_count = int(count[count > 0].min())
max_word_count = int(count.max())
total = int(count.sum())

print("minimum word count is " + str(min_word_count) + "\n",
      "maximum word count is " + str(max_word_count) + "\n",
      "avg word count is " + str(int(total / len(df))))

minimum word count is 6
 maximum word count is 7503
 avg word count is 835


# Data Preprocessing

## Get the labels

In [None]:
# The `label` column is the classification target; preview its first rows.
labels = df["label"]
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

# Split Dataset Into Training and Test Set

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    labels,
    test_size=0.2,
    random_state=7,
)

## TF-IDF vectorizer

In [None]:
# TF-IDF vectorizer configuration: English stop words removed, max_df=0.7,
# and the vocabulary capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    max_features=1000,
)

# Learn the vocabulary and IDF weights from the training texts only, then reuse
# that fitted vectorizer to transform the test texts.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [None]:
# Inspect the fitted vocabulary (term -> feature index mapping) of the TF-IDF vectorizer.
tfidf_vectorizer.vocabulary_

In [None]:
# Binary bag-of-words: with binary=True each feature records only whether a term
# occurs in an article, not how often. stop_words and max_df match the TF-IDF
# vectorizer fitted earlier; the vocabulary is capped at 100 terms here.
# CountVectorizer is imported with the other dependencies at the top of the notebook.
vec = CountVectorizer(
    binary=True,
    stop_words="english",
    max_df=0.7,
    max_features=100,
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.7, max_features=100, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# Dense table of the vectorized training texts: one row per article, one column
# per vocabulary term, with the column labels taken from the sorted vocabulary.
a = pd.DataFrame(
    data=vec.transform(x_train).toarray(),
    columns=sorted(vec.vocabulary_),
)
a

Unnamed: 0,10,2016,according,america,american,americans,believe,better,big,called,campaign,candidate,change,clear,clinton,come,country,day,days,democratic,did,does,don,donald,election,end,fact,far,going,good,government,group,help,high,hillary,house,including,john,just,know,...,post,president,presidential,public,really,recent,republican,republicans,right,said,say,saying,says,state,states,support,things,think,time,times,today,told,trump,united,use,used,ve,vote,voters,want,war,washington,way,week,white,won,work,world,year,years
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,1,0,1,1,1,1,0,1,1,1,0,0,1,0,0,1,0,1,1,0,1,1,0,1,0,0,1,1,1,1,1
4,1,0,1,0,0,0,1,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,1,0,1,1,1,...,0,0,1,0,1,1,1,1,1,1,1,0,1,1,0,0,1,1,1,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5063,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5064,0,1,0,1,1,1,0,1,1,0,1,1,0,0,1,1,1,0,0,1,0,1,1,0,1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,...,0,1,1,0,1,0,1,1,1,1,1,1,1,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,0,1,0,1,1,0,0,1,1
5065,1,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,...,1,0,0,0,1,1,0,1,1,0,1,1,0,1,0,0,1,1,1,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,1,1,0,0,0,1
5066,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,...,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,0,1,1,0,1,1,1,1,1


In [None]:
# Write the table `a` to the Excel file "data.xlsx".
a.to_excel("data.xlsx")

In [None]:
# TF-IDF vectorizer limited to 100 terms. TfidfVectorizer is already imported at
# the top of the notebook, so it is not imported a second time here.
# NOTE: this rebinds `vec`, which until now referred to the CountVectorizer; any
# cell that uses `vec` depends on which of the two fitting cells ran last.
vec = TfidfVectorizer(stop_words="english", max_df=0.7, max_features=100)
vec.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.7, max_features=100,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Dense table of the training texts as transformed by the current `vec`: one row
# per article, one column per vocabulary term (labels from the sorted vocabulary).
b = pd.DataFrame(
    data=vec.transform(x_train).toarray(),
    columns=sorted(vec.vocabulary_),
)

In [None]:
# Write the table `b` to the Excel file "idf.xlsx".
b.to_excel("idf.xlsx")