# Import Libraries

In [16]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load Data

In [17]:
#Read the data
df=pd.read_csv("news.csv", engine="python", error_bad_lines=False)
#Get shape and head
df.shape
df.head()

Skipping line 405: unexpected end of data


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Analyzing Dataset

## max, min, avg number of characters in text instances

In [18]:
max_char_count = 0
min_char_count = 99999999
total = 0
for i in range(len(df)):
  length = len(df.get("text")[i])
  total += length
  if min_char_count > length and length > 1:
    min_wchar_count = length
  elif max_char_count < length:
    max_char_count = length
    
print("minimum char count is " + str(min_char_count) + "\n",
      "maximum char count is " + str(max_char_count) + "\n",
      "avg char count is " + str(int(total / len(df))))

minimum char count is 99999999
 maximum char count is 1
 avg char count is 5062


## max, min, avg number of words in text instances

In [19]:
max_word_count = 0
min_word_count = 99999999
total = 0
count = df['text'].str.split().str.len()
# count.index = count.index.astype(str) + ' words:'
# count.sort_index(inplace=True)
for index in count:
  total += index
  if index > max_word_count:
    max_word_count = index
  elif index < min_word_count and index > 0:
    min_word_count = index

print("minimum word count is " + str(min_word_count) + "\n",
      "maximum word count is " + str(max_word_count) + "\n",
      "avg word count is " + str(int(total / len(df))))

minimum word count is 6
 maximum word count is 7503
 avg word count is 835


# Data Preprocessing

## get the labels

In [20]:
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

# Split Dataset Into Training and Test Set

In [21]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

## Tfidf vectorizer

In [32]:
#DataFlair - Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)

#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [33]:
# tfidf_vectorizer.get_feature_names()
tfidf_vectorizer.vocabulary_

{'include': 425,
 'based': 94,
 'reported': 746,
 'lead': 497,
 'department': 245,
 'president': 685,
 'barack': 92,
 'obama': 612,
 'called': 126,
 'russian': 773,
 'putin': 712,
 'tuesday': 917,
 'violence': 939,
 'ukraine': 921,
 'russia': 772,
 'support': 869,
 'peace': 646,
 'talks': 881,
 'opportunity': 628,
 'france': 345,
 'reach': 722,
 'white': 969,
 'house': 413,
 'said': 775,
 'continues': 201,
 'actions': 39,
 'including': 426,
 'troops': 910,
 'weapons': 959,
 'rise': 763,
 'sanctions': 776,
 'sent': 800,
 'government': 371,
 'nation': 589,
 'planned': 657,
 'wednesday': 961,
 'united': 924,
 'states': 855,
 'previous': 688,
 'agreement': 48,
 'september': 801,
 'largely': 491,
 'new': 601,
 'deal': 231,
 'key': 476,
 'points': 662,
 'international': 440,
 'border': 110,
 'spokesman': 843,
 'important': 423,
 'sides': 816,
 'come': 178,
 'ready': 725,
 'just': 472,
 'make': 538,
 'live': 518,
 'spoke': 842,
 'life': 512,
 'military': 568,
 'east': 272,
 'killed': 477,
 'p