In [1]:

import pandas as pd
news_data = pd.read_csv('news.csv')

In [2]:
print(news_data.shape)
news_data.head()

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
#getting labels
labels = news_data.label
labels

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [4]:
#splitting into training datasets and testing datasets
#x will be our text and y will be the labels , (if x is y or not)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(news_data['text'],labels,test_size=0.1,random_state=7)

In [5]:
#TF : Term  frequency , IDF: inverse document frequency
#The TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',max_df=0.7)

tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [6]:
x_test

3534    A day after the candidates squared off in a fi...
6265    VIDEO : FBI SOURCES SAY INDICTMENT LIKELY FOR ...
3123    It's debate season, where social media has bro...
3940    Mitch McConnell has decided to wager the Repub...
2856    Donald Trump, the actual Republican candidate ...
                              ...                        
4623    Trending Articles: Trending Articles: Clinton ...
2632    Louisiana Gov. Bobby Jindal said on Monday he ...
28      If you want a glimpse into a presidential cand...
5282    6 Natural Herbs To Prevent Mental Disorders An...
2805    The presidential campaigns of Texas Sen. Ted C...
Name: text, Length: 634, dtype: object

In [7]:
#PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [8]:
#Predict on the test set and calculate accuracy
from sklearn.metrics import accuracy_score, confusion_matrix
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 95.43%


In [9]:
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[297,  16],
       [ 13, 308]], dtype=int64)

In [None]:
#test the below data if it is fake or real, :P
testThisdata = { 'id' :['1100111'],
        'title': [ 'Arijit Is The New Evil'],
        'text' : ['Arijit Killed Two U.S. Secretary of State']
        }
df_testThisdata = pd.DataFrame (testThisdata, columns = ['id','title' , 'text'])

print (df_testThisdata)

In [None]:
#Testing if df_testThisdata is real or fake using above model
tfidf_test_actualData = tfidf_vectorizer.transform(df_testThisdata)
y_pred=pac.predict(tfidf_test_actualData)
y_pred

# Code Flow
1.first split the dataset into training and testing dataset
2. split format will be 
    -> x_train,x_test,y_train,y_test
        x_train is the data for which we will train the model.
        y_train is the output for x_train
        we will fit x_train into TfidfVectorizer and store output in tfidf_train variable.
        using PassiveAggressiveClassifier we will fit tfidf_train and y_train.
        Now our model will be ready.
        now we will predict for x_test.
        we will fit x_test into TfidfVectorizer and store output in tfidf_test variable.
        then using PassiveAggressiveClassifier ,we will predict output for tfidf_test and store in y_predict variable.
        then we will compare y_test and y_predict and calculate accuracy.
        *y_test is the output of x_test.*