In [0]:
import numpy as np 
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier,SGDClassifier,LogisticRegression,LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import os
import re
BASE_DIR="../input/repository/jayvasantjv-FAKE-NEWS-CLASSIFIER-4df8a36/"
df=pd.read_csv(os.path.join(BASE_DIR,'data.csv'))

<h3>Data is already prepared .i.e string is cleaned to just collection of words</h3>
1 in label means FAKE and 0 means REAL

In [0]:
df.sample(10)

Unnamed: 0,title,text,label
13476,Fellated by a dead pig You may have been a vic...,November 8 2016 Helplines have been jammed wit...,1
17496,Time Channel is a go,Friday 28 October 2016 Space station Time Chan...,1
17185,We are fake news or are we,In an elaborate game of double bluff respectab...,1
20807,Trump Meets With Al Gore on Climate Change Whi...,Donald J Trump and his daughter Ivanka met wi...,0
6782,Die Hard 3 Predicts Hillary Clinton To be Pres...,Hollyjood strikes again Quoting Buck Bundy 731...,1
9086,Feds Once Again Threatening States Over REAL ID,October 26 2016 Feds Once Again Threatening St...,1
3653,Disney CEO Bob Iger Quits Trump Advisory Counc...,Walt Disney Company CEO Robert Iger has quit h...,0
6626,Franken I m Troubled By Sessions Exaggerating ...,Tuesday on MSNBC s For the Record with Greta w...,0
11968,Steve King Hurling Insults at Immigrants Is Re...,WASHINGTON Long before Donald J Trump took aim...,0
4081,Donald Trump Is Elected President in Stunning ...,Donald John Trump was elected the 45th preside...,0


<h3>Converting objects which are actually string to string. <h3>

In [0]:
df['title'] = df['title'].astype('str') 
df['text'] = df['text'].astype('str') 

<h3>Creating a new column content =title+text<h3>

In [0]:
df['content']=df['title']+df['text']

<h3>Preparing Independent and Dependent Variables and splitting for testing and training </h3>

In [0]:
y = df['label']
X=df['content']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

<h3> Testing multiple classifiers to find which will work well . Using Pipleline to define workflows</h3>

<h3>Passive Agressive Classifier</h3>

In [0]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', PassiveAggressiveClassifier(max_iter=50,tol=1e-3)),
              ])
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.9535769719204177


<h3>SGD Classifier</h3>

In [0]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))



accuracy 0.9088471849865952


<h3> Logistic Regression</h3>

In [0]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=1000)),
               ])
logreg.fit(X_train, y_train)
#logreg = LogisticRegression()
#logreg.fit(tfidf_train, y_train)
#pred = logreg.predict(tfidf_test)
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))



accuracy 0.9540002822068576


<h3> XGBoost</h3>

In [0]:
xgb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', XGBClassifier()),
               ])
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.9333991816001129


<h3> Logistic Regression seems to be working well.<br>
In general, Logistic Regression works well for binary output</h3>
<h3>Now lets try to predict from data outside of our prepared dataset<br>
Each string in `check_test` is a news copy pasted from searching  from real news and fake news mixed sources. </h3>

In [0]:

#Function to format strings i.e remove special characters and punctuations to create a string of just space separated words
def format_string(s):
    return re.sub(r'\W+', ' ', s)


news_list=[
    
    #Fake-1
    format_string('''New Delhi: In a move likely to see ruling BJP sweep urban areas, the party has promised double digit increment for everyone working in the IT sector. Double digit increment is generally regarded as a Unicorn or aliens in Indian IT sector, 
    something that has been heard about often but never seen, but BJP has promised that they will make it a reality.
Narendra Modi
Speaking to the press, party spokesperson Sambit Patra said, “Back in 2007, Modi Ji had met an IT Engineer, at that time, he told Modi Ji that his annual salary is 3 lakhs 20 thousands, then Modi Ji met him again recently and asked about his salary and he told Modi Ji that his annual package now is 3 Lakhs 50 thousands. This really shocked Modi Ji and he came to know about this problem of low increments in the IT sector.”

“Now because of code of conduct, we can’t do anything but if BJP is voted to power, each and everyone working in the IT sector will be given double digit increments. The chowkidars employed by these organisations may even get triple digit increments. We are not doing it looking at the elections, we genuinely feel that this is a pain point that needs to be addressed”, Mr. Patra said.

As per sources, BJP has sent letters to all the HR managers working in IT sector, asking them to halt their preparations for Diwali, and come to Delhi for a meeting where the details of this increment scheme can be thrashed out.

Meanwhile, Congress has dismissed this promise by BJP as an election gimmick and Party President Rahul Gandhi has said that Congress will bring in a “Right to On-Site” if they form the government after the elections.'''),
   
     #Real-0
    format_string('''Eight months after his breakthrough meeting with North Korean leader Kim Jong-un, Mr. Trump this week heads into the second denuclearization summit under greater scrutiny to bring back more tangible results from the high-stakes parley in Vietnam.
                    Some analysts expect an agreement that would require North Korea to freeze its production of fissile material at its Yongbyon Nuclear Research Center, used to make nuclear weapons. It’s not clear what Mr. Trump is willing to give in return.
                    Their first meeting in Singapore produced vague promises from Mr. Kim to abandon his weapons programs, but there has been scant progress on that front since then despite months of follow-up talks between the two sides.
                    The U.S. “didn’t really get anything out of that Singapore summit, just an aspirational statement,” said Sue Mi Terry, a former CIA analyst and chair of the Korea program at the Center for Strategic and International Studies. “You can argue that not much has really changed.'''),
    
   #Real-0
    format_string('''NEW DELHI: The Supreme Court on Tuesday declined an urgent hearing of Patidar leader Hardik Patel's plea seeking suspension of his conviction in a 2015 case
relating to rioting.
The matter was mentioned for urgent listing before a bench headed by Justice Arun Mishra.
The bench also comprising justices M M Shantanagoudar and Navin Sinha told the counsel appearing for Patel that there was no urgency in hearing the matter as the
high court order was passed in August last year.
Patel, who recently joined the Congress, had moved the court on Monday + challenging the Gujarat high court order rejecting his plea to stay his conviction so that
he could contest the upcoming Lok Sabha elections.
The 25-year-old Patidar leader had started preparations to contest from Jamnagar on a Congress ticket after joining the party on March 12 and the last date for filing
of nominations is April 4.
Polling for 26 Lok Sabha seats in Gujarat will be held on April 23.
The sessions court at Visnagar in Mehsana district had sentenced Patel to two years' imprisonment + for rioting and arson in Visnagar town in 2015 during the
Patidar quota stir which he led.'''),
    
    #Real-0
    format_string('''The National Investigative Agency (NIA) has identified the owner of vehicle has identified the owner of the vehicle, a Maruti Eeco used in the Pulwama terror attack that killed 40 CRPF personnel on February 14.
                    Sajjad Bhat, who is from Anantnag district, has joined Jaish-e-Mohammad (JeM) and his pictures carrying weapons have gone viral on social media since Monday morning. He has also been evading arrest, according to officials.
                    Officials said that NIA has made a significant breakthrough in the investigation into the terror attack on CRPF convoy that took place on February 14 in which 40 jawans were killed when a Jaish suicide bomber rammed the explosive-laden Eeco into one of the buses.'''),
   
    #Fake-1
    format_string('''Farmer and businessman Robert Vadra, who is currently being questioned for his alleged involvement in money laundering, has demanded that he be provided with dumbbells at the Enforcement Directorate office.
Robert Vadra in gym
Vadra skipped the questioning a few days back due to loose motion. His stomach problem however was not due to something he ate. Sources say that Vadra’s gym schedule was in a mess due to the questioning and that led to his loose motions.
While speaking to Faking News Robert said, “I am very particular about my fitness and my style of dressing. Any deviation from my routine and I immediately get loose motions. So I have requested the ED officials to at least provide me with a set of dumbbells.”
Vadra is also keen on teaching ED officials a few exercises to help them shed some weight. “I am ready to help them get back in shape. Some of them have protruding bellies. Just a few exercises and all of them can have a physique like me,” he added while pulling up his undershirt to reveal his six packs.
On earlier occasion, the businessman dressed in a suit, commented on how ED officials lacked style.
It is not yet clear if his wife and new political entrant would accompany him in weight training at the ED office. Congress officials reveal that it is highly likely that she might lift the dumbbells for the camera, but a full workout routine is out of question.
A spokesperson for the Enforcement Directorate denied any knowledge of any such request from Robert Vadra. “Let me be clear, the loose motion that Mr. Vadra is talking about is a direct consequence of the grilling that he went through at our office. In fact even during the questioning, he visited the washroom several times. As for the dumbbells, he is welcome to do push-ups but we wont be providing him with weights'''),
]

check_test=np.array(news_list)

In [0]:
y_pred = logreg.predict(check_test)
y_pred

array([1, 0, 0, 0, 1])

In [0]:
y_pred = xgb.predict(check_test)
y_pred

array([1, 0, 1, 0, 1])

In [0]:
y_pred = nb.predict(check_test)
y_pred

array([1, 0, 0, 0, 1])

In [0]:
filename = 'fakenews_model.pkl'
joblib.dump(logreg, filename)

['fakenews_model.pkl']