In [1]:
import pandas as pd
import numpy as np
import math
import re
import nltk
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [2]:
dataset = pd.read_csv("news.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
dataset.shape

(6335, 4)

# Cleaning Dataset

In [5]:
dataset.isnull()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
6330,False,False,False,False
6331,False,False,False,False
6332,False,False,False,False
6333,False,False,False,False


In [15]:
dataset.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

# Making Categorical Dataset

In [16]:
dataset.columns = ["Index", "title", "text", "label"]
dataset.drop("Index", axis = "columns")

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [17]:
label = pd.get_dummies(dataset["label"], drop_first = True)
label.head()

Unnamed: 0,REAL
0,0
1,0
2,1
3,0
4,1


In [18]:
dataset = pd.concat([dataset, label], axis = 1)

In [19]:
dataset = dataset.drop(["Index","label"], axis = 1)
dataset.head()

Unnamed: 0,title,text,REAL
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


# Cleaning and preprocessing

1.Regex

In [20]:
#remove punctuations from the string

s = "[]</> hello please ! @ # $ %^&*say()-my +@@@name|\$;.~"

In [21]:
s = re.sub(r"[^\w\s]", "",s)

In [22]:
print(s)

 hello please     saymy name


# 2. Tokenization

In [12]:
#downloading nltk data
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
nltk.word_tokenize("Hello how are you")

['Hello', 'how', 'are', 'you']

# StopWords

In [25]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [23]:
sentence = "Wash your hand frequently to stop the spread of noval corona virus"

In [26]:
words = nltk.word_tokenize(sentence)
words = [w for w in words if w not in stop_words]

In [27]:
words

['Wash', 'hand', 'frequently', 'stop', 'spread', 'noval', 'corona', 'virus']

# Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
input_str = "Root word of studies  is bassically same as study"

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [29]:
#Tokenize the sentence
input_str = nltk.word_tokenize(input_str)
#print(input_str)

#Lemmatization each word
for words in input_str:
    print(lemmatizer.lemmatize(words))

Root
word
of
study
is
bassically
same
a
study


In [30]:
dataset["Total"] = dataset["title"] + dataset["text"]
dataset = dataset.drop(["title", "text"], axis = 1)
dataset.head(10)

Unnamed: 0,REAL,Total
0,0,"You Can Smell Hillary’s FearDaniel Greenfield,..."
1,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,1,Kerry to go to Paris in gesture of sympathyU.S...
3,0,Bernie supporters on Twitter erupt in anger ag...
4,1,The Battle of New York: Why This Primary Matte...
5,0,"Tehran, USA \nI’m not an immigrant, but my gr..."
6,0,Girl Horrified At What She Watches Boyfriend D...
7,1,‘Britain’s Schindler’ Dies at 106A Czech stock...
8,1,Fact check: Trump and Clinton at the 'commande...
9,1,Iran reportedly makes new push for uranium con...


# Preprocessing the dataset

In [33]:
lemmatizer = WordNetLemmatizer()
for index,row in dataset.iterrows():
    filter_sentence = " "
    
    sentence = row["Total"]
    sentence = re.sub(r"[^\w\s]", "", sentence)#cleaning
    
    words    = nltk.word_tokenize(sentence)#tokensation
    words    = [w for w in words if not w in stop_words]#removing stop words
    
    for word in words:
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(word)).lower()
        
    dataset.loc[index, "Total"] = filter_sentence

In [34]:
dataset.head()

Unnamed: 0,REAL,Total
0,0,smell hillary feardaniel greenfield shillman...
1,0,watch exact moment paul ryan committed polit...
2,1,kerry go paris gesture sympathyus secretary ...
3,0,bernie supporter twitter erupt anger dnc tri...
4,1,battle new york primary mattersits primary d...


In [35]:
x = dataset["Total"]
y = dataset["REAL"]

# Converting text into numeric matrix

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [148]:
cv = CountVectorizer()
mat_train = cv.fit_transform(x_train).todense()  

# TF- iDF Vectorizer

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
def vectorize_text(features, max_features):
    vectorizer = TfidfVectorizer( stop_words='english',
                            decode_error='strict',
                            analyzer='word',
                            ngram_range=(1, 2),
                            max_features=max_features                   
                            )
    feature_vec = vectorizer.fit_transform(features)
    return feature_vec.toarray()

In [40]:
tfidf_features = vectorize_text(['hello how are you doing','hi i am doing fine'],10)

In [41]:
tfidf_features

array([[0.44943642, 0.        , 0.        , 0.6316672 , 0.6316672 ,
        0.        , 0.        ],
       [0.33517574, 0.47107781, 0.47107781, 0.        , 0.        ,
        0.47107781, 0.47107781]])

# Applying on dataset

In [42]:
#Feature extraction using count vectorization and tfidf.
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(x)
freq_term_matrix = count_vectorizer.transform(x)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

In [43]:

#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(tf_idf_matrix, y, random_state=0)

# Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, Y_train)
pred = logreg.predict(X_test)


In [45]:
cm = confusion_matrix(Y_test, pred)
print(cm)
print(accuracy_score(Y_test, pred))

[[715  52]
 [ 55 762]]
0.9324494949494949


# Creating Pipeline 

In [46]:
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import joblib

In [47]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', linear_model.LogisticRegression(C=1e5)),
])

In [48]:
pipeline.fit(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=100000.0))])

In [49]:
pipeline.predict(["""U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sundayâ€™s unity march against terrorism.

Kerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.

The visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.

The French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sundayâ€™s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, and Kerry had prior commitments.

Among roughly 40 leaders who did attend was Israeli Prime Minister Benjamin Netanyahu, no stranger to intense security, who marched beside Hollande through the city streets. The highest ranking U.S. officials attending the march were Jane Hartley, the ambassador to France, and Victoria Nuland, the assistant secretary of state for European affairs. Attorney General Eric H. Holder Jr. was in Paris for meetings with law enforcement officials but did not participate in the march.

Kerry spent Sunday at a business summit hosted by Indiaâ€™s prime minister, Narendra Modi. The United States is eager for India to relax stringent laws that function as barriers to foreign investment and hopes Modiâ€™s government will act to open the huge Indian market for more American businesses.

In a news conference, Kerry brushed aside criticism that the United States had not sent a more senior official to Paris as â€œquibbling a little bit.â€ He noted that many staffers of the American Embassy in Paris attended the march, including the ambassador. He said he had wanted to be present at the march himself but could not because of his prior commitments in India.

â€œBut that is why I am going there on the way home, to make it crystal clear how passionately we feel about the events that have taken place there,â€ he said.

â€œAnd I donâ€™t think the people of France have any doubts about Americaâ€™s understanding of what happened, of our personal sense of loss and our deep commitment to the people of France in this moment of trauma.â€"""])

array([1], dtype=uint8)

In [65]:
#saving pipeline
filename = "pipeline.sav"
joblib.dump(pipeline, filename)

['pipeline.sav']

In [66]:
filename = "./pipline.sav"