In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
import nltk
import spacy
import string
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [3]:
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit

In [4]:
import pickle

In [5]:
# load data
df = pd.read_csv('./Dataset/news_articles.csv')
df.head()

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [6]:
def missing_zero_values(df):
    """
    This function is to check the missing values in the dataframe
    """
    sizeUnique = []
    for col in df.columns:
        sizeUnique.append(df[col].nunique())
    mis_val = df.isnull().sum()
    mis_val_percent = round(df.isnull().mean().mul(100),2)
    mz_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mz_table_ren_columns = mz_table.rename(columns = {0 : "Missing Values", 1 : "% of Total Values"})
    mz_table_ren_columns['Unique Values'] = sizeUnique
    mz_table_ren_columns['Data_type'] = df.dtypes
    mz_table_ren_columns = mz_table_ren_columns.sort_values(by = "% of Total Values", ascending = False)
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n")
    return mz_table_ren_columns.reset_index()
          

In [7]:
missing_zero_values(df).style.background_gradient(cmap='summer_r')

Your selected dataframe has 12 columns.



Unnamed: 0,index,Missing Values,% of Total Values,Unique Values,Data_type
0,text_without_stopwords,50,2.39,1937,object
1,text,46,2.19,1941,object
2,title_without_stopwords,2,0.1,1780,object
3,language,1,0.05,5,object
4,site_url,1,0.05,68,object
5,main_img_url,1,0.05,1229,object
6,type,1,0.05,8,object
7,label,1,0.05,2,object
8,hasImage,1,0.05,2,float64
9,author,0,0.0,491,object


In [8]:
fig = px.imshow(df.isnull(), labels={"x": "Columns", "y": "Rows", "color": "Number of missing values"})
fig.update_layout(title='Missing Values')
fig.show()

In [9]:
news = df.copy()
news.dropna(subset=['text','title','label'], inplace=True)

# Discover and Visualize Data to gain Insight

In [10]:
# check the distribution of the labels
px.pie(names=news['label'].value_counts().index, 
       values=news['label'].value_counts(), title='Distribution of the labels')

In [11]:
# convert publishedAt to datetime
def split_date(date):
    date = date.split('T')[0]
    return date
news['newsDate'] = news['published'].apply(split_date)
news['newsDate'] = pd.to_datetime(news['newsDate'],errors='coerce' ,format='%Y-%m-%d')

In [12]:
temp = news.groupby([news.newsDate.dt.floor('d'),'label']).size().reset_index(name='count')
px.line(temp,x="newsDate", y="count", color="label", line_group="label")

In [13]:
news['len_text'] = news['text'].apply(len)
news['len_title'] = news['title'].apply(len)

In [14]:
fig = go.Figure()
fig.add_trace(go.Box(x=news['label'], y=news['len_text'], name='Text',marker_color='#3D9970'))
fig.add_trace(go.Box(x=news['label'], y=news['len_title'], name='Title',marker_color='#FF4136'))

fig.update_layout(
    title='Recognize outliers',
    yaxis2=dict(
        title="Len Title",
        titlefont=dict(color="#1f77b4"),tickfont=dict(color="#1f77b4"),
        anchor="x",overlaying="y",side="right",position=0.85),yaxis_title='Len Text',boxmode='group')
fig.show()

In [15]:
px.histogram(news, x="len_text", color='label', marginal="box",title='Distribution of the text length')

In [16]:
temp = news.groupby(['site_url','label']).size().reset_index(name='count')
fig = px.bar(temp,y='site_url',x='count',color='label',orientation='h',height=600)
fig.update_traces(texttemplate='%{y:.2s}',textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

This chart shows us; Five sites published more fake news.

In [17]:
# distribution of special characters
def count_special_characters(text):
    count = 0
    for char in text:
        if char in string.punctuation:
            count += 1
    return count

In [18]:
news['count_special_characters'] = news['text'].apply(count_special_characters)

In [19]:
px.histogram(news, x="count_special_characters", color='label', marginal="box",title='Distribution of the special characters')

Facke news has more punctuation and capitalization than real news.

In [20]:
# word cloud
total_title_fake = ' '.join(news[news.label == 'Fake']['title_without_stopwords'].astype(str))
total_title_real = ' '.join(news[news.label == 'Real']['title_without_stopwords'].astype(str))
word_cloud_real = WordCloud(max_font_size=100,width=800,height=400).generate(total_title_real)
word_cloud_fake = WordCloud(max_font_size=100,width=800,height=400).generate(total_title_fake)

In [21]:
fig = px.imshow(word_cloud_real,color_continuous_scale='gray',title='Real news title')
fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()

In [22]:
fig = px.imshow(word_cloud_fake,color_continuous_scale='gray',title='Fake news title')
fig.update_layout(coloraxis_showscale=False)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()

# Text Preprocessing

In [23]:
# remove missing values
df.isnull().sum()

author                      0
published                   0
title                       0
text                       46
language                    1
site_url                    1
main_img_url                1
type                        1
label                       1
title_without_stopwords     2
text_without_stopwords     50
hasImage                    1
dtype: int64

In [24]:
df.dropna(subset=['text','title','label'], inplace=True)

In [25]:
# Merge two columns
df['total_text'] = df['text'] + ' ' + df['title']

1. Remove all punctuation
    * "-","!",'"',"#","%","&","'","(",")", etc
2. Remove all stopwords
    * words that occur too frequently and not considered informative  
        Examples :  
        {‘the’, ‘a’, ‘an’, ‘and’, ‘but’, ‘for’, ‘on’, ‘in’, ‘at’ …}
3. Perform lemmatization
    * Convert the word or token in its Base form.    
        Examples :    
        Stay, Stays, Staying, Stayed —> Stay    
        House, Houses, Housing —> House  
4. Tokenize news; Returns a list of the cleaned text

In [75]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maili\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [83]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maili\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

In [26]:
def text_preprocessing(text):
    text = text.lower()
    # remove punctuation
    text = [letter for letter in text if letter not in string.punctuation]
    # join the list of characters into a string
    text = ''.join(text)
    # remove stopwords
    text = [word for word in text.split() if word not in stopwords.words('english')]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [27]:
df['total_text'] = df['total_text'].apply(text_preprocessing)

In [28]:
for index,row in df['total_text'].iteritems():
    df['total_text'][index] = ' '.join(row)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
x = vectorizer.fit_transform(df['total_text'])

In [31]:
file = open('vectorizer.pickle', 'wb')
pickle.dump(vectorizer, file)
file.close()

In [32]:
le = LabelEncoder()
y = le.fit_transform(df['label'])

In [33]:
file = open(file = 'label_encoder.pickle', mode = 'wb')
pickle.dump(le, file)
file.close()

# create test and train data

In [34]:
split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train_index, test_index in split.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

In the previous visualization, I found out we have the undistributed label; it's obvious, the test set has to be representative of the overall labels population, so I chose this method to separate my dataset.

# model

In [35]:
# naive bayes
from sklearn.naive_bayes import MultinomialNB
navie_bayes = MultinomialNB().fit(x_train, y_train)
navie_bayes.score(x_test, y_test)

0.6414634146341464

In [36]:
# random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(x_train, y_train)
random_forest.score(x_test, y_test)

0.6292682926829268

In [37]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression().fit(x_train, y_train)
logistic_regression.score(x_test, y_test)

0.7341463414634146

In [38]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier().fit(x_train, y_train)
decision_tree.score(x_test, y_test)

0.7219512195121951

In [39]:
# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gradient_boosting = GradientBoostingClassifier().fit(x_train, y_train)
gradient_boosting.score(x_test, y_test)

0.7585365853658537

In [40]:
# SVM
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=1).fit(x_train, y_train)
svc.score(x_test, y_test)

0.7878048780487805

In [41]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
kneighbors = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
kneighbors.score(x_test, y_test)

0.7463414634146341

In [42]:
# MLP
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1).fit(x_train, y_train)
mlp.score(x_test, y_test)

0.6292682926829268

**SVM** has the highest accuracy, so I'd definitely choose the SVM algorithm 

In [43]:
svc = SVC(kernel='linear', C=1).fit(x, y)
file = open('svc_news_detection.pickle', 'wb')
pickle.dump(svc, file)
file.close()