In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
data_fake = pd.read_csv("data/Fake.csv")
data_true = pd.read_csv("data/True.csv")

## Cleaning Data

### Data Fake

In [3]:
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
data_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [5]:
# Check Duplicate Data
data_fake.duplicated().sum()

3

In [6]:
# Drop Duplicate Data
data_fake = data_fake.drop_duplicates()

In [7]:
# Reset Index Data
data_fake.reset_index(inplace=True)
data_fake = data_fake.drop(["index"], axis=1)
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


### Data True

In [8]:
data_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [9]:
data_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [10]:
# Checking Duplicate Data
data_true.duplicated().sum()

206

In [11]:
# Drop Duplicated Data
data_true.drop_duplicates()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [12]:
# Reset Index Data
data_true.reset_index(inplace=True)
data_true = data_true.drop(["index"], axis = 1)
data_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


## Data Manipulation

In [13]:
# Create class variable to distinguish fake and true news
data_fake["class"] = 0
data_true["class"] = 1

In [14]:
# Checking Shape data
data_fake.shape, data_true.shape

((23478, 5), (21417, 5))

In [15]:
# Separete fake data and create fake data manual testing
manual_testing_true = data_fake.tail(50) #50 data from the bottom fake data
for i in range(21416, 21366, -1):
    data_true.drop([i], axis = 0, inplace=True)

In [16]:
# Separete true data and create two data manual testing
manual_testing_fake = data_fake.tail(50) #50 data from the bottom true data
for i in range(23477, 23427, -1):
    data_fake.drop([i], axis = 0, inplace=True)

In [17]:
# Checking Shape data
data_fake.shape, data_true.shape, manual_testing_fake.shape, manual_testing_true.shape

((23428, 5), (21367, 5), (50, 5), (50, 5))

In [18]:
# Create class variable to distinguish fake and true news
manual_testing_fake["class"] = 0
manual_testing_true["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual_testing_fake["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual_testing_true["class"] = 1


In [19]:
# Merging Data
merging_data = pd.concat([data_fake, data_true], axis=0)
merging_data.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [20]:
# remove unnecessary data columns
df = merging_data.drop(["title", "subject", "date"], axis=1)

In [21]:
# Suffle Data
df = df.sample(frac = 1)

In [22]:
df.reset_index(inplace=True)
data = df.drop(["index"], axis=1)
data.head()

Unnamed: 0,text,class
0,Tucker to sanctuary city supporter: how did a ...,0
1,(Reuters) - David Betras could see trouble com...,1
2,We already know that the citizens of Harney Co...,0
3,WASHINGTON (Reuters) - The White House said la...,1
4,GET OFF my fracking land!An irate farmer spray...,0


In [23]:
# Create Filtering Text Function to remove unnecessery character in data
def filtering_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # Remove the characters inside the square brackets
    text = re.sub('https?://\S+|www\.\S+', "", text) # removes all URLs (both those starting with "http" or "https" and those starting with "www")
    text = re.sub('<.*?>+', '', text) # removes all HTML tags (including tags that are HTML markup)
    text = re.sub('\n', "", text) # removes all newline characters
    return text

In [24]:
# Apply Function
data["text"] = data["text"].apply(filtering_text)

In [25]:
# Splitting data into X dan y variable for fitting model
X = data["text"]
y = data["class"]

In [26]:
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [27]:
# Vectorized text using TF-IDF Algorithm to convert text into number
vectorizer = TfidfVectorizer(stop_words="english")
xv_train = vectorizer.fit_transform(X_train)
xv_test = vectorizer.transform(X_test)

## Model Selection

In [28]:
# Create Function to choose the best model
def acc_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    y_pred = model.predict(x_test)
    
    print(f"accuracy : {round(acc*100, 5)} %")
    print(f"Classification Report:\n {classification_report(y_pred, y_test)}")

In [29]:
# Logistic Regression Model
lr = LogisticRegression()
acc_model(lr, xv_train, y_train, xv_test, y_test)

accuracy : 98.35919 %
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4684
           1       0.98      0.98      0.98      4275

    accuracy                           0.98      8959
   macro avg       0.98      0.98      0.98      8959
weighted avg       0.98      0.98      0.98      8959



In [30]:
# Decision Tree Model
dt = DecisionTreeClassifier()
acc_model(dt, xv_train, y_train, xv_test, y_test)

accuracy : 99.66514 %
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4701
           1       0.99      1.00      1.00      4258

    accuracy                           1.00      8959
   macro avg       1.00      1.00      1.00      8959
weighted avg       1.00      1.00      1.00      8959



In [31]:
# Random Forest Model
rf = RandomForestClassifier()
acc_model(rf, xv_train, y_train, xv_test, y_test)

accuracy : 99.34144 %
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4682
           1       0.99      0.99      0.99      4277

    accuracy                           0.99      8959
   macro avg       0.99      0.99      0.99      8959
weighted avg       0.99      0.99      0.99      8959



## Save Model

In [32]:
def save_model(model):
    joblib.dump(model, f"model/{model}.joblib")

In [33]:
save_model(lr)

In [34]:
save_model(dt)

In [35]:
save_model(rf)

In [36]:
save_model(vectorizer)