In [3]:
import numpy as np
import pandas as pd

# untuk misahkan data 80%-20%
from sklearn.model_selection import train_test_split
# untuk mengubah text menjadi vector agar dapat dibandingkan dengan nilai fake or real
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


In [4]:
data = pd.read_csv("fake_or_real_news.csv")

In [5]:
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [8]:
data['fake'] = data['label'].apply(lambda x: 0 if x == "REAL" else 1)
# kalau 0 berarti true(fake=false), kalau 1 berarti fake

In [9]:
data

Unnamed: 0,id,title,text,label,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,0


In [10]:
data = data.drop("label", axis=1)

In [11]:
data

Unnamed: 0,id,title,text,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


In [15]:
x, y = data['text'], data['fake']

In [16]:
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [17]:
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [19]:
x_train

6281    WASHINGTON -- House Republicans are hoping the...
972     Getting 10 Minutes of Sunlight Per Day Can Sto...
4300    To be fair, Clinton has been on the ugly end o...
3184    Republicans have said for years the first lady...
4839    When a prominent, progressive establishment th...
                              ...                        
1547    Print \nIn a story that predictably did not ma...
4648    In a Monday column for the far-right website W...
3404    The Republican National Committee triumphantly...
6062    Killing Obama administration rules, dismantlin...
4285    26 Shares\n21 4 0 1\nA new video purportedly r...
Name: text, Length: 5068, dtype: object

In [20]:
len(x_train)

5068

In [21]:
len(x_test)

1267

In [23]:
# vectorize text

vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [25]:
classifier = LinearSVC()
classifier.fit(x_train_vectorized, y_train)

In [29]:
classifier.score(x_test_vectorized, y_test)
#artinya sejumlah ini *100% yang dikategorikan(diclassify) dengan benar

0.9455406471981057

In [31]:
len(y_test) * classifier.score(x_test_vectorized, y_test)

1198.0

In [34]:
with open("fake_text1.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [39]:
vectorized_text = vectorizer.transform([text])

In [40]:
classifier.predict(vectorized_text)
# hasil 1 yang artinya fake, pakai data external

array([1], dtype=int64)

In [41]:
#pakai sebagian data 20% tadi
with open("mytext.txt", "w", encoding="utf-8") as f:
    f.write(x_test.iloc[10])

In [42]:
with open("mytext.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [43]:
vectorized_text = vectorizer.transform([text])

In [49]:
classifier.predict(vectorized_text)
# mengecek apakah hoax atau bukan, 0 bukan hoax, 1 hoax

array([0], dtype=int64)

In [48]:
y_test.iloc[10]
# membuktikan jawaban benar bahwa bukan hoax

0