In [1]:
# Import necessary libraries for data manipulation and machine learning

# Import NumPy for numerical computing
import numpy as np

# Import pandas for data manipulation
import pandas as pd

# Import train_test_split function from scikit-learn
# to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Import TfidfVectorizer from scikit-learn
# to convert text data into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer

# Import LinearSVC from scikit-learn
# to train the linear support vector classifier
from sklearn.svm import LinearSVC

In [2]:
# Load the news dataset from a CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="fake_or_real_news.csv")

In [3]:
# Display the loaded DataFrame as the cell output to inspect its columns and a sample of rows.
data

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [4]:
# Replace the string 'label' column with a binary 'fake' target:
# "REAL" -> 0, "FAKE" -> 1.
#
# The guard keeps this cell idempotent: once 'label' has been replaced by
# 'fake', re-running the cell is a no-op instead of raising KeyError.
if "label" in data.columns:
    label_to_fake = {"REAL": 0, "FAKE": 1}

    # Fail loudly on anything that is not exactly "REAL" or "FAKE" (missing
    # values, typos, different casing) rather than silently counting it as fake.
    unexpected_labels = set(data["label"].unique()) - set(label_to_fake)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected label values: {sorted(unexpected_labels, key=str)}"
        )

    # Build the new frame in one step: add 'fake', then drop the now-redundant 'label'.
    data = data.assign(fake=data["label"].map(label_to_fake)).drop(columns="label")


In [5]:
# Confirm that 'label' has been replaced by 'fake'.
# A bare last expression is shown as the cell's output, so print() is not needed.
data.columns


Index(['Unnamed: 0', 'title', 'text', 'fake'], dtype='object')


In [6]:
# Features and target:
#   X - the raw article text ('text' column)
#   y - the binary target ('fake' column: 1 = fake, 0 = real)
X, y = data["text"], data["fake"]

# Fixed seed so the split is reproducible: without it every run shuffles the rows
# differently, which changes the reported accuracy and which article ends up at a
# given position of X_test.
RANDOM_STATE = 42

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

TF-IDF, which stands for Term Frequency - Inverse Document Frequency, is a metric used to measure the importance of a word in a document within a collection. It evaluates the significance of each word based on two factors: the frequency of the word within the document and its occurrence across all documents in the collection.

TF, or Term Frequency, represents the number of times a specific term appears in a document. It quantifies how frequently a word occurs within a document.

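IDF, or Inverse Document Frequency, is the logarithm of the total number of documents $N$ divided by the number of documents that contain the term, $\mathrm{df}(t)$: $\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$. It measures how rare a word is across the entire collection: the fewer documents contain the term, the higher its IDF. Note that scikit-learn's `TfidfVectorizer`, used below, applies a smoothed variant by default, $\mathrm{idf}(t) = \ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1$, which avoids division by zero and keeps terms that occur in every document from being zeroed out.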

TF-IDF is calculated by multiplying TF and IDF together: $\text{tf-idf}(t, d) = \mathrm{tf}(t, d) \times \mathrm{idf}(t)$. It allows us to identify the most relevant and distinctive words in a document by giving higher weights to terms that are both frequent within the document and relatively rare across the collection.

In summary, TF-IDF is a powerful technique that helps us identify important words specific to each document by considering their frequency within the document and their distribution across the entire collection.

In [7]:
# TF-IDF vectorizer: drops English stop words and ignores terms whose document
# frequency exceeds 70% (max_df=0.7).
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Fit on the training texts only and turn them into a TF-IDF matrix.
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)

# Reuse the already-fitted vectorizer for the test texts (transform only, no refit).
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [8]:
# Linear support vector classifier with default settings.
clf = LinearSVC()

# Train on the TF-IDF features of the training split and their 0/1 labels.
# fit() is the last expression, so the fitted estimator is shown as the cell output.
clf.fit(X=X_train_vectorized, y=y_train)

LinearSVC()

In [9]:
# Mean accuracy of the trained classifier on the held-out test split.
accuracy = clf.score(X_test_vectorized, y_test)

# Show the value as the cell output; previously it was computed but never displayed.
accuracy

In [10]:
# Pick one held-out article by position (row 10 of the test split).
article_text = X_test.iat[10]

# The vectorizer expects an iterable of documents, so wrap the single text in a list
# before transforming it with the already-fitted vocabulary.
vectorized_text = vectorizer.transform(raw_documents=[article_text])

In [11]:
# Predicted class for the selected article (1 = fake, 0 = real).
clf.predict(X=vectorized_text)

array([1], dtype=int64)

In [12]:
# True label at the same test-split position (row 10), for comparison with the prediction.
y_test.iat[10]

1