# Data Exploration

In [21]:
import pandas as pd

df = pd.read_excel("fakenews.xlsx")

print(df.head())


                                            Headline     Stance  \
0  Former British Rapper Reportedly Under Investi...  unrelated   
1       US hostage Luke Somers dies after rescue bid    discuss   
2  Nicaragua asks U.S. for help investigating met...  unrelated   
3  Reports Isis fighters have contracted Ebola ar...    discuss   
4                   ISIS Beheads American Journalist  unrelated   

                                         articleBody  
0  Description: Fake news / Satire\nCirculating s...  
1  SANAA, Dec 6 (Reuters) - U.S. journalist Luke ...  
2  Nikolai Kryaglyachenko, 12, now attracts coins...  
3  Reports that Islamic State militants in Mosul ...  
4  A touching tribute to the victims of the Charl...  


In [22]:
import pandas as pd

df = pd.DataFrame({
    'Headline': ['Former British Rapper Reportedly Under Investigation', 'US hostage Luke Somers dies after rescue bid', ...],
    'Stance': ['unrelated', 'discuss', ...],
    'articleBody': ['Description: Fake news / Satire...', 'SANAA, Dec 6 (Reuters)...', ...]
})

print(df.head())


                                            Headline     Stance  \
0  Former British Rapper Reportedly Under Investi...  unrelated   
1       US hostage Luke Somers dies after rescue bid    discuss   
2                                           Ellipsis   Ellipsis   

                          articleBody  
0  Description: Fake news / Satire...  
1           SANAA, Dec 6 (Reuters)...  
2                            Ellipsis  


# Pre-processing Text Data

In [23]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    if isinstance(text, str):  
        text = re.sub(r'\W', ' ', text)  
        text = re.sub(r'\s+', ' ', text)  
        text = text.lower() 
        text = word_tokenize(text)  
        text = [word for word in text if word not in stopwords.words('english')]  
        return ' '.join(text)
    else:
        return ""  

df['cleaned_headline'] = df['Headline'].apply(preprocess_text)
df['cleaned_body'] = df['articleBody'].apply(preprocess_text)


# Handling Missing or Non-String Data in the Stance Column and Encoding Stance Labels

In [24]:
df['Stance'] = df['Stance'].apply(lambda x: x if isinstance(x, str) else 'unrelated')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['encoded_stance'] = le.fit_transform(df['Stance'])


# Combining Text Features and Applying TF-IDF Vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['combined'] = df['cleaned_headline'] + " " + df['cleaned_body']

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['combined'])

# Training and Evaluating SVM Model

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['encoded_stance'], test_size=0.2, random_state=42)

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model_svm = SVC()
model_svm.fit(X_train, y_train)

y_pred = model_svm.predict(X_test)
print("TF-IDF + SVM Accuracy:", accuracy_score(y_test, y_pred))


TF-IDF + SVM Accuracy: 1.0


In [27]:
from collections import Counter

print("Class distribution in the training set:", Counter(y_train))

Class distribution in the training set: Counter({0: 1, 1: 1})


In [28]:
X_train_simple = X_train  
y_train_simple = y_train

model_svm.fit(X_train_simple, y_train_simple)

y_pred_simple = model_svm.predict(X_train_simple)
print("Manual evaluation result:", y_pred_simple == y_train_simple)

Manual evaluation result: 1    True
2    True
Name: encoded_stance, dtype: bool


In [20]:
model_svm.fit(X_train, y_train)

y_pred = model_svm.predict(X_train)

evaluation_results = y_pred == y_train
print("Manual evaluation result:", evaluation_results)



Manual evaluation result: 1    True
2    True
Name: encoded_stance, dtype: bool
