In [None]:
import re

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix
from word_processing import Lemmatizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

In [None]:
df = pd.read_csv("../data/cleaned.csv")

In [None]:
df.head()

In [6]:
# # Initialize the CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=3001)

# # Fit the vectorizer and transform the cleaned text
X = vectorizer.fit_transform(df['text'])

# # Get the feature names to understand what words are included
feature_names = vectorizer.get_feature_names_out()

exclude_indices = [i for i, word in enumerate(feature_names) if word in ['reuters']]
X = csr_matrix(np.delete(X.toarray(), exclude_indices, axis=1))

In [7]:
y = np.where(df['Type'] == 'Fake', 0, 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

## Testing accuracy

In [11]:
# Check accuracy on the test set
accuracy = svm_classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2%}")

# Predict the labels for the test set
y_pred = svm_classifier.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred, target_names=['Fake', 'Real'])
print(report)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 97.87%
              precision    recall  f1-score   support

        Fake       0.97      0.98      0.98      4733
        Real       0.98      0.97      0.98      4247

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Confusion Matrix:
 [[4662   71]
 [ 120 4127]]


## Training accuracy

In [12]:
# Check accuracy on the test set
test_accuracy = svm_classifier.score(X_train, y_train)
print(f"Accuracy: {test_accuracy:.2%}")

# Predict the labels for the test set
y_pred = svm_classifier.predict(X_train)

# Generate a classification report
report = classification_report(y_train, y_pred, target_names=['Fake', 'Real'])
print(report)

# Print confusion matrix
conf_matrix = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 100.00%
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00     18748
        Real       1.00      1.00      1.00     17170

    accuracy                           1.00     35918
   macro avg       1.00      1.00      1.00     35918
weighted avg       1.00      1.00      1.00     35918

Confusion Matrix:
 [[18748     0]
 [    1 17169]]


In [13]:
coefficients = svm_classifier.coef_.toarray().flatten()  # Flatten to a 1D array for easier access
# Sort the coefficients by their absolute values
top_indices = np.argsort(coefficients)[50:]  # Get the indices of the top 10 words

# Get the corresponding words (features) for those top indices
top_words = feature_names[top_indices]

# Get the corresponding coefficients (weights) for those words
top_weights = coefficients[top_indices]

# Display the top words and their corresponding weights
for word, weight in zip(top_words, top_weights):
    print(f"Word: {word}, Weight: {weight}")

Word: friend, Weight: -0.5916580191364373
Word: taxpayer, Weight: -0.5870444857895719
Word: didn, Weight: -0.5841330387074372
Word: foster, Weight: -0.584016414395181
Word: chaos, Weight: -0.5792751418078012
Word: train, Weight: -0.5754901485731667
Word: campus, Weight: -0.5738164824920291
Word: oh, Weight: -0.5654337141088184
Word: massive, Weight: -0.5610873818830896
Word: gonna, Weight: -0.5584000796385048
Word: washington, Weight: -0.5578841859825592
Word: ed, Weight: -0.5525509133168657
Word: disaster, Weight: -0.5489486542104971
Word: pointed, Weight: -0.5486957553798677
Word: requested, Weight: -0.5462854775404125
Word: funded, Weight: -0.5432733116316673
Word: state, Weight: -0.5431598181392745
Word: removed, Weight: -0.5412970795654108
Word: crazy, Weight: -0.5407760537169036
Word: reached, Weight: -0.5381058318549405
Word: roy, Weight: -0.5341309829607584
Word: offensive, Weight: -0.5334827219564013
Word: suggests, Weight: -0.5328915683480057
Word: aggressive, Weight: -0.5299

In [14]:
df_1 = pd.read_csv('../data/evaluation.csv', sep=';')
df_2 = pd.read_csv('../data/test.csv', sep=';')
df_3 = pd.read_csv('../data/train.csv', sep=';')

new_df = pd.concat([df_1, df_2, df_3], ignore_index=True)
new_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1
1,1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1
2,2,Oregon Cop Convicted Of Shattering Biker’s Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0


In [164]:
# new_df['cleaned_text'] = new_df['text'].apply(clean_text)
# new_df['cleaned_title'] = new_df['title'].apply(clean_text)
# new_df['truncated_text'] = new_df['cleaned_text'].apply(lambda x: ' '.join(x.split()[:25]))

In [15]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1
1,1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1
2,2,Oregon Cop Convicted Of Shattering Biker’s Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0


In [16]:
new_y = new_df['label']

In [19]:
# # Fit the vectorizer and transform the cleaned text
vectorizer = CountVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(new_df['text'])

# # Get the feature names to understand what words are included
feature_names = vectorizer.get_feature_names_out()

print(X.shape)

(40587, 3000)


## Model performance on the other dataset

In [20]:
X, _, y, _ = train_test_split(X, new_y, test_size=0.01, random_state=42)
print(X.shape)
print(y.shape)
# Check accuracy on the test set
test_accuracy = svm_classifier.score(X, y)
print(f"Accuracy: {test_accuracy:.2%}")

# Predict the labels for the test set
y_pred = svm_classifier.predict(X)

# Generate a classification report
report = classification_report(y, y_pred, target_names=['Fake', 'Real'])
print(report)

# Print confusion matrix
conf_matrix = confusion_matrix(y, y_pred)
print("Confusion Matrix:\n", conf_matrix)

(40181, 3000)
(40181,)
Accuracy: 52.54%
              precision    recall  f1-score   support

        Fake       0.49      0.88      0.63     18468
        Real       0.68      0.23      0.34     21713

    accuracy                           0.53     40181
   macro avg       0.59      0.55      0.49     40181
weighted avg       0.59      0.53      0.47     40181

Confusion Matrix:
 [[16163  2305]
 [16764  4949]]
