<a href="https://colab.research.google.com/github/ashlyn-viereck/DataScience_Capstone/blob/main/DataScienceProj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
import string

# Imports for data loading, text preprocessing, and the TF-IDF + Naive Bayes baseline

In [None]:

# Load the dataset and combine each article's title and body text into one field.
df = pd.read_csv("WELFake_Dataset.csv", on_bad_lines='skip')

print(df.head())
# info() prints its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()
print(df.iloc[0])

# Some rows have a missing title or text. astype(str) alone would turn those NaN
# values into the literal word "nan", which would then reach the vectorizer as a
# real token, so missing values are replaced with an empty string first.
df['combined'] = df['title'].fillna('').astype(str) + " " + df['text'].fillna('').astype(str)


   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null

In [None]:
# The stop-word corpus is a separate download from the nltk package itself;
# fetch it so this cell also runs on a fresh kernel (stopwords.words() raises
# LookupError when the corpus is not installed).
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
# preprocess_text strips punctuation before filtering, so contractions such as
# "don't" reach the filter as "dont". Add the punctuation-free spelling of every
# stop word so those forms are removed as well.
stop_words |= {
    ''.join(ch for ch in word if ch not in string.punctuation)
    for word in stop_words
}

# Translation table that deletes every character in string.punctuation; built
# once so it is not recomputed for each document.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Lowercase ``text``, remove ASCII punctuation, and drop stop words.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stop_words`` set is used, so existing
            one-argument calls behave as before.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # str.translate strips the punctuation in a single pass instead of testing
    # every character against string.punctuation in a Python-level loop.
    cleaned = text.lower().translate(_PUNCT_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in stopword_set)

# Preprocess every combined title+text string: lowercase it, strip punctuation,
# and remove stop words.
df['clean_text'] = df['combined'].map(preprocess_text)


In [None]:
# Split into training and testing sets: 20% is held out, the split is stratified
# on the label column, and the seed is fixed so the split is repeatable.
clean_docs, doc_labels = df['clean_text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    clean_docs,
    doc_labels,
    stratify=doc_labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
# TF-IDF vectorization over unigrams and bigrams, capped at 5000 features.
# The vectorizer is fitted on the training split only and then reused to
# encode the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Train the Naive Bayes model on the TF-IDF features of the training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Evaluation: predict on the held-out test split, then report accuracy,
# per-class metrics, and the confusion matrix.
y_pred = nb_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)
nb_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", nb_accuracy)
print("\nClassification Report:\n", nb_report)
print("\nConfusion Matrix:\n", nb_confusion)


Accuracy: 0.8566576557842933

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      7006
           1       0.85      0.88      0.86      7421

    accuracy                           0.86     14427
   macro avg       0.86      0.86      0.86     14427
weighted avg       0.86      0.86      0.86     14427


Confusion Matrix:
 [[5860 1146]
 [ 922 6499]]


LSTM model

Possible extension: add a “retrieval / evidence-based” component.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# Reload the raw dataset for the LSTM pipeline.
df = pd.read_csv("WELFake_Dataset.csv")

# Build the model input from title + text. Missing values are replaced with ""
# first; astype(str) alone would turn NaN into the literal word "nan" and feed
# it to the tokenizer as a real token.
df["content"] = df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)

# Drop rows without a label and rows where both title and text are missing.
# The check is made on the source columns because "content" is always a string
# after the concatenation above and therefore can never be NaN.
df = df.dropna(subset=["label"]).dropna(subset=["title", "text"], how="all")

texts = df["content"].astype(str)
labels = df["label"]

In [None]:
# Fit a LabelEncoder on the labels and replace them with their encoded form;
# the fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(labels)
labels = le.transform(labels)

In [None]:
# Hold out 20% of the data for testing with a fixed seed. The split is
# stratified on the labels, matching the Naive Bayes split earlier in the
# notebook, so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
# Vocabulary size limit for the tokenizer and fixed length of every padded sequence.
max_words, max_len = 20000, 300

# The tokenizer is fitted on the training texts only; "<OOV>" is its
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)


def _encode_and_pad(docs):
    """Return ``(sequences, padded)`` for ``docs`` using the fitted tokenizer.

    ``sequences`` is the output of ``texts_to_sequences``; ``padded`` is the
    same data passed through ``pad_sequences`` with ``maxlen=max_len`` and
    padding added at the end.
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return sequences, pad_sequences(sequences, maxlen=max_len, padding="post")


X_train_seq, X_train_pad = _encode_and_pad(X_train)
X_test_seq, X_test_pad = _encode_and_pad(X_test)

In [None]:
# LSTM classifier: embedding -> LSTM -> dropout -> dense -> dropout -> sigmoid output.
model = Sequential([
    # mask_zero=True marks index 0 (the value pad_sequences fills with) as
    # padding, so the LSTM skips the post-padded positions instead of running
    # over them before producing its final state.
    # The `input_length` argument is dropped: Keras 3 deprecates it and the
    # layer takes the sequence length from the data it receives.
    Embedding(input_dim=max_words, output_dim=100, mask_zero=True),
    # return_sequences=False: only the last LSTM output is passed on.
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.3),
    # Single sigmoid unit for the binary label.
    Dense(1, activation="sigmoid")
])



In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer created with
# 1e-3, and accuracy tracked as the metric.
optimizer = Adam(1e-3)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 4 epochs with batches of 64; validation_split=0.2 sets aside part
# of the training data for the per-epoch validation metrics.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=4,
    validation_split=0.2,
    verbose=1,
)


Epoch 1/4
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m541s[0m 743ms/step - accuracy: 0.7472 - loss: 0.4672 - val_accuracy: 0.9215 - val_loss: 0.2204
Epoch 2/4
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 742ms/step - accuracy: 0.8776 - loss: 0.2741 - val_accuracy: 0.8063 - val_loss: 0.3458
Epoch 3/4
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m559s[0m 737ms/step - accuracy: 0.8714 - loss: 0.2962 - val_accuracy: 0.9438 - val_loss: 0.1796
Epoch 4/4
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 735ms/step - accuracy: 0.9605 - loss: 0.1292 - val_accuracy: 0.9546 - val_loss: 0.1332


In [None]:
# Evaluate the trained model on the held-out test set and report its accuracy.
test_metrics = model.evaluate(x=X_test_pad, y=y_test, verbose=1)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy:.4f}")