In [12]:
import pandas as pd

df = pd.read_csv('WELFake_Dataset.csv')
print(df.head())
print(df.columns)
print(df.isnull().sum())
print(df['label'].value_counts())  # 0 = Real, 1 = Fake


   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')
Unnamed: 0      0
title         558
text           39
label           0
dtype: int64
label
1    37106
0    35028
Name: count, dtype: int64


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [15]:
def clean_text(text):
    if not isinstance(text, str):
        return ""  # or return np.nan if you want to keep it null
    text = text.lower()  # Lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove [text]
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\@w+|\#', '', text)  # @mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Punctuation
    text = re.sub(r'\d+', '', text)  # Numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Whitespace
    return text


In [16]:
df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')  # Combine and handle NaNs
df['text'] = df['text'].apply(clean_text)


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X = tfidf.fit_transform(df['text'])
y = df['label']


In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [51]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [52]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = (model.predict_proba(X_test)[:, 1] > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9575795383655645
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      7089
           1       0.95      0.97      0.96      7338

    accuracy                           0.96     14427
   macro avg       0.96      0.96      0.96     14427
weighted avg       0.96      0.96      0.96     14427



In [56]:


def predict_news(text):
    cleaned = clean_text(text)
    vector = tfidf.transform([cleaned])
    prob = model.predict_proba(vector)[0][1]
    pred = 1 if prob > 0.5 else 0
    return "Fake News" if pred == 1 else "Real News"


In [57]:
import numpy as np
unique, counts = np.unique(model.predict(X_test), return_counts=True)
print(dict(zip(unique, counts)))


{0: 6981, 1: 7446}


In [58]:
example_text = """
Breaking: A new law has been passed that allows the use of cloned animals in military operations, says government report.
"""
print(predict_news(example_text))


Fake News


In [69]:
text = """
NASA has successfully deployed the James Webb Space Telescope into its final orbit, marking a new era in space exploration and astrophysics.
"""


print(predict_news(text))


Fake News
