In [1]:
import pandas as pd

fake = pd.read_csv("data/raw/Fake.csv")
true = pd.read_csv("data/raw/True.csv")

# Add labels: 1 = Fake, 0 = Real
fake["label"] = 1
true["label"] = 0

# Combine them
data = pd.concat([fake, true], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

# Save unified dataset
data.to_csv("data/raw/fake_news.csv", index=False)

print("✅ Combined dataset saved as data/raw/fake_news.csv")
print(data.head())


✅ Combined dataset saved as data/raw/fake_news.csv
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  

In [3]:
import pandas as pd

# For Fake News
df1 = pd.read_csv('data/raw/fake_news.csv')
print(df1.columns)
print(df1.head())

# For SMS Spam (tab-separated)
df2 = pd.read_csv('data/raw/SMSSpamCollection', sep='\t', names=['label', 'text'])
print(df2.head())


Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 

In [None]:
Step-by-step — Convert SMSSpamCollection text file → usable CSV

In [4]:
import pandas as pd

# Read tab-separated text file (no headers)
sms = pd.read_csv(
    "data/raw/SMSSpamCollection",
    sep='\t',              # tab-separated
    names=['label', 'text'],  # add proper column names
    encoding='latin1'       # prevents Unicode errors
)

print(sms.head())
print(sms['label'].value_counts())


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


In [5]:
sms.to_csv("data/raw/sms_spam.csv", index=False)
print("✅ Saved as data/raw/sms_spam.csv")


✅ Saved as data/raw/sms_spam.csv


In [6]:
sms = pd.read_csv("data/raw/sms_spam.csv")
sms['label'] = sms['label'].map({'ham': 0, 'spam': 1})
sms.to_csv("data/raw/sms_spam_unified.csv", index=False)
print("✅ Converted to numeric labels (0=ham, 1=spam)")


✅ Converted to numeric labels (0=ham, 1=spam)


In [7]:
print(sms.head())
print(sms['label'].value_counts())


   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label
0    4825
1     747
Name: count, dtype: int64


In [None]:
Quick check — confirm both processed raw files exist

In [8]:
import os, pandas as pd

print("Files in data\\raw:", os.listdir("data\\raw"))
print("\nPreview fake_news.csv:")
print(pd.read_csv("data\\raw\\fake_news.csv").head().iloc[:, :5])

print("\nPreview sms file:")
# try common names
for name in ["sms_spam.csv","SMSSpamCollection","sms_spam_unified.csv","sms_spam_clean.csv"]:
    p = os.path.join("data","raw", name)
    if os.path.exists(p):
        print("Reading", p)
        try:
            sms = pd.read_csv(p, encoding='latin1', sep=None, engine='python')
        except Exception:
            sms = pd.read_csv(p, encoding='latin1')
        print(sms.head().iloc[:, :2])
        break
else:
    print("No sms file found in data\\raw. Please check file name.")


Files in data\raw: ['Fake.csv', 'fake_news.csv', 'sms+spam+collection', 'sms_spam.csv', 'sms_spam_unified.csv', 'True.csv']

Preview fake_news.csv:
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0 