In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pickle

# Read the training data; the CSV must provide a 'text' column and a 'Class' column.
data = pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Django_try1\Fake_news_dataset.csv")
X, y = data['text'], data['Class']  # Class encoding: 1 = Fake, 0 = Real

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features (max_features=5000) feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=5000)),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

# Persist the two fitted steps as separate pickles: the classifier goes to
# model.pkl first, then the vectorizer goes to vectorizer.pkl.
for step_name, pickle_path in (('classifier', 'model.pkl'), ('vectorizer', 'vectorizer.pkl')):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pipeline.named_steps[step_name], pickle_file)


# Fake news detection

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.pipeline import Pipeline

In [3]:
# # Python program to convert .tsv file to .csv file
# # importing pandas library
# import pandas as pd 

# tsv_file=r"C:\Users\nihar\Downloads\train.tsv"

# # reading given tsv file
# csv_table=pd.read_table(tsv_file,sep='\t')

# # converting tsv file into csv
# csv_table.to_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\Fake_news_new.csv",index=False)

# # output
# print("Successfully made csv file")


# Label
Fake = 1, Real = 0

In [4]:
# First source: drop the metadata columns and tag every row with label 1 (fake).
df1 = (
    pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\Fake_news.csv")
    .drop(columns=['title', 'subject', 'date'])
    .assign(label=1)
)
# Missing-value count per remaining column.
df1.isna().sum()

text     0
label    0
dtype: int64

In [5]:

# Second source: drop the metadata columns and tag every row with label 0 (real).
df2 = (
    pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\true_news.csv")
    .drop(columns=['title', 'subject', 'date'])
    .assign(label=0)
)
# Missing-value count per remaining column.
df2.isna().sum()

text     0
label    0
dtype: int64

In [6]:
# Third source: drop the index/metadata columns, then swap the labels so that
# rows marked 1 in this file become 0 and every other value becomes 1.
# NOTE(review): presumably this file uses the opposite label convention — confirm.
df3 = (
    pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\Fake_news_new.csv")
    .drop(columns=['Unnamed: 0', 'title', 'subject', 'date'])
)
df3["label"] = df3["label"].map(lambda value: 1 if value != 1 else 0)
df3.head()

Unnamed: 0,text,label
0,Former CIA director John Brennan on Friday cri...,0
1,How did this man come to OWN this store? There...,1
2,President Donald Trump on Thursday tapped Fede...,0
3,Hillary Clinton ally David Brock is offering t...,1
4,Pleading ignorance is a perfect ploy for Nancy...,1


In [7]:
# Fourth source: the statement column becomes "text" and the "Label" column is
# mapped onto the notebook's convention (0 = real, 1 = fake).
df4 = (
    pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\train.csv")
    .rename(columns={"Statement": "text"})
)
# When a CSV column contains only TRUE/FALSE, pandas.read_csv parses it as
# booleans, and a boolean never equals the string "TRUE" — every row would then
# be labelled 1. Normalising through str() handles both the boolean and the
# string representation.
df4["label"] = df4["Label"].apply(
    lambda x: 0 if str(x).strip().upper() == "TRUE" else 1
)
df4 = df4.drop("Label", axis=1)
df4.head()

Unnamed: 0,text,label
0,Says the Annies List political group supports ...,1
1,When did the decline of coal start? It started...,1
2,"Hillary Clinton agrees with John McCain ""by vo...",1
3,Health care reform legislation is likely to ma...,1
4,The economic turnaround started at the end of ...,1


In [8]:
# Fifth source: keep only the text and label columns and encode the label as
# 1 for "FAKE" and 0 for anything else.
# The frame is built in one chain with .assign() (which returns a new frame)
# rather than assigning a column on a column-subset selection, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
df5 = (
    pd.read_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\news.csv")
    [["text", "label"]]
    .assign(label=lambda frame: frame["label"].apply(lambda x: 1 if x == "FAKE" else 0))
)
df5.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,U.S. Secretary of State John F. Kerry said Mon...,0
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,It's primary day in New York and front-runners...,0


In [9]:
# Stack the five labelled sources row-wise into one frame with a fresh 0..n-1 index.
source_frames = (df1, df2, df3, df4, df5)
df_final = pd.concat(source_frames, ignore_index=True)
df_final.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [10]:
# Dimensions of the combined frame as (rows, columns).
df_final.shape

(91473, 2)

In [11]:
# Number of rows per label value, to see how balanced the two classes are.
df_final["label"].value_counts()

label
1    52363
0    39110
Name: count, dtype: int64

In [17]:
# Display the combined frame (the notebook renders a truncated preview).
# NOTE(review): the recorded output looks already shuffled and the execution
# counter jumps here, which suggests this cell last ran after the shuffle cell —
# confirm with a fresh Restart & Run All.
df_final

Unnamed: 0,text,label
0,UNITED NATIONS (Reuters) - The United Nations ...,0
1,21st Century Wire says If you re going to dra...,1
2,One has to wonder why in the world people woul...,1
3,The U.S. is the great enemy of mankind! rave...,1
4,Goldman Sachs Group Inc banker James Donovan w...,0
...,...,...
91468,"Share on Twitter \nFor Robin Roberts, losing h...",1
91469,Maury is perhaps one of the trashiest shows on...,1
91470,MOSCOW (Reuters) - Russia s Foreign Ministry s...,0
91471,SANTIAGO (Reuters) - Billionaire conservative ...,0


In [18]:
# Shuffle all rows and renumber them 0..n-1.
# A fixed random_state makes the shuffle (and therefore the saved CSV)
# reproducible across runs; 42 matches the seed used for the train/test split.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an "index" column and then dropping that column.
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Persist the shuffled dataset without writing the index as a column.
df_final.to_csv(r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\Fake_news_dataset_final.csv",index=False)
df_final.head()

Unnamed: 0,text,label
0,Just like Hillary Stephanopoulos claims it was...,1
1,"For the first time, nearly all of the countrie...",0
2,"Some 20,000 Delphi salaried retirees lost up t...",1
3,How did we ever get to the point where we woul...,1
4,21st Century Wire says Several people have bee...,1


In [20]:
# Reload the saved dataset from disk, then print a five-row preview followed by
# the frame's (rows, columns) shape.
df = pd.read_csv(
    r"C:\Users\nihar\Desktop\8th Sem\Final Project\Datasets\Fake_news_dataset_final.csv"
)
print(df.head(), df.shape, sep="\n")

                                                text  label
0  Just like Hillary Stephanopoulos claims it was...      1
1  For the first time, nearly all of the countrie...      0
2  Some 20,000 Delphi salaried retirees lost up t...      1
3  How did we ever get to the point where we woul...      1
4  21st Century Wire says Several people have bee...      1
(91473, 2)


In [21]:
import re
import string
# Ordered cleaning rules used by ``preprocess``: (compiled pattern, replacement).
# Order matters: URLs and HTML tags are removed while their punctuation is still
# present, and newlines are turned into spaces *before* digit-bearing words are
# dropped so that words on adjacent lines are never glued together.
_CLEANING_RULES = (
    (re.compile(r'\[.*?\]'), ''),                                # text in square brackets
    (re.compile(r'https?://\S+|www\.\S+'), ''),                  # URLs
    (re.compile(r'<.*?>+'), ''),                                 # HTML tags
    (re.compile(r'[%s]' % re.escape(string.punctuation)), ''),   # ASCII punctuation
    (re.compile(r'\n'), ' '),                                    # newlines -> word separator
    (re.compile(r'\w*\d\w*'), ''),                               # words containing digits
    (re.compile(r'\W'), ' '),                                    # any other non-word character
)


def preprocess(text):
    """Normalise a raw news text for vectorisation.

    The text is lower-cased, then stripped of bracketed spans, URLs, HTML tags,
    ASCII punctuation and words containing digits; newlines and any remaining
    non-word characters become single spaces. Runs of spaces are not collapsed.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a float NaN for
            a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    return text

# Clean every article: run preprocess over each value of the text column and
# store the result back in the same column.
df["text"] = df["text"].map(preprocess)
