## Import Libraries

In [25]:
import nltk
import re
import os
import string
import numpy as np  #algebra, matrix, linear array
import pandas as pd   #data manipulation, processing and visualization
from nltk.corpus import stopwords #stopwords:and, the, in, to,...
from nltk.stem import PorterStemmer  #lemmatization, Stemming
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as pyl #for data plotting
from sklearn import linear_model

## Read CSV Files

In [3]:
train_news = pd.read_csv('train.csv')

In [4]:
train_news.head()

Unnamed: 0,label,news
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...


In [5]:
test_news = pd.read_csv('test.csv')

In [6]:
test_news.head()

Unnamed: 0,label,news
0,True,Building a wall on the U.S.-Mexico border will...
1,False,Wisconsin is on pace to double the number of l...
2,False,Says John McCain has done nothing to help the ...
3,True,Suzanne Bonamici supports a plan that will cut...
4,False,When asked by a reporter whether hes at the ce...


In [7]:
print(train_news.shape, test_news.shape)

(10240, 2) (1267, 2)


In [8]:
train_news.isnull().sum()

label    0
news     0
dtype: int64

In [9]:
test_news.isnull().sum()

label    0
news     0
dtype: int64

In [10]:
train_news['news'][0]

'Says the Annies List political group supports third-trimester abortions on demand.'

In [11]:
test_news['news'][0]

'Building a wall on the U.S.-Mexico border will take literally years.'

## Prepairing News Content

In [36]:
# Path to the uploaded english stopwords file
stopwords_path = 'english'  # Make sure this matches the uploaded file name

# Load stopwords into a list
with open(stopwords_path, 'r') as file:
    english_stopwords = [line.strip() for line in file]

# Example function to clean sentences using the manually loaded stopwords
def clean_sentence(sentence):
    words = sentence.lower().split()
    cleaned_words = [word for word in words if word not in english_stopwords]
    return ' '.join(cleaned_words)

# Test the cleaning function
sentence = "This is a simple test sentence with some common stopwords."
cleaned_sentence = clean_sentence(sentence)
print(cleaned_sentence)


this is a simple test sentence with some common stopwords.


In [42]:
def clean_news(news):
    ps = PorterStemmer()
    cleaned_content = re.sub('[^a-zA-Z]', " ", news)
    cleaned_content = cleaned_content.lower()
    cleaned_content = cleaned_content.split()
    cleaned_content = [ps.stem(w) for w in cleaned_content if w not in english_stopwords]
    cleaned_content = ' '.join(cleaned_content)
    return cleaned_content


In [43]:
train_news['news'] = train_news['news'].apply(clean_news)

In [44]:
train_news['news']  #lower case and puntuation removed

0        say the anni list polit group support third tr...
1        when did the declin of coal start it start whe...
2        hillari clinton agre with john mccain by vote ...
3        health care reform legisl is like to mandat fr...
4        the econom turnaround start at the end of my term
                               ...                        
10235    there are a larger number of shark attack in f...
10236    democrat have now becom the parti of the atlan...
10237    say an altern to social secur that oper in gal...
10238    on lift the u s cuban embargo and allow travel...
10239    the depart of veteran affair ha a manual out t...
Name: news, Length: 10240, dtype: object

In [45]:
test_news['news'] = test_news['news'].apply(clean_news)

In [46]:
test_news['news']  #lower case and punctuation removed

0       build a wall on the u s mexico border will tak...
1       wisconsin is on pace to doubl the number of la...
2            say john mccain ha done noth to help the vet
3       suzann bonamici support a plan that will cut c...
4       when ask by a report whether he at the center ...
                              ...                        
1262    say hi budget provid the highest state fund le...
1263                       ive been here almost everi day
1264    in the earli s sen edward kennedi secretli off...
1265    say an epa permit languish under strickland bu...
1266    say the governor is go around the state talk a...
Name: news, Length: 1267, dtype: object

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tdf = TfidfVectorizer()
Tdf.fit(train_news['news'].values)
x = Tdf.transform(train_news['news'].values)
print(x)

  (0, 10702)	0.40886715072244245
  (0, 10443)	0.2747911994579363
  (0, 10396)	0.06790002498787034
  (0, 10116)	0.2672572426804768
  (0, 9084)	0.11061621231128999
  (0, 7785)	0.2847781345973369
  (0, 7137)	0.13423366374675696
  (0, 6049)	0.3217766731744173
  (0, 4531)	0.29183292077682504
  (0, 2695)	0.33992352847055973
  (0, 461)	0.4270140060185848
  (0, 32)	0.2886723855101126
  (1, 11337)	0.30077717220099753
  (1, 10545)	0.18460823864108933
  (1, 10518)	0.07293424484193078
  (1, 10396)	0.053617915259152965
  (1, 10388)	0.10044556018652273
  (1, 9833)	0.45919411103681057
  (1, 9832)	0.24392689633455358
  (1, 7963)	0.13824235681857364
  (1, 7083)	0.194307313196089
  (1, 7081)	0.07063147834248598
  (1, 6827)	0.25409408019687973
  (1, 5464)	0.12966492702492755
  (1, 5101)	0.06818003910680401
  :	:
  (10239, 11561)	0.16006008675951924
  (10239, 11554)	0.25713551472325497
  (10239, 11063)	0.3687899577513985
  (10239, 11025)	0.23405156762608334
  (10239, 10518)	0.12327538425774356
  (10239, 1

In [20]:
Tdf.fit(test_news['news'].values)
y= Tdf.transform(test_news['news'].values)
print(y)

  (0, 4128)	0.23077271195210955
  (0, 4075)	0.2511990812472193
  (0, 4001)	0.3610861472959094
  (0, 3714)	0.09242464296519347
  (0, 3642)	0.3300752296011
  (0, 2546)	0.18305099777478026
  (0, 2304)	0.3857008974389433
  (0, 2140)	0.3857008974389433
  (0, 469)	0.4277800907422546
  (0, 420)	0.34890394238837386
  (1, 4126)	0.2356320355776743
  (1, 4085)	0.297673075134241
  (1, 3765)	0.12387435380251637
  (1, 3736)	0.24776095660563588
  (1, 3714)	0.0927329768362421
  (1, 2618)	0.453904055831189
  (1, 2546)	0.18366166632520645
  (1, 2517)	0.12123142437736349
  (1, 2499)	0.3353787328844903
  (1, 2070)	0.453904055831189
  (1, 1904)	0.16759795190823457
  (1, 1083)	0.41168448394327556
  (2, 3947)	0.49224226811532495
  (2, 3765)	0.13433718446382087
  (2, 3714)	0.10056550555242068
  :	:
  (1265, 153)	0.16645686200406767
  (1266, 4030)	0.11998041340181102
  (1266, 3714)	0.1020235209077682
  (1266, 3710)	0.09407380025003333
  (1266, 3664)	0.1260893060722549
  (1266, 3651)	0.2264647485066791
  (1266,

In [52]:
model = linear_model.LogisticRegression()
x_label = train_news['label'].values
model.fit(x, x_label)

In [53]:
train_prediction = model.predict(x)
print("train accuracy:", accuracy_score(train_prediction, x_label))

train accuracy: 0.78076171875


In [54]:
model = linear_model.LogisticRegression()
y_label = test_news['label'].values
model.fit(y, y_label)

In [55]:
test_prediction = model.predict(y)
print("test accuracy:", accuracy_score(test_prediction,y_label))

test accuracy: 0.8808208366219415


In [57]:
input_data = y[10]
prediction = model.predict(input_data)
if prediction[0]==1:
    print("Fake News")
else:
    print("real news")

real news
