# About Dataset:

         id: unique id for a news article

         title: the title of a news article

         author: author of the news article

         text: the text of the article; could be incomplete.

         label: a label that marks the article as potentially unreliable.

                     

### 1: unreliable

### 0: reliable

# 📚 Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer ## root of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import warnings
#----------------------------------------------------#
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import nltk
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Fetch the NLTK stop-word corpus that stopwords.words(...) reads later on.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Printing the stopwords in English 

In [2]:
# Show the English stop-word list that the stemming step filters out.
print(stopwords.words('english'))   # another corpus language (e.g. 'arabic') can be passed instead

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# 🛠️ Data Preprocessing

## 📂Read Data

In [3]:
# Load the training CSV and use its `id` column as the row index.
news_dataset = pd.read_csv("/kaggle/input/fake-news/train.csv",index_col="id")

## 📊 Display first 5 rows

In [4]:
# Peek at the first rows to see the available columns (title, author, text, label).
news_dataset.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## 📊 Display last 5 rows

In [5]:
# Peek at the last rows as well, to check the file was read through to the end.
news_dataset.tail()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1
20799,What Keeps the F-35 Alive,David Swanson,"David Swanson is an author, activist, journa...",1


## 📊 Display random 5 rows

In [6]:
# Show five randomly chosen rows. The seed is fixed (same value as the train/test
# split) so the displayed rows stay the same on every Restart & Run All.
news_dataset.sample(5, random_state=11)

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8587,"Last Call at Pulse Nightclub, and Then Shots R...",Marc Santora,It was nearing last call. The music was still ...,0
10844,‘We Are Dead Either Way’: Agonizing Choices fo...,Anne Barnard and Hwaida Saad,"BEIRUT, Lebanon — He had long been one of t...",0
12994,Saudi Press : US Blew Up World Trade Center To...,Starkman,The Saudi press is still furious over the U.S....,1
2427,Carbon Tax Reversal in Washington State,Consortiumnews.com,Carbon Tax Reversal in Washington State Novemb...,1
16359,Why Are Russia and China Buying Up All of Amer...,Dave Hodges,Previous Why Are Russia and China Buying Up Al...,1


## 📊 display the shape of dataset

In [7]:
# Report (rows, columns); `id` is the index, so it is not counted as a column.
print("the shape of Dataset is: ",news_dataset.shape)

the shape of Dataset is:  (20800, 4)


## 🔍 Summary statistics

In [8]:
# Summary statistics; only the numeric `label` column appears in the result.
news_dataset.describe()

Unnamed: 0,label
count,20800.0
mean,0.500625
std,0.500012
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [9]:
# Class balance check: number of articles per label (1 = unreliable, 0 = reliable).
news_dataset["label"].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

## 📄 Display information about the dataset

In [10]:
# Column dtypes and non-null counts; shows which text columns contain missing values.
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 812.5+ KB


In [11]:
# Data-quality totals before cleaning.
# isnull() is pandas' alias of isna(), so the first two lines report the same number.
print("The Number of Null in DataFrame is: ",news_dataset.isnull().sum().sum())
print("The Number of NAN in DataFrame is: ",news_dataset.isna().sum().sum())
# duplicated() yields one boolean per row, so the first .sum() is already the total;
# the second .sum() leaves that scalar unchanged.
print("The Number of duplicated in DataFrame is: ",news_dataset.duplicated().sum().sum())

The Number of Null in DataFrame is:  2554
The Number of NAN in DataFrame is:  2554
The Number of duplicated in DataFrame is:  109


## 🧹 Handling missing values & duplicates

In [12]:
# Missing values broken down per column.
news_dataset.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [13]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later without producing NaN.
news_dataset = news_dataset.fillna("")

In [14]:
# Remove fully duplicated rows. Reassigning the result (rather than inplace=True)
# keeps the step explicit and avoids mutating the frame object in place.
news_dataset = news_dataset.drop_duplicates()


In [15]:
# Same totals as before cleaning, re-run to confirm that no missing values or
# duplicated rows remain.
print("The Number of Null in DataFrame is: ",news_dataset.isnull().sum().sum())
print("The Number of NAN in DataFrame is: ",news_dataset.isna().sum().sum())
print("The Number of duplicated in DataFrame is: ",news_dataset.duplicated().sum().sum())

The Number of Null in DataFrame is:  0
The Number of NAN in DataFrame is:  0
The Number of duplicated in DataFrame is:  0


## 🔗 Merging the author name and news title

In [16]:
# Build the model input text as "<author> <title>". Rows whose author or title was
# filled with "" get a stray leading/trailing space, which the later split() discards.
news_dataset['content']=news_dataset["author"]+" "+news_dataset["title"]

In [17]:
# Inspect the merged author + title column.
news_dataset['content']

id
0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20691, dtype: object

## ✂️ Separating the data & label


In [18]:
# Features: the merged author + title text. Target: the reliability label.
X = news_dataset['content']
Y = news_dataset["label"]

## 🌱 Stemming:
    the process of reducing a word to its root form

In [19]:
# Single shared Porter stemmer instance, used by stemming().
stem = PorterStemmer()

In [20]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop words as a frozenset, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def stemming(content):
    """Normalize a piece of text into a space-joined string of stemmed tokens.

    Steps: replace every non-letter with a space, lowercase, split on whitespace,
    drop English stop words, then stem each remaining word with the shared
    ``stem`` stemmer.

    Parameters
    ----------
    content : str
        Raw text to normalize.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if nothing remains).
    """
    # The stop-word list is loaded once and kept as a set: the previous version
    # re-read the corpus and scanned it as a list for every single token.
    stop_words = _english_stop_words()
    tokens = re.sub('[^a-zA-Z]', " ", content).lower().split()
    return " ".join(stem.stem(word) for word in tokens if word not in stop_words)

In [21]:
# Stem from the source column rather than from X itself, so this cell gives the
# same result when re-run (even after X has been replaced by the TF-IDF matrix).
X = news_dataset['content'].apply(stemming)

## 🧮 Convert the textual data to numerical data

In [22]:
# Learn the TF-IDF vocabulary and weights, then encode every document as a sparse row.
# NOTE(review): the vectorizer is fitted on ALL rows before the train/test split,
# so IDF statistics from the future test rows influence the training features.
# Consider splitting the text first and fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [23]:
# Print the sparse TF-IDF matrix: its shape, stored-element count and leading entries.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210058 stored elements and shape (20691, 17128)>
  Coords	Values
  (0, 15686)	0.2842613521824368
  (0, 2483)	0.37143377648420245
  (0, 7692)	0.24756568840535628
  (0, 8630)	0.292252968889811
  (0, 2959)	0.24630456343690857
  (0, 13473)	0.2560366974552351
  (0, 4973)	0.23279389989400243
  (0, 267)	0.26996236127837187
  (0, 3792)	0.26996236127837187
  (0, 7005)	0.2182373518819105
  (0, 8909)	0.36290591552919604
  (0, 3600)	0.35920821107482964
  (1, 1894)	0.15504096987648
  (1, 2223)	0.38267687425953795
  (1, 16799)	0.3006177334151608
  (1, 1497)	0.29420172611711876
  (1, 2813)	0.1913928977262574
  (1, 6816)	0.19078637825470168
  (1, 5503)	0.7141918254203379
  (1, 3568)	0.26361787372618617
  (2, 5389)	0.3869199553865786
  (2, 5968)	0.34753819003893194
  (2, 9620)	0.4934771106665822
  (2, 15611)	0.41536181362013164
  (2, 2943)	0.3178384515035302
  :	:
  (20688, 1287)	0.3354273280782726
  (20688, 13122)	0.24823556285386694
  (206

## 🔀 Splitting the data set into training and test data

In [24]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the label ratio the
# same in both parts, and the fixed seed makes the split reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,stratify=Y,random_state=11)

# 🤖 Training the Models

In [25]:
def train(model, model_name):
    """Fit ``model`` on the training split and print its train/test scores.

    Uses the notebook-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    None
        The two scores are printed as percentages with two decimals.
    """
    model.fit(X_train, Y_train)
    # Score on the data the model has seen first, then on the held-out rows.
    train_pct = model.score(X_train, Y_train) * 100
    test_pct = model.score(X_test, Y_test) * 100
    report = (
        f"{model_name} model score on Training data: {train_pct:0.02f}%\n"
        f"{model_name} model score on Testing data: {test_pct:0.02f}%"
    )
    print(report)

In [26]:
# Baseline: logistic regression with scikit-learn's default settings.
log_model = LogisticRegression()
train(log_model, "Logistic Regression")

Logistic Regression model score on Training data: 98.64%
Logistic Regression model score on Testing data: 97.99%


In [27]:
# Support-vector classifier with scikit-learn's default settings.
svm_model = SVC()
train(svm_model, 'SVM_classifier')

SVM_classifier model score on Training data: 99.90%
SVM_classifier model score on Testing data: 99.18%


In [28]:
# Decision tree. The seed is fixed (same value as the train/test split) so the
# reported scores are reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=11)
train(dt_model, "Decision_Tree")

Decision_Tree model score on Training data: 100.00%
Decision_Tree model score on Testing data: 99.32%


In [29]:
# Random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed (same value as the train/test split) to make the scores reproducible.
rf_model = RandomForestClassifier(random_state=11)
train(rf_model, "Random_Forest")

Random_Forest model score on Training data: 100.00%
Random_Forest model score on Testing data: 99.28%


In [30]:
# K-nearest neighbours with scikit-learn's default settings.
# NOTE(review): the recorded scores (~53% train / ~52% test) are close to chance on
# this balanced dataset, unlike every other model here. Presumably the default
# distance metric suits these sparse TF-IDF features poorly; verify before
# drawing conclusions from this result.
knn = KNeighborsClassifier()
train(knn, "KNN")

KNN model score on Training data: 53.38%
KNN model score on Testing data: 52.21%


# Thank you 🎉 

# Made By : Ali Osama ✍️