# Importing dependencies

In [30]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [31]:
# !pip install nltk

In [32]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

## Text Preprocessing Libraries

- **`re`**: Used for regular expressions (e.g., cleaning text, removing symbols).
- **`stopwords` (from `nltk.corpus`)**: Removes common, less meaningful words like "is", "the", "and".
- **`PorterStemmer` (from `nltk.stem.porter`)**: Stems words to their base/root form (e.g., "running" → "run").
- **`TfidfVectorizer` (from `sklearn.feature_extraction.text`)**: Converts text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).


In [33]:
# The stopword corpus has to be fetched before stopwords.words() can be used;
# the call reports "already up-to-date" when the package is present locally.
import nltk  # Imports the Natural Language Toolkit library
nltk.download('stopwords')  # Downloads the list of common stopwords (e.g., "the", "is", "and") used in text preprocessing

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# Inspection only: print the full list of English stopwords provided by NLTK
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Importing Data

In [35]:
# Load the dataset from train.csv (path is relative to the working directory) into a pandas DataFrame
df=pd.read_csv("train.csv")

# Data Eyeballing & Data Preprocessing

In [36]:
# Dimensions of the raw data as (rows, columns), before any cleaning
df.shape

(20800, 5)

In [37]:
# Preview the first five rows to see what each column holds
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


## About the Dataset

1. **`id`**: Unique identifier for a news article.
2. **`title`**: Title of the news article.
3. **`author`**: Author of the news article.
4. **`text`**: Main content of the article (may be incomplete).
5. **`label`**: Indicates whether the news article is real or fake:
   - `1`: Fake News  
   - `0`: Real News


In [38]:
# Look at six random rows; the fixed seed keeps the displayed rows identical
# across "Restart Kernel -> Run All" so the notebook output is reproducible.
df.sample(6, random_state=42)

Unnamed: 0,id,title,author,text,label
3938,3938,"Torn Ballet Shoes, and a Life Upended - The Ne...",Tim Arango,ISTANBUL — For the first time in half a yea...,0
10285,10285,"At Penn Station, Rail Mishap Spurs Large and L...",Emma G. Fitzsimmons,It was a relatively minor mishap — several ...,0
14457,14457,"‘Today, He Acted Like a Politician’: Voters’ R...","Lizette Alvarez, Jess Bidgood, Mitch Smith and...",President Trump’s address to a joint session o...,0
9893,9893,Angelina Jolie’s Father Speaks Out Against Ill...,Sean Adl-Tabatabai,Sean Adl-Tabatabai in Entertainment // 0 Com...,1
6564,6564,Donald Trump Starts Summer Push With Crippling...,Nicholas Confessore and Rachel Shorey,Donald J. Trump enters the general election ca...,0
6866,6866,"These Products Make Men Grow Breasts, Get Canc...",Dikran Arakelian (noreply@blogger.com),"Owned by Unilever, the Axe brand includes a ra...",1


In [39]:
# Per-column tally of missing (NaN) entries in the raw data
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [40]:
# Handle missing values: drop every row that has a NaN in any column (mutates df in place)
df.dropna(inplace=True)
# Alternative: df.fillna('', inplace=True) keeps those rows and replaces every NaN with an empty string.
# Whether to use dropna or fillna is chosen according to the data available.

In [41]:
# Verify that no column has missing values left after the drop
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [42]:
# Dimensions after the rows with missing values were dropped
df.shape

(18285, 5)

In [43]:
# Column dtypes and non-null counts after dropping incomplete rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18285 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      18285 non-null  int64 
 1   title   18285 non-null  object
 2   author  18285 non-null  object
 3   text    18285 non-null  object
 4   label   18285 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 857.1+ KB


In [44]:
# Build the 'content' feature: author name, a single space, then the news title
df['content'] = df['author'].str.cat(df['title'], sep=' ')

In [45]:
# Display the newly created 'content' column
df['content']

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 18285, dtype: object

In [46]:
# Preview the first rows again to check the new 'content' column
df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1,Howard Portnoy Iranian woman jailed for fictio...


### Separating Predictors and Target

In [47]:
# Target vector: the 'label' column; feature frame: every remaining column
y = df['label']
X = df.drop(columns=['label'])

In [48]:
# Sanity check: X and y must have the same number of rows
print(X.shape, y.shape)

(18285, 5) (18285,)


In [50]:
# Confirm df now carries the extra 'content' column and that it has no nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18285 entries, 0 to 20799
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       18285 non-null  int64 
 1   title    18285 non-null  object
 2   author   18285 non-null  object
 3   text     18285 non-null  object
 4   label    18285 non-null  int64 
 5   content  18285 non-null  object
dtypes: int64(2), object(4)
memory usage: 1000.0+ KB


***

___

## Stemming in NLP

- **Stemming** is the process of reducing a word to its **root/base form**.
- It helps group related words with similar meanings by stripping suffixes.
- Useful in text preprocessing to normalize words for analysis.

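### Example 1:
- `acting`, `acted`, `acts` → `act`

### Example 2:
- `running`, `runs`, `run` → `run`

> **Note:** a stemmer only strips suffixes by rule. Irregular forms such as `ran` are left unchanged (mapping `ran` to `run` requires lemmatization), and the Porter stemmer used below keeps `actor` and `actress` as they are.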


---


In [56]:
# Create one PorterStemmer instance up front; stemming() reuses it for every word
port_stem = PorterStemmer()

In [57]:
# Define the stemming function to process the content
# Matches every character that is not an ASCII letter; compiled once and reused for every row
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')
# Filled on first use so the NLTK stopword list is built once instead of once per word
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only on the first call."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content, stop_words=None, stemmer=None):
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. replace every non-letter character (digits, punctuation, ...) with a space,
      2. lowercase the text,
      3. split it on whitespace into words,
      4. drop the words found in ``stop_words``,
      5. stem each remaining word,
      6. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean (here: one entry of the ``content`` column).
    stop_words : collection of str, optional
        Lowercase words to discard before stemming. Defaults to NLTK's English
        stopword list, which is loaded once and cached as a frozenset.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to each kept word. Defaults to the notebook-level
        ``port_stem`` instance.

    Returns
    -------
    str
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    if stemmer is None:
        stemmer = port_stem

    # Non-letters become spaces (not removed) so adjacent words are not glued together
    words = _NON_ALPHA_RE.sub(' ', content).lower().split()

    # Stopwords are matched against the lowercased, un-stemmed word
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
 

##### Apply the stemming() function to each entry in the `content` column

In [None]:
# Overwrite 'content' with its cleaned, stemmed form, one entry at a time
df['content'] = df['content'].map(stemming)