# Importing dependencies

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# !pip install nltk

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

## Text Preprocessing Libraries

- **`re`**: Used for regular expressions (e.g., cleaning text, removing symbols).
- **`stopwords` (from `nltk.corpus`)**: Provides lists of common, less meaningful words like "is", "the", "and" so they can be filtered out of the text.
- **`PorterStemmer` (from `nltk.stem.porter`)**: Stems words to their base/root form (e.g., "running" → "run").
- **`TfidfVectorizer` (from `sklearn.feature_extraction.text`)**: Converts text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).


In [None]:
# Fetch NLTK's 'stopwords' corpus (common words such as "the", "is", "and"),
# which the text-preprocessing steps read via `stopwords.words(...)`.
import nltk
nltk.download('stopwords')

In [None]:
# Display the English stopword list provided by NLTK.
print(stopwords.words('english'))

# Importing Data

In [None]:
# Read the training data from train.csv into a pandas DataFrame.
df = pd.read_csv("train.csv")

# Data Eyeballing & Data Preprocessing

In [None]:
# (number of rows, number of columns) of the dataset as loaded
df.shape

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

## About the Dataset

1. **`id`**: Unique identifier for a news article.
2. **`title`**: Title of the news article.
3. **`author`**: Author of the news article.
4. **`text`**: Main content of the article (may be incomplete).
5. **`label`**: Indicates whether the news article is real or fake:
   - `1`: Fake News  
   - `0`: Real News


In [None]:
# Inspect six randomly chosen rows.
df.sample(n=6)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column; `df` is modified in place.
# NOTE(review): a single empty field discards the whole article — confirm this
# data loss is acceptable for the columns actually used downstream.
df.dropna(inplace=True)

In [None]:
# Re-count missing values per column after the dropna step.
df.isna().sum()

In [None]:
# (rows, columns) remaining after rows with missing values were dropped
df.shape