# DATASET DESCRIPTION

<b>About the Dataset:</b>

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: a label that marks whether the news article is real or fake:

    `1: Fake News`
    `0: Real News`

In [None]:
# IMPORTING the NECESSARY MODULES

import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer



In [None]:
import nltk
nltk.download('stopwords')

In [None]:
# CHECKING THE STOPWORDS
print(stopwords.words("english"))

## PRE-PROCESSING THE DATA

In [None]:
news_df = pd.read_csv('train.csv')

In [None]:
news_df.head()

In [None]:
news_df.shape

In [None]:
# CHECKING MISSING VALUES

news_df.isnull().sum()

In [None]:
# REPLACING NULL VALUES WITH EMPTY STRINGS
# (a missing author or title would otherwise make the string concatenation
# further down yield NaN instead of text)

news_df = news_df.fillna('')

In [None]:
news_df.isnull().sum()

In [None]:
# MERGING OF AUTHOR NAME AND TITLE

news_df['content'] = news_df['author']+' '+news_df['title']

In [None]:
# SEPARATION OF DATA (feature columns vs. target column)
# NOTE: X and y are both rebound further down, after stemming, from the
# 'content' and 'label' columns, so the values assigned here are provisional.
X = news_df.drop('label', axis=1)
y = news_df['label']

### STEMMING OF DATA:

##### DEFINITION:

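Stemming reduces words to a base or root form by stripping suffixes with fixed rules. For example, the Porter stemmer used below reduces "running" and "runs" to the stem "run." Because it is rule-based rather than dictionary-based, irregular forms such as "ran" are left unchanged.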

In [None]:
port_stem = PorterStemmer()

In [None]:
# Compiled once: matches every character that is not an ASCII letter.
# The upper-case range must be 'A-Z'; 'A-z' also spans the punctuation
# [ \ ] ^ _ ` that sits between 'Z' and 'a' in ASCII, which let those
# characters leak into the tokens.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')

# Lazily-filled cache of the English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stem_data(content):
    """Normalise a piece of text into a space-separated string of stems.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. stem each remaining word with the module-level ``port_stem``.

    Args:
        content: the raw text (a ``str``) to clean.

    Returns:
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    # Looking words up in a set is O(1); the original rebuilt and scanned the
    # stop-word list once per word.
    stop_words = _english_stopwords()
    words = _NON_LETTER_RE.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in stop_words
    )
    

In [None]:
news_df['content'] = news_df['content'].apply(stem_data)

In [None]:
news_df['content']

In [None]:
X = news_df['content'].values
y = news_df['label'].values

In [None]:
# CONVERTING THE TEXTUAL DATA TO NUMERICAL DATA

# Learn the vocabulary and IDF weights from the corpus and build the TF-IDF
# matrix in a single call (equivalent to fit() followed by transform()).
# NOTE(review): the vectoriser is fitted on every row before the train/test
# split below, so the IDF weights also see the held-out rows; fit on the
# training portion only if a strictly unbiased test score is required.
vectoriser = TfidfVectorizer()
X = vectoriser.fit_transform(X)

In [None]:
print(y)

In [None]:
# TRAIN AND TEST DATA
# Hold out 20% of the rows for testing. `stratify=y` is passed so the split is
# stratified on the labels, and the fixed `random_state` makes it reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify= y, random_state=42)

In [None]:
# TRAINING THE MODEL
# Logistic regression with the library's default hyper-parameters, fitted on
# the training split only.

from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [None]:
# EVALUATION OF MODEL

from sklearn.metrics import accuracy_score

# Accuracy on the rows the model was fitted on.
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the
# conventional order stays correct if this is swapped for an asymmetric metric
# such as precision or recall.
X_train_pred = lr_model.predict(X_train)
training_data_acc = accuracy_score(y_train, X_train_pred)
training_data_acc

In [None]:
# Accuracy on the held-out split; arguments are passed as (y_true, y_pred),
# the order the metric functions expect.
X_test_pred = lr_model.predict(X_test)
test_data_acc = accuracy_score(y_test, X_test_pred)
test_data_acc

### EVALUATING 'test.csv' DATA

In [None]:
test_df = pd.read_csv('test.csv')

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
test_df.isnull().sum()

In [None]:
# PREPROCESSING THE TEST DATA

In [None]:
# Fill missing values with an empty string, the same placeholder that is used
# for the training frame, so both frames are pre-processed identically.
test_df = test_df.fillna('')

In [None]:
test_df['content'] = test_df['author']+ ' ' + test_df['title']

In [None]:
test_df['content']

In [None]:
test_df['content'] = test_df['content'].apply(stem_data)


In [None]:
# Vectorise the stemmed test text with the already-fitted vectoriser:
# transform() only (no fit), so the features use the vocabulary learnt earlier.
test_data_final = test_df['content'].values
test_data_final = vectoriser.transform(test_data_final)


In [None]:
# LOADING THE LABELS COLUMNS FOR TEST DATA
# NOTE(review): this assumes submit.csv holds ground-truth labels in the same
# row order as test.csv — confirm. If it is only a sample-submission file, any
# accuracy computed against it is not meaningful.
test_df_labels = pd.read_csv('submit.csv')

In [None]:
y_val_labels = test_df_labels['label'].values

In [None]:
y_vals_final_pred = lr_model.predict(test_data_final)


In [None]:
# Accuracy of the predictions for test.csv against the labels read from
# submit.csv (reference labels first, predictions second).
test_df_acc_score = accuracy_score(y_val_labels, y_vals_final_pred)