In [1]:
!pip install pandas




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# import matplotlib.pyplot as plt
import pandas as pd

---
### Load Dataset
---

In [3]:
df = pd.read_csv('News _dataset\\fake_real_indian_data.csv')

---
### EDA
---

In [4]:
df.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [5]:
df.describe()

Unnamed: 0,label,text
count,3729,3721
unique,2,2229
top,FAKE,Highest IPL score in Dubai\n\nMilestones to wa...
freq,1877,79


In [8]:
df['target'] = df['label'].apply(lambda x: 1 if x == 'REAL' else 0)


---
1. Data shape
---

In [9]:
print(f'The number of columns: = {df.shape[1]}\n The number of rows: = {df.shape[0]}')

The number of columns: = 3
 The number of rows: = 3729


---
### Check duplicates
---

In [10]:
df.duplicated().sum()

np.int64(1498)

---
### drop duplicates
---

In [11]:
df = df.drop_duplicates()

In [12]:
df.duplicated().sum()

np.int64(0)

---
### drop blanks columns
---

In [13]:
# Identify the rows with blank values
blanks = df.isnull().any(axis=1)

# Drop the rows with blank values
df.drop(df[blanks].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[blanks].index, inplace=True)


In [14]:
print(f'The number of columns: = {df.shape[1]}\n The number of rows: = {df.shape[0]}')

The number of columns: = 3
 The number of rows: = 2229


In [15]:
df['target'].value_counts()

target
0    1851
1     378
Name: count, dtype: int64

In [17]:
df.head()

Unnamed: 0,label,text,target
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...,1
1,FAKE,A four-minute-long video of a woman criticisin...,0
2,FAKE,"Republic Poll, a fake Twitter account imitatin...",0
3,REAL,"Delhi teen finds place on UN green list, turns...",1
4,REAL,Delhi: A high-level meeting underway at reside...,1


---
## **Needed only columns one is**
* text
* target
---

In [18]:
df = df[['text', 'target']]

In [19]:
df.head(20)

Unnamed: 0,text,target
0,Payal has accused filmmaker Anurag Kashyap of ...,1
1,A four-minute-long video of a woman criticisin...,0
2,"Republic Poll, a fake Twitter account imitatin...",0
3,"Delhi teen finds place on UN green list, turns...",1
4,Delhi: A high-level meeting underway at reside...,1
5,ROME: Novak Djokovic knows it isn't model beha...,1
6,A viral image showing controversial Islamic pr...,0
7,Several photos are being shared with the misle...,0
8,The driver of the DMU train Arvind Kumar has N...,0
9,An old video of a Dassault Rafale aircraft's s...,0


In [21]:
df['text'][10]

'Ahmedabad: Crime branch sleuths on Friday held a drug peddler Shabbir Shaikh of Jamalpur. He used to sell drugs brought in by Sehjadhussain Tejabwala and Imran Ajmeri from Mumbai. Their supplier, Afaq Bawa was caught by crime branch on Thursday from the border of Maharashtra and Karnataka.On September 12, crime branch had arrested five persons. Three, including an ASI, Firozekhan Nagori, and two others, Mohammed Arif Kazi and Imran Padhiyar were held first from near CTM Crossroads with 995 grams of mephedrone worth Rs 1 crore. The other two, Sehjadhussain Tejabwala and Imran Ajmeri were nabbed from Mumbai. Tejabwala and Ajmeri had bought the mephedrone from Bawa and sent it through Nagori, Kazi and Padhiyar in a car to Ahmedabad.TNN'

In [22]:
df['text'][12]

'Ludhiana: The drain coverage projects have remained a battleground for leaders of the BJP and Congress in central constituency, where from 2004 till 2017 Ganda Nullah coverage project from Gurdwara Dukhniwaran Sahib to Shagun Palace made leaders indulge in credit war. Now, coverage of 1.40 km-long another seasonal drain has started yet another war of words between both parties. Congress MLA from central constituency Surinder Dawar had inaugurated the project on Friday and this led to politics over the project.BJP ex-councillor Inder Aggarwal, whose wife is councillor from ward number 57, rued that the MLA ignored the councillor of the ward at the inauguration and they were not invited for the same. He said if they were in power they would have invited Dawar for the function. However, BJP district president Pushpinder Singal demanded an FIR against Dawar for violating the Covid norms and gathering a large number of people at the spot of inauguration. He asked the Congress leadership wh

---
### **Text Preprocessing**
---

## 3. Data Preprocessing
* Lower case
* Tokenization
* Removing special characters
* Removing stop words and punctuation
* Stemming

---
### Lowercase
---

In [23]:
df['text']=df['text'].str.lower()

In [24]:
df.head()

Unnamed: 0,text,target
0,payal has accused filmmaker anurag kashyap of ...,1
1,a four-minute-long video of a woman criticisin...,0
2,"republic poll, a fake twitter account imitatin...",0
3,"delhi teen finds place on un green list, turns...",1
4,delhi: a high-level meeting underway at reside...,1


---
## Remove Punctuation
---

In [25]:
import string
def remove_pun(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [26]:
df['text'] = df['text'].apply(remove_pun)

In [27]:
df.head()

Unnamed: 0,text,target
0,payal has accused filmmaker anurag kashyap of ...,1
1,a fourminutelong video of a woman criticising ...,0
2,republic poll a fake twitter account imitating...,0
3,delhi teen finds place on un green list turns ...,1
4,delhi a highlevel meeting underway at residenc...,1


---
## remove stopwords
---

In [30]:
import nltk
from nltk.corpus import stopwords

In [31]:
def remove_stopwords(text):
    # Download the stopwords corpus if it hasn't been downloaded already
    nltk.download('stopwords', quiet=True)

    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))

    # Split the text into words
    words = text.split()

    # Remove the stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Join the filtered words back into a single string
    filtered_text = ' '.join(filtered_words)

    return filtered_text

In [32]:
df['text'] = df['text'].apply(remove_stopwords)

In [33]:
df.head()

Unnamed: 0,text,target
0,payal accused filmmaker anurag kashyap behavin...,1
1,fourminutelong video woman criticising governm...,0
2,republic poll fake twitter account imitating a...,0
3,delhi teen finds place un green list turns gla...,1
4,delhi highlevel meeting underway residence raj...,1


---
## **Tokenization**
---

In [34]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

def data_processing(text):
    # Remove mentions and hashtags
    text = re.sub(r'\@w+|\#', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    text_tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = [w for w in text_tokens if w.lower() not in stop_words]

    return " ".join(filtered_text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\owais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
df['text'] = df['text'].apply(data_processing)

In [36]:
df.head()

Unnamed: 0,text,target
0,payal accused filmmaker anurag kashyap behavin...,1
1,fourminutelong video woman criticising governm...,0
2,republic poll fake twitter account imitating a...,0
3,delhi teen finds place un green list turns gla...,1
4,delhi highlevel meeting underway residence raj...,1


In [37]:
df['text'][0]

'payal accused filmmaker anurag kashyap behaving inappropriately video went viral maintained stance speaking etimes said wanted speak long time today finally thought must get head tweeted incident sometime ago metoo movement happened many people told delete tweet else would stop getting work manager advised remove tweet complied post anurag blocked whatsapp'

---
### Stemmer
---

In [39]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [40]:
df['text'] = df['text'].apply(stemming)

In [41]:
df.head()

Unnamed: 0,text,target
0,payal accused filmmaker anurag kashyap behavin...,1
1,fourminutelong video woman criticising governm...,0
2,republic poll fake twitter account imitating a...,0
3,delhi teen finds place un green list turns gla...,1
4,delhi highlevel meeting underway residence raj...,1


---
## **spliting into x and y**
---

In [42]:
X = df['text']
Y = df['target']

---
## **TfidfVectorizer**
---

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
X = vect.fit_transform(df['text'])

---
## **train_test_split**
---

In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [45]:
print("Size of x_train: ", (x_train.shape))
print("Size of y_train: ", (y_train.shape))
print("Size of x_test: ", (x_test.shape))
print("Size of y_test: ", (y_test.shape))

Size of x_train:  (1560, 47611)
Size of y_train:  (1560,)
Size of x_test:  (669, 47611)
Size of y_test:  (669,)


In [46]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

svc = LinearSVC()
svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)

print("Test accuracy: {:.2f}%".format(svc_acc * 100))


Test accuracy: 99.55%


In [47]:
df['text'][3]

'delhi teen finds place un green list turns glass sand new delhi udit singhal founded startup turn glass bottles sand named among 17 people united nations 2020 class young leaders sustainable development goals sdgs programme flagship initiative recognises efforts young people driving action galvanising others support uns 2030 agenda sustainable developmentsinghal 18yearold lives near mandi house told toi feel proud represent india along 16 others ill work motivate youths achieve sdgs hope able encourage communities inculcate better civic sense create sustainable living spaces teenager attended un function online due covid pandemic normal scenario would gone new york addedsinghal founded glass2sand 2019 zerowaste system turns bottles economically viable sand 2018 found pile bottles home research learnt ragpickers interested collecting glass bottles little demand high transportation cost storage space empty glass bottles often ended landfills know takes million years glass bottle decompo

In [48]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, svc_pred))
print("\n")
print(classification_report(y_test, svc_pred))

[[553   0]
 [  3 113]]


              precision    recall  f1-score   support

           0       0.99      1.00      1.00       553
           1       1.00      0.97      0.99       116

    accuracy                           1.00       669
   macro avg       1.00      0.99      0.99       669
weighted avg       1.00      1.00      1.00       669



In [49]:
df.head()

Unnamed: 0,text,target
0,payal accused filmmaker anurag kashyap behavin...,1
1,fourminutelong video woman criticising governm...,0
2,republic poll fake twitter account imitating a...,0
3,delhi teen finds place un green list turns gla...,1
4,delhi highlevel meeting underway residence raj...,1


In [57]:
df['text'][2]

'republic poll fake twitter account imitating arnab goswamiled republic tv angering netizens controversial polls twitter users including journalists mistake channels official account fake account republicpoll uses logo similar republic tvs logo mention bio whether related republic tv fan account twitters rules require fan account parody account indicate also readdid nita ambani ask support caa siddharth varadarajan founding editor wirein deleted tweet shared screenshot poll fake account claiming poll run nationalistic media house hilarious despite desperate phrasing question poll run nationalist media house ended condemning innocent minority jnu abvp pictwittercomgqgtzfeu26 siddharth svaradarajan january 7 2020 several twitter users pointed account fake varadarajan later clarified parody account told republicpoll parody account kinda makes sense since republic tv parody news channel siddharth svaradarajan january 7 2020 nudge polling via carefully placed words like goons minority jnu d

In [50]:
# Function to predict the news type
def predict_news_type(news_text):
    news_vectorized = vect.transform([news_text])
    prediction = svc.predict(news_vectorized)
    if prediction[0] == 1:
        return "{output = Real News}"
    else:
        return "{output = Fake News}"

# Get user input
user_news = df['text'][3]
# Predict the news type
news_type = predict_news_type(user_news)
print(f"The news is classified as: {news_type}")

The news is classified as: {output = Real News}


## Saving the Model

In [54]:
import joblib

# Saving the model to a file
joblib.dump(svc, 'FND_linear_svc_model_india.pkl')

['FND_linear_svc_model_india.pkl']

In [55]:
joblib.dump(vect, 'FND_vectorizer_india.pkl')

['FND_vectorizer_india.pkl']

In [56]:
# Load the trained model and vectorizer
svcs = joblib.load('FND_linear_svc_model_india.pkl')
vects = joblib.load('FND_vectorizer_india.pkl')

# Function to predict the news type
def predict_news_type(news_text):
    news_vectorized = vects.transform([news_text])
    prediction = svcs.predict(news_vectorized)
    if prediction[0] == 1:
        return "{output = Real News}"
    else:
        return "{output = Fake News}"

# Get user input
user_news = """san francisco reuters california attorney general xavier becerra said friday prepared take whatever action takes defend obamacare mandate health insurers provide birth control trump administration moved circumvent administrations new contraception exemptions another example trump administration trampling peoples rights case women becerra told reuters becerra democratic attorneys general filed courtroom challenges trump administration policies involving healthcare immigration environment"""

# Predict the news type
news_type = predict_news_type(user_news)
print(f"The news is classified as: {news_type}")


The news is classified as: {output = Real News}
