<a href="https://colab.research.google.com/github/akhilreddy2524/Fake_News_Detection_Using_Linguistics_and_Semantic_Analysis/blob/main/SMM_Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags

In [None]:
# Directory holding the two source CSV files; change this single constant to relocate the data.
DATA_DIR = "/N/u/akvajral/Carbonate/Downloads/smm"
df1 = pd.read_csv(DATA_DIR + "/Fake.csv")
df2 = pd.read_csv(DATA_DIR + "/True.csv")
#df1, df2 = df1[:500], df2[:500]
# Add a column to indicate if the news is fake (1) or true (0)
df1['label'] = 1
df2['label'] = 0

# Concatenate the two datasets
df = pd.concat([df1, df2], ignore_index=True)
# Shuffle the rows of the dataframe. The shuffle is seeded so that the row
# order -- and therefore every seeded split made later -- is reproducible
# from one kernel restart to the next.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Fetch the NLTK 'punkt' resource (presumably what word_tokenize relies on -- verify against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /N/u/akvajral/Carbonate/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Fetch the NLTK 'stopwords' corpus, read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /N/u/akvajral/Carbonate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the NLTK 'averaged_perceptron_tagger' resource (presumably the model behind pos_tag -- verify against the installed NLTK version).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /N/u/akvajral/Carbonate/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Fetch the NLTK 'maxent_ne_chunker' resource (presumably the model behind ne_chunk -- verify against the installed NLTK version).
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /N/u/akvajral/Carbonate/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [None]:
# Fetch the NLTK 'words' corpus (presumably a dependency of the NE chunker -- TODO confirm).
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /N/u/akvajral/Carbonate/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Remove special characters and digits
# Keep a handle on the raw article text before the column is overwritten below.
df_text = df['text']
# Inner substitution drops every character that is neither a word character
# nor whitespace; the outer one then drops runs of digits.
df['text'] = df['text'].apply(
    lambda raw: re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', raw))
)

# Tokenization
# From here on each entry of df['text'] is a list of tokens, not a string.
df['text'] = df['text'].apply(word_tokenize)

In [None]:
# Remove stop words
# A set gives constant-time membership checks inside the per-token filter.
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_words]
)

# Perform part-of-speech tagging
# Each row's filtered token list is handed to pos_tag as-is.
df['pos_tags'] = df['text'].apply(pos_tag)

# Perform named entity recognition
def ne_recognition(text):
    """Chunk POS-tagged tokens with ne_chunk and flatten the result.

    `text` is passed straight to ne_chunk; in this notebook it is one row of
    df['pos_tags']. The chunk tree is converted with tree2conlltags and that
    converted value is returned.
    """
    return tree2conlltags(ne_chunk(text))

# Run named entity recognition over every row's POS-tagged tokens.
df['ne_tags'] = df['pos_tags'].apply(ne_recognition)

In [None]:
# Get n-grams
def get_ngrams(row, n=2, sep=' '):
    """Build one feature string made of the word n-grams of a row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['text']``, an iterable of string tokens.
    n : int, default 2
        Number of consecutive tokens per n-gram (must be >= 1). The default
        reproduces the original bigram behaviour.
    sep : str, default ' '
        String placed between the tokens inside a single n-gram.

    Returns
    -------
    str
        All n-grams, in order, joined by single spaces. Empty string when the
        row has fewer than ``n`` tokens.

    NOTE(review): with the default ``sep=' '`` the n-gram boundaries cannot be
    recovered from the returned string, so a vectorizer that splits on
    whitespace will see plain (repeated) unigrams. Pass e.g. ``sep='_'`` to
    keep every n-gram as one token -- confirm which behaviour is intended.
    """
    tokens = list(row['text'])
    # Sliding window: zipping the token list against its own shifted copies
    # yields every run of n consecutive tokens and stops at the shortest copy.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return ' '.join(sep.join(gram) for gram in windows)

In [None]:
# Get named entity recognition tags
def get_ner(row):
    """Return the named-entity tags of a row as one space-separated string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['ne_tags']``, the output of ``ne_recognition`` for
        that row: a sequence of ``(word, pos_tag, iob_tag)`` triples as
        produced by ``tree2conlltags``.

    Returns
    -------
    str
        The IOB named-entity tag of every token, in order, joined by spaces.
        Empty string for a row without tokens.
    """
    # Index 2 is the IOB entity tag; index 1 (used previously) is the
    # part-of-speech tag, which made the "NER" features POS features.
    return ' '.join(tag[2] for tag in row['ne_tags'])

In [None]:
# Combine n-grams and named entity recognition tags
def combine_ner_ngrams(row, n=2):
    """Build one feature string holding a row's NE tags followed by its n-grams.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['text']`` (iterable of string tokens) and
        ``row['ne_tags']`` (sequence of ``(word, pos_tag, iob_tag)`` triples
        as produced by ``tree2conlltags``).
    n : int, default 2
        Number of consecutive tokens per n-gram (must be >= 1). The default
        reproduces the original bigram behaviour.

    Returns
    -------
    str
        The IOB named-entity tags, then the word n-grams, all joined by
        single spaces.
    """
    tokens = list(row['text'])
    # Index 2 of each triple is the IOB entity tag; index 1 (used previously)
    # is the part-of-speech tag.
    features = [tag[2] for tag in row['ne_tags']]
    # Sliding window over the tokens: zip against shifted copies of the list.
    features.extend(
        ' '.join(gram) for gram in zip(*(tokens[offset:] for offset in range(n)))
    )
    return ' '.join(features)

In [None]:
# Checking with 5 true and fake news each
def checking(fea, la, lp, n_each=5):
  """Print the model's verdict for the first few fake and true samples.

  Labels follow the convention set when the data is loaded: 1 = fake news,
  0 = true news. Previously label 0 was collected under "False News" and a
  prediction of 1 was printed as "True", so both the grouping and the printed
  names were inverted.

  Parameters
  ----------
  fea : pandas.Series
      Feature strings accepted by ``lp.predict``.
  la : pandas.Series
      Ground-truth labels aligned with ``fea``.
  lp : fitted estimator / pipeline exposing ``predict``.
  n_each : int, default 5
      Maximum number of samples shown per class.

  Returns
  -------
  None. The result is printed.
  """
  fea = fea.reset_index(drop=True)
  la = la.reset_index(drop=True)
  fake_idx, true_idx = [], []
  for i in range(len(la)):
    bucket = true_idx if la[i] == 0 else fake_idx
    if len(bucket) < n_each:
      bucket.append(i)
    if len(fake_idx) == n_each and len(true_idx) == n_each:
      break

  # Predict once for the whole set instead of once per printed sample.
  preds = lp.predict(fea)

  def verdict(pred):
    # A truthy prediction (label 1) means the model says "fake".
    return "Fake" if pred else "True"

  print("False News:\n")
  print([verdict(preds[i]) for i in fake_idx], "\n\n")
  print("True News:\n")
  print([verdict(preds[i]) for i in true_idx], "\n\n")

In [None]:
 # Make predictions on validation, train, and test sets
def pred_eval(x, y, lp):
  """Predict on `x` with `lp` and print how the predictions compare to `y`.

  Prints, in this order: the accuracy, the classification report, and the
  confusion matrix followed by blank lines. Nothing is returned.
  """
  predicted = lp.predict(x)
  acc = accuracy_score(y, predicted)
  print('Accuracy:', acc)
  report = classification_report(y, predicted)
  print(report)
  matrix = confusion_matrix(y, predicted)
  print(matrix, "\n\n")

In [None]:
def split_data(data, labels, test_size=0.2, val_size=0.2, random_state=42):
    """Split `data`/`labels` into train, validation and test parts.

    `test_size` and `val_size` are both fractions of the full dataset. The
    same `random_state` seeds both splits. No stratification is applied.
    This helper is not called anywhere in the visible notebook.

    Returns (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        data, labels, test_size=test_size, random_state=random_state
    )
    # The second split only sees the non-test rows, so the validation share is
    # rescaled to stay equal to `val_size` of the whole dataset.
    val_share = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_share, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def modelling(fea,la):
  """Train and report a TF-IDF + logistic-regression model on one feature set.

  `fea` holds the feature strings and `la` the labels. 20% of the rows are
  held out for testing; 20% of the remainder is used for validation. Both
  splits are stratified and seeded with 42. Metrics are printed for the
  validation, training and test parts (in that order), followed by a spot
  check of a few test samples via `checking`.

  Returns the feature/label pair of the non-test portion (training plus
  validation rows), i.e. the data as it was before the validation split.
  """
  # First split: separate the test set from the development portion.
  X_dev, X_test, y_dev, y_test = train_test_split(
      fea, la, test_size=0.2, stratify=la, random_state=42)
  # Second split: carve the validation set out of the development portion.
  X_train, X_val, y_train, y_val = train_test_split(
      X_dev, y_dev, test_size=0.2, stratify=y_dev, random_state=42)

  # Vectorizer and classifier both use their default settings.
  lr_pipeline = Pipeline([
      ('tfidf', TfidfVectorizer()),
      ('clf', LogisticRegression())
  ])
  lr_pipeline.fit(X_train, y_train)

  # Report each part under its own heading.
  report_parts = (
      ("For Validation:\n", X_val, y_val),
      ("For Training:\n", X_train, y_train),
      ("For Testing:\n", X_test, y_test),
  )
  for heading, X_part, y_part in report_parts:
    print(heading)
    pred_eval(X_part, y_part, lr_pipeline)

  # Check with few known cases from test dataset
  checking(X_test, y_test, lr_pipeline)
  return X_dev, y_dev

In [None]:
# Build the n-gram feature string for every row, then train and evaluate on it.
# modelling() hands back the non-test portion of the data, which is kept for
# the k-fold check further down.
df['ngrams'] = df.apply(get_ngrams, axis=1)
ngramsX, ngramsY = modelling(df['ngrams'], df['label'])

For Validation:

Accuracy: 0.9898385300668151
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3427
           1       0.99      0.99      0.99      3757

    accuracy                           0.99      7184
   macro avg       0.99      0.99      0.99      7184
weighted avg       0.99      0.99      0.99      7184

[[3396   31]
 [  42 3715]] 


For Training:

Accuracy: 0.9919259413934711
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     13706
           1       0.99      0.99      0.99     15028

    accuracy                           0.99     28734
   macro avg       0.99      0.99      0.99     28734
weighted avg       0.99      0.99      0.99     28734

[[13612    94]
 [  138 14890]] 


For Testing:

Accuracy: 0.9850779510022272
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4284
           1       0.99      0.99      0

In [None]:
# Build the get_ner feature string for every row, then train and evaluate on it.
# modelling() hands back the non-test portion of the data, which is kept for
# the k-fold check further down.
df['ner'] = df.apply(get_ner, axis=1)
nerX, nerY = modelling(df['ner'], df['label'])

For Validation:

Accuracy: 0.793847438752784
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      3427
           1       0.81      0.78      0.80      3757

    accuracy                           0.79      7184
   macro avg       0.79      0.79      0.79      7184
weighted avg       0.79      0.79      0.79      7184

[[2758  669]
 [ 812 2945]] 


For Training:

Accuracy: 0.7971740794877149
              precision    recall  f1-score   support

           0       0.78      0.81      0.79     13706
           1       0.82      0.79      0.80     15028

    accuracy                           0.80     28734
   macro avg       0.80      0.80      0.80     28734
weighted avg       0.80      0.80      0.80     28734

[[11074  2632]
 [ 3196 11832]] 


For Testing:

Accuracy: 0.7914253897550111
              precision    recall  f1-score   support

           0       0.77      0.81      0.79      4284
           1       0.82      0.78      0.

In [None]:
# Build the combined feature string (combine_ner_ngrams) for every row, then
# train and evaluate on it. The returned non-test portion is kept for the
# k-fold check further down.
df['ngrams+ner'] = df.apply(combine_ner_ngrams, axis=1)
combX, combY = modelling(df['ngrams+ner'], df['label'])

For Validation:

Accuracy: 0.9887249443207127
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3427
           1       0.99      0.99      0.99      3757

    accuracy                           0.99      7184
   macro avg       0.99      0.99      0.99      7184
weighted avg       0.99      0.99      0.99      7184

[[3391   36]
 [  45 3712]] 


For Training:

Accuracy: 0.9907078722071414
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     13706
           1       0.99      0.99      0.99     15028

    accuracy                           0.99     28734
   macro avg       0.99      0.99      0.99     28734
weighted avg       0.99      0.99      0.99     28734

[[13603   103]
 [  164 14864]] 


For Testing:

Accuracy: 0.9828507795100223
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4284
           1       0.99      0.98      0

In [None]:
# Although the results are impressive, it is crucial to validate the
# model's performance because overfitting may have occurred. Overfitting
# occurs when the model becomes overly specialized to the training data,
# which compromises its ability to generalize to new, unseen data.
# To check whether the accuracies drop significantly, we use
# k-fold cross-validation.
from sklearn.model_selection import KFold

In [None]:
def kfold(X,y,k, max_iter=1000):
  """Run k-fold cross-validation of the TF-IDF + logistic-regression pipeline.

  Parameters
  ----------
  X : array-like of str
      Feature strings, one per sample.
  y : array-like
      Labels aligned with `X`.
  k : int
      Number of folds. Folds are shuffled with a fixed seed of 42.
  max_iter : int, default 1000
      Iteration budget given to LogisticRegression. The estimator's own
      default was too small for some feature sets: the recorded NER run
      stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT".

  Returns
  -------
  None. The per-fold accuracies and their mean are printed.
  """
  kf = KFold(n_splits=k, shuffle=True, random_state=42)
  # Plain arrays allow positional indexing with the fold index arrays.
  X = np.array(X)
  y = np.array(y)
  scores=[]
  for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh pipeline per fold, so nothing fitted on one fold leaks into the next.
    lr_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=max_iter))
    ])
    lr_pipeline.fit(X_train,y_train)

    # Testing the classifier
    y_pred = lr_pipeline.predict(X_test)
    scores.append(accuracy_score(y_test,y_pred))
  print("Accuracies:",scores)
  print("Average Accuracy:",np.mean(scores))

In [None]:
# Kfold for ngrams:
print("Kfold for ngrams:")
# Number of folds; the later k-fold cells reuse this same `k`.
k=10
kfold(ngramsX,ngramsY,k)

Kfold for ngrams:
Accuracies: [0.986358574610245, 0.9913697104677061, 0.9860801781737194, 0.9883073496659243, 0.986358574610245, 0.9871937639198218, 0.9894209354120267, 0.9838530066815144, 0.98635477582846, 0.9866332497911445]
Average Accuracy: 0.9871930119160808


In [None]:
# Kfold for NER:
# Relies on `k` from the n-gram k-fold cell and on nerX/nerY returned by modelling().
print("Kfold for NER:")
kfold(nerX,nerY,k)

Kfold for NER:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracies: [0.794543429844098, 0.7917594654788419, 0.7900890868596881, 0.8051224944320713, 0.8023385300668151, 0.8073496659242761, 0.7937082405345212, 0.7928730512249443, 0.7956001113895851, 0.7961570593149541]
Average Accuracy: 0.7969541135069795


In [None]:
# Kfold for ngrams + NER:
# Relies on `k` from the n-gram k-fold cell and on combX/combY returned by modelling().
print("Kfold for ngrams + NER:")
kfold(combX,combY,k)

Kfold for ngrams + NER:
Accuracies: [0.9855233853006682, 0.9891425389755011, 0.986358574610245, 0.9874721603563474, 0.984966592427617, 0.9858017817371938, 0.9888641425389755, 0.9824610244988864, 0.9855193539404066, 0.985240879977722]
Average Accuracy: 0.9861350434363562


In [None]:
# After applying k-fold cross-validation, we observed no significant
# decrease in accuracy: the average accuracy values were comparable to the
# initial results. We therefore conclude that the model did not experience
# overfitting.

In [None]:
# TextBlob supplies the sentiment polarity used by the sentiment cell; the recorded run installed version 0.17.1.
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 6.4 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [None]:
# Define a function to calculate sentiment polarity using TextBlob
from textblob import TextBlob
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`.

    Parameters
    ----------
    text : str or iterable of str
        Either a raw string or a sequence of tokens. Tokens are joined with
        single spaces before analysis.

    Returns
    -------
    The ``sentiment.polarity`` value computed by TextBlob.
    """
    if not isinstance(text, str):
        # After the tokenisation cell has run, df['text'] holds token lists,
        # while TextBlob only accepts a string. Joining keeps this function
        # usable when the notebook is executed top to bottom.
        text = ' '.join(text)
    return TextBlob(text).sentiment.polarity
# Perform sentiment analysis and create a new 'sentiment' column
df['sentiment'] = df['text'].apply(get_sentiment)
# Split the dataset into training and testing sets
# reshape(-1, 1) turns the single sentiment column into the 2-D feature matrix
# the estimator is fitted on. Unlike modelling(), these splits are not stratified.
X_train, X_test, y_train, y_test = train_test_split(df['sentiment'].values.reshape(-1, 1), df['label'], test_size=0.2, random_state=42)
# Carve a validation set (20% of the remaining rows) out of the training data.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Train a logistic regression model on the training set
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the model
# (the same test accuracy is printed again by the pred_eval call at the end of this cell)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


# Evaluate the model
# Full reports for the validation, training and test parts, in that order.
print("For Validation:\n")
pred_eval(X_val, y_val, model)
print("For Training:\n")
pred_eval(X_train, y_train, model)
print("For Testing:\n")
pred_eval(X_test, y_test, model)

Accuracy: 0.5103563474387528
For Validation:

Accuracy: 0.5083518930957683
              precision    recall  f1-score   support

           0       0.31      0.01      0.02      3499
           1       0.51      0.98      0.67      3685

    accuracy                           0.51      7184
   macro avg       0.41      0.50      0.34      7184
weighted avg       0.42      0.51      0.35      7184

[[  28 3471]
 [  61 3624]] 


For Training:

Accuracy: 0.5258926707036959
              precision    recall  f1-score   support

           0       0.39      0.01      0.02     13542
           1       0.53      0.98      0.69     15192

    accuracy                           0.53     28734
   macro avg       0.46      0.50      0.35     28734
weighted avg       0.46      0.53      0.37     28734

[[  149 13393]
 [  230 14962]] 


For Testing:

Accuracy: 0.5103563474387528
              precision    recall  f1-score   support

           0       0.41      0.01      0.02      4376
           

In [None]:
df['sentiment']

0        0.082149
1       -0.078947
2        0.055931
3        0.082727
4        0.130252
           ...   
44893   -0.159386
44894    0.072899
44895   -0.976562
44896    0.053472
44897   -0.262879
Name: sentiment, Length: 44898, dtype: float64