Installations

In [None]:
# Notebook dependencies, installed into the running kernel's environment via %pip.
%pip install spacy
%pip install nltk
# swig is pinned to 3.0.6 and installed before jamspell
# (presumably jamspell's build needs it -- TODO confirm).
%pip install swig==3.0.6
%pip install jamspell
# -q keeps pip's output quiet for this install.
%pip install -q transformers

Library and Package Imports

In [None]:
import locale
import random
import string

import jamspell
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
import xgboost as xgb
from nltk import word_tokenize,sent_tokenize,tokenize
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from spacy.matcher import Matcher
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import pipeline
from xgboost import XGBClassifier

from keras.layers import Bidirectional
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam,SGD

# Fetch every NLTK resource this notebook asks for, in the original order.
for _nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'words',
    'averaged_perceptron_tagger',
    'universal_tagset',
):
    nltk.download(_nltk_resource)

# English stop words, used by the feature helpers to drop function words.
stop_words = set(stopwords.words('english'))

# Full spaCy pipeline; get_NPV_count matches on its DEP / TAG annotations.
tokenizer = spacy.load("en_core_web_sm")

# Second spaCy pipeline without the parser, with the "senter" component enabled.
sentence_recognizer = spacy.load("en_core_web_sm", exclude=["parser"])
sentence_recognizer.enable_pipe("senter")

# JamSpell corrector used by get_NTP. LoadLangModel reports failure through its
# boolean return value, so check it instead of silently continuing without a model.
corrector = jamspell.TSpellCorrector()
if not corrector.LoadLangModel("en.bin"):
    raise FileNotFoundError("JamSpell language model 'en.bin' could not be loaded")

# Monkey-patch locale so that the preferred encoding is always reported as UTF-8.
locale.getpreferredencoding = lambda: "UTF-8"

Load the dataset - read the CSV file into a pandas DataFrame

In [None]:
# Location of the reviews CSV; replace the placeholder with the real path.
filePath = 'datasetPath'
# Read from the path defined above (the original referenced an undefined `train_path`).
df = pd.read_csv(filePath)
df.head()

Feature Engineering - Get Average Word Length of each review

In [None]:
def avg_word_len(review):
  """Return the average word length of a review and its word count.

  Only purely alphabetic tokens (as produced by ``word_tokenize``) are
  counted as words.

  Args:
    review: Review text.

  Returns:
    Tuple ``(average_length, word_count)``. When the review contains no
    alphabetic token the result is ``(0.0, 0)`` instead of raising
    ``ZeroDivisionError``.
  """
  words = [token for token in word_tokenize(review) if token.isalpha()]
  tot_no_words = len(words)
  if tot_no_words == 0:
    # Nothing to average over (empty, numeric-only or punctuation-only review).
    return 0.0, 0
  length_of_words = sum(len(word) for word in words)
  return length_of_words/tot_no_words,tot_no_words

Feature Engineering - Get Average Sentence Length of each review

In [None]:
def avg_sentence_len(review):
  """Return the average sentence length of a review and its sentence count.

  Sentences come from ``sent_tokenize``; the length of a sentence is its
  number of characters (``len`` of the sentence string), not its word count.

  Args:
    review: Review text.

  Returns:
    Tuple ``(average_length, sentence_count)``. When no sentence is found
    the result is ``(0.0, 0)`` instead of raising ``ZeroDivisionError``.
  """
  sentences = sent_tokenize(review)
  tot_sent_number = len(sentences)
  if tot_sent_number == 0:
    # Empty / whitespace-only review: there is nothing to average over.
    return 0.0, 0
  length_sent = sum(len(sentence) for sentence in sentences)
  return length_sent/tot_sent_number , tot_sent_number

Feature Engineering - POS Tagging for each review

In [None]:
def pos_tagging(review):
  """Count the verbs and adjectives among a review's content words.

  Tokens are lower-cased and stripped of punctuation; anything that is not
  purely alphabetic afterwards, or that is a stop word, is discarded. The
  remaining words are tagged with NLTK's universal tagset.

  Args:
    review: Review text.

  Returns:
    Tuple ``(verb_count, adj_count)``.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)
  content_words = []
  for token in word_tokenize(review):
    cleaned = token.lower().translate(punctuation_table)
    if cleaned.isalpha() and cleaned not in stop_words:
      content_words.append(cleaned)

  tagged = nltk.pos_tag(content_words, tagset = "universal")
  verb_count = sum(1 for _, pos in tagged if pos == 'VERB')
  adj_count = sum(1 for _, pos in tagged if pos == 'ADJ')
  return verb_count,adj_count

Feature Engineering - Content Diversity of each review

In [None]:
def content_diversity(review):
  """Return the ratio of unique content words to all content words.

  Tokens are lower-cased and stripped of punctuation; tokens that are not
  purely alphabetic afterwards, and stop words, are ignored.

  Args:
    review: Review text.

  Returns:
    ``unique_words / total_words`` as a float, or ``0.0`` when the review has
    no content words (the original raised ``ZeroDivisionError``).
  """
  table = str.maketrans('', '', string.punctuation)
  stripped = (w.lower().translate(table) for w in word_tokenize(review))
  words = [w for w in stripped if w.isalpha() and w not in stop_words]
  tot_num_words = len(words)
  if tot_num_words == 0:
    # Review consisted only of stop words, numbers or punctuation.
    return 0.0
  tot_num_unique_words = len(set(words))
  return tot_num_unique_words / tot_num_words

Feature Engineering - Get Passive Voice Count of each review

In [None]:
def get_NPV_count(review):
  """Count passive-voice constructions in a review.

  The review is parsed with the module-level spaCy pipeline ``tokenizer`` and
  matched against the token pattern: passive nominal subject, any number of
  auxiliaries, a passive auxiliary, then a past participle (``VBN``).

  Note: a later cell rebinds the global ``tokenizer`` to a BERT tokenizer, so
  this function must be called before that cell runs.

  Args:
    review: Review text.

  Returns:
    Number of matches of the passive pattern.
  """
  proc_string = tokenizer(review)
  matcher = Matcher(tokenizer.vocab)
  passive_rule = [{'DEP':'nsubjpass'},{'DEP':'aux','OP':'*'},{'DEP':'auxpass'},{'TAG':'VBN'}]
  matcher.add('Passive',[passive_rule])
  # The sentence list the original built here was never used, so it is dropped.
  return len(matcher(proc_string))

Feature Engineering - Get Number of Typos in each review

In [None]:
# Dish / cuisine words that the spell corrector would otherwise flag as typos.
indian_word_list = ['murg','tikki','gulab','paneer','bhartiya','pakoda', 'aloo', 'palak','naan','hyderabadi','murgh','kulche','korma','momos','kulchas','daal','malai']
def get_NTP(paragraph):
  """Count the misspelled content words in a paragraph.

  Tokens are lower-cased and stripped of punctuation; non-alphabetic tokens
  and stop words are ignored. A word counts as a typo when the module-level
  JamSpell ``corrector`` changes it and it is not in ``indian_word_list``.

  Args:
    paragraph: Text to check.

  Returns:
    Number of words considered misspelled.
  """
  table = str.maketrans('', '', string.punctuation)
  stripped = (w.lower().translate(table) for w in word_tokenize(paragraph))
  words = [w for w in stripped if w.isalpha() and w not in stop_words]
  misspelled_count = 0
  for word in words:
      # Whitelisted words can never count as typos, so skip the corrector call.
      if word in indian_word_list:
          continue
      if corrector.FixFragment(word) != word:
          misspelled_count += 1
  return misspelled_count

Feature Engineering - Get sentiment of each review using DistilBERT 

In [None]:
# Lazily created sentiment pipeline, shared by every get_sentiment call.
_sentiment_pipeline = None

def get_sentiment(review):
  """Return a signed sentiment score for a review.

  Each sentence is classified with the default ``"sentiment-analysis"``
  transformers pipeline; the confidence score is added for ``POSITIVE``
  sentences and subtracted for all others.

  The pipeline is created once and reused (the original rebuilt it on every
  call), and each sentence is classified once instead of twice.

  Args:
    review: Review text.

  Returns:
    Sum of the signed per-sentence scores, rounded to 5 decimals.
  """
  global _sentiment_pipeline
  if _sentiment_pipeline is None:
    _sentiment_pipeline = pipeline("sentiment-analysis")

  sentiment_score = 0
  for sentence in sent_tokenize(review):
    result = _sentiment_pipeline(sentence)[0]
    if result['label'] == "POSITIVE":
      sentiment_score += result['score']
    else:
      sentiment_score -= result['score']
  return round(sentiment_score,5)

In [None]:
# Compute every engineered feature for each review, then write whole columns
# at once. This avoids the hard-coded row count (260) and the chained
# `df[col][ind] = ...` assignments of the original, which pandas does not
# guarantee to write through to `df`.
feature_records = []
for review in df['Review']:
  awl, now = avg_word_len(review)
  asl, nst = avg_sentence_len(review)
  nvb, naj = pos_tagging(review)
  ntp = get_NTP(review)
  feature_records.append({
    'SENTIMENT_SCORE': get_sentiment(review),
    'AWL': awl,
    'NOW': now,
    'ASL': asl,
    'NST': nst,
    'NVB': nvb,
    'NAJ': naj,
    'CDV': content_diversity(review),
    'NPV': get_NPV_count(review),
    'NTP': ntp,
    # Typo ratio; guard against reviews without any counted word.
    'TPR': ntp / now if now else 0.0,
  })

features = pd.DataFrame(feature_records, index=df.index)
# SENTIMENT_SCORE is assigned first, as in the original. Columns that already
# exist in `df` keep their position; new ones are appended in this order.
for feature_name in features.columns:
  df[feature_name] = features[feature_name]

Normalize the dataset 

In [None]:
# Feature columns start at position 4; the label is the column at position 2.
feature_dataset = df.iloc[:,4:]
# NOTE(review): the scaler is fitted on the full dataset before the
# train/val/test split, so min/max statistics of held-out rows leak into training.
scaler = preprocessing.MinMaxScaler()
norm = scaler.fit_transform(feature_dataset)
# Pass the column Index itself (wrapping it in a list builds a MultiIndex) and
# keep the original row index so the concat below aligns row-for-row.
norm_df = pd.DataFrame(norm, columns=feature_dataset.columns, index=feature_dataset.index)
target_dataset = df.iloc[:,2:3]
dataframe = pd.concat([norm_df, target_dataset],axis=1)
random.seed(1010)
# Deterministic shuffle; the order is controlled by random_state.
data = shuffle(dataframe,random_state=43)
data.head()

Split the dataset in 60:20:20 

In [None]:
# First 11 columns are the normalised features; the 12th is the label.
X = data.iloc[:,:11]
# Select the label as a 1-D Series rather than a one-column frame, which is
# the shape scikit-learn estimators expect for `y`.
y = data.iloc[:,11]

# 60:20:20 split, as stated in the section header: hold out 40% first, then
# halve it into validation and test. (The original held out only 20%, which
# produced an 80:10:10 split.) Both steps are stratified on the label.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.40,stratify=y,random_state=55)
X_val,X_test,y_val,y_test = train_test_split(X_rest,y_rest,test_size = 0.50,stratify = y_rest, random_state=45)

Model - XGBoost 

In [None]:
# Base estimator for the hyper-parameter search. `n_jobs` / `random_state`
# are the scikit-learn-style parameter names, matching the final model below.
estimator = XGBClassifier(
    objective= 'binary:logistic',
    n_jobs=4,
    random_state=42
)

parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 300, 10),
    'learning_rate': [0.1, 0.01, 0.05],
    'subsample' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'f1',
    n_jobs = 10,
    cv = 10,
    verbose=True
)
# Tune on the training split only. Fitting the search on the full (X, y)
# would let validation and test rows influence the chosen hyper-parameters.
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

#Result of above hyperparameter tuning is hard-coded
xgb_model = xgb.XGBClassifier(objective="binary:logistic",n_estimators=180,subsample = 0.6,random_state=42,colsample_bytree=0.7,max_depth=8,learning_rate=0.01)
xgb_model.fit(X_train, y_train)
y_pred_train = xgb_model.predict(X_train)
y_pred_val = xgb_model.predict(X_val)
y_pred_test = xgb_model.predict(X_test)

# These are F1 scores (the original stored them in misleading `mse_*` names).
train_f1 = f1_score(y_train, y_pred_train)
val_f1 = f1_score(y_val, y_pred_val)
test_f1 = f1_score(y_test,y_pred_test)
print(train_f1)
print(val_f1)
print(test_f1)

Model - Logistic Regression

In [None]:
# Fit a liblinear logistic regression on the training split.
model = LogisticRegression(solver='liblinear',random_state = 45)
model.fit(X_train, y_train)

# Predict each split in turn: train, validation, test.
y_pred_train, y_pred_val, y_pred_test = (
    model.predict(split) for split in (X_train, X_val, X_test)
)

# F1 score per split.
train_f1 = f1_score(y_train, y_pred_train)
val_f1 = f1_score(y_val, y_pred_val)
test_f1 = f1_score(y_test,y_pred_test)

for split_f1 in (train_f1, val_f1, test_f1):
    print(split_f1)


Model - Random Forest Classifier

In [None]:
# Hyper-parameter search space for the random forest.
param_grid = {
    'n_estimators': [25, 50, 100, 150,],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
    'criterion' : ['gini', 'entropy', 'log_loss'],
}

# Seed the estimator so the search result is reproducible across runs.
grid_search = GridSearchCV(RandomForestClassifier(random_state=44),param_grid=param_grid)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

#Result of above hyperparameter tuning is hard-coded
# RandomForestClassifier.fit only takes (X, y, sample_weight); the
# `early_stopping_rounds` / `eval_set` arguments the original passed belong to
# XGBoost and raise a TypeError here.
rf_clf = RandomForestClassifier(max_depth=6, max_leaf_nodes=9, n_estimators=150, random_state=44).fit(X_train, y_train)
y_pred_train = rf_clf.predict(X_train)
y_pred_val = rf_clf.predict(X_val)
y_pred_test = rf_clf.predict(X_test)
train_f1 = f1_score(y_train, y_pred_train)
val_f1 = f1_score(y_val, y_pred_val)
test_f1 = f1_score(y_test,y_pred_test)

print(train_f1)
print(val_f1)
print(test_f1)

Model - Decision Tree Classifier

In [None]:
# Bind the classifier to the name the rest of the cell uses (the original
# assigned it to `clfs` and then referenced an undefined `decisionTree`).
decisionTree = DecisionTreeClassifier(random_state=44)
decisionTree.fit(X_train,y_train)

y_pred_train = decisionTree.predict(X_train)
y_pred_val = decisionTree.predict(X_val)
y_pred_test = decisionTree.predict(X_test)

# F1 score per split.
train_f1 = f1_score(y_train, y_pred_train)
val_f1 = f1_score(y_val, y_pred_val)
test_f1 = f1_score(y_test,y_pred_test)

print(train_f1)
print(val_f1)
print(test_f1)
print(acc_scores)

Model - Multi Layer Perceptron 

In [None]:
# Define hyperparameters and their search space
param_grid = {
    'hidden_layer_sizes': [(8,), (4,), (2,)],
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'solver': ['lbfgs', 'sgd', 'adam'],

    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate' : ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1],
}

# Seeded so that weight initialisation, and therefore the search, is reproducible.
mlp = MLPClassifier(random_state=44)

grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1, verbose=True)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

best_mlp = grid_search.best_estimator_
# Score on the test split only; X_rest is validation + test combined, which
# is not what the "Test Accuracy" label says.
test_accuracy = best_mlp.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)
# MLPClassifier exposes its weight matrices as `coefs_` (one per layer);
# it has no `coef_` attribute.
coeffs = best_mlp.coefs_

Model - Support Vector Machines

In [None]:
from sklearn import svm
from sklearn import metrics

# Search space for the SVM hyper-parameters.
param_grid = {'C': range(1,500),
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear','sigmoid']}

grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5, n_jobs=-1, verbose=True)
grid_search.fit(X_train, y_train)
# Show the outcome of the search; the original never inspected it.
print("Best Hyperparameters: ", grid_search.best_params_)

#Result of above hyperparameter tuning is hard-coded
clf = svm.SVC(C= 23, gamma= 0.1, kernel= 'linear')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_val = clf.predict(X_val)
y_pred_train = clf.predict(X_train)

print("F1 score _test:",f1_score(y_test, y_pred))
# Validation score added so this cell reports the same splits as the other models.
print("F1 score _val:",f1_score(y_val, y_pred_val))
print("F1 score _train:",f1_score(y_train, y_pred_train))
# coef_ is only available because the kernel is linear.
print(clf.coef_)

BERT + Feed Forward Network

In [None]:
# NOTE: this rebinds the global `tokenizer`, which until now held the spaCy
# pipeline used by get_NPV_count; that helper cannot be called after this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Raw review text with its label, taken from the original frame.
dataset = pd.concat([df["Review"], df["Real=1/Fake=0"]], axis=1)
# Read from `dataset`: the `data` frame built earlier holds only the normalised
# features and the label, so `data["Review"]` raises a KeyError.
texts = list(dataset["Review"])
labels = list(dataset["Real=1/Fake=0"])

In [None]:
# Tokenise every review separately, padded/truncated to 128 tokens, and keep
# the per-review id and mask tensors in two parallel lists.
encoded_reviews = [
    tokenizer(review_text, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
    for review_text in texts
]
input_ids = [encoded['input_ids'] for encoded in encoded_reviews]
attention_masks = [encoded['attention_mask'] for encoded in encoded_reviews]

In [None]:
# Stack the per-review tensors along the first dimension (torch.cat's default).
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(labels)

# Optimiser and loss for fine-tuning.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Shuffled mini-batches of (input_ids, attention_mask, label).
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)
num_batches = len(dataloader)

In [None]:
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Batch-local names are used so that the full `input_ids` tensor built
    # earlier is not overwritten by the last mini-batch.
    for batch_input_ids, batch_attention_mask, batch_labels in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean training loss over the epoch's batches.
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(dataloader)}")

# Save the fine-tuned model
model.save_pretrained('/content/gdrive/MyDrive/fine_tuned_bert_model')

In [None]:
# Tokenise all reviews as one padded batch.
encoded_data = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Switch to evaluation mode so dropout is disabled and the logits are
# deterministic; the model was left in train mode by the fine-tuning loop.
model.eval()
with torch.no_grad():
    output = model(**encoded_data)

# The original also called an undefined `bert_model` here (NameError) and
# never used the result, so that call is removed.
logits = output.logits

In [None]:
# Seed Python, NumPy and PyTorch with the same value.
seed = 45
for seed_everything in (random.seed, np.random.seed, torch.manual_seed):
    seed_everything(seed)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# 80/20 split of the BERT logits and their labels.
X_train, X_test, y_train, y_test = train_test_split(
    logits,
    labels,
    test_size=0.2,
    random_state=42,
)
epochs = 20

class SimpleClassifier(nn.Module):
    """Two-layer feed-forward classifier: Linear(input_dim, 64) -> ReLU -> Linear(64, 2)."""

    def __init__(self, input_dim):
        """Create the layers.

        Args:
            input_dim: Size of each input feature vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        """Return the two-class output of the network for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# NOTE: this rebinds `model`, replacing the fine-tuned BERT model.
model = SimpleClassifier(logits.size(1))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# as_tensor accepts both lists and tensors; the original re-wrapped y_train
# with torch.tensor() on every epoch.
train_targets = torch.as_tensor(y_train)

# Full-batch training: one optimiser step per epoch on the whole training set.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, train_targets)
    loss.backward()
    optimizer.step()

# Predict the arg-max class for both splits without tracking gradients.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    predicted_labels = torch.argmax(test_outputs, dim=1).tolist()
    train_outputs = model(X_train)
    predicted_labels_train = torch.argmax(train_outputs, dim=1).tolist()

# Label the two numbers; the original printed both as "Accuracy".
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Test accuracy: {accuracy}")
trainaccuracy = accuracy_score(y_train, predicted_labels_train)
print(f"Train accuracy: {trainaccuracy}")


LSTM

In [None]:
max_seq_length = 128
embedding_dim = 100
num_classes = 2
learning_rate = 0.0000001

# The model takes pre-computed sequence embeddings of shape
# (max_seq_length, embedding_dim). The original cell also called an undefined
# `bert_model` (and `tf`, which is never imported) to build a tensor that was
# never wired into the model; those lines are removed.
model = Sequential()
# The input shape is declared through an Input layer; `model.add` itself
# accepts no `input_shape` argument.
model.add(Input(shape=(max_seq_length, embedding_dim)))
model.add(LSTM(100))
# One softmax unit per class: a softmax over a single unit always outputs 1.0.
model.add(Dense(num_classes, activation='softmax'))
optimizer = SGD(learning_rate=learning_rate)
# Pass the configured optimizer object; the string 'SGD' would ignore
# `learning_rate`. categorical_crossentropy expects one-hot targets of width
# num_classes.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Bi-LSTM

In [None]:
max_seq_length = 128
embedding_dim = 100
num_classes = 2
learning_rate = 0.0000001

# The model takes pre-computed sequence embeddings of shape
# (max_seq_length, embedding_dim). The original cell also called an undefined
# `bert_model` (and `tf`, which is never imported) to build a tensor that was
# never wired into the model; those lines are removed.
model = Sequential()
model.add(Input(shape=(max_seq_length, embedding_dim)))
model.add(Bidirectional(LSTM(100)))
# One softmax unit per class: a softmax over a single unit always outputs 1.0.
model.add(Dense(num_classes, activation='softmax'))
optimizer = SGD(learning_rate=learning_rate)
# Pass the configured optimizer object; the string 'SGD' would ignore
# `learning_rate`. categorical_crossentropy expects one-hot targets of width
# num_classes.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()