In [5]:
#loading the dataset
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Specify the path to your TSV dataset file in Google Drive
file_path = '/content/drive/My Drive/Colab Notebooks/IST 736/deception_data_converted_final.tsv'

# Load the TSV dataset into a pandas DataFrame
df = pd.read_csv(file_path, delimiter='\t')

# Display the first few rows of the DataFrame to check if it loaded successfully
df.head(20)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,lie,sentiment,review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very..."
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,"'After I went shopping with some of my friend,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...
5,f,n,'I went to XYZ restaurant and had a terrible e...
6,f,n,'I went to ABC restaurant two days ago and I h...
7,f,n,'I went to the Chilis on Erie Blvd and had the...
8,f,n,'OMG. This restaurant is horrible. The recepti...
9,f,n,"'Yesterday, I went to a casino-restaurant call..."


Text Cleaning

In [7]:
import re

def clean_text(text):
    # Remove special characters, including numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert the text to lowercase
    text = text.lower()

    return text

# Apply the clean_text function to the 'review' column
df['cleaned_review'] = df['review'].apply(clean_text)

df.head(10)

Unnamed: 0,lie,sentiment,review,cleaned_review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very...",mikes pizza high point ny service was very slo...
1,f,n,'i really like this buffet restaurant in Marsh...,i really like this buffet restaurant in marsha...
2,f,n,"'After I went shopping with some of my friend,...",after i went shopping with some of my friend w...
3,f,n,'Olive Oil Garden was very disappointing. I ex...,olive oil garden was very disappointing i expe...
4,f,n,'The Seven Heaven restaurant was never known f...,the seven heaven restaurant was never known fo...
5,f,n,'I went to XYZ restaurant and had a terrible e...,i went to xyz restaurant and had a terrible ex...
6,f,n,'I went to ABC restaurant two days ago and I h...,i went to abc restaurant two days ago and i ha...
7,f,n,'I went to the Chilis on Erie Blvd and had the...,i went to the chilis on erie blvd and had the ...
8,f,n,'OMG. This restaurant is horrible. The recepti...,omg this restaurant is horrible the receptioni...
9,f,n,"'Yesterday, I went to a casino-restaurant call...",yesterday i went to a casinorestaurant called ...


Tokenization

In [10]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Example tokenization function
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to the 'cleaned_review' column
df['tokenized_review'] = df['cleaned_review'].apply(tokenize_text)
df.head(20)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,lie,sentiment,review,cleaned_review,tokenized_review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very...",mikes pizza high point ny service was very slo...,"[mikes, pizza, high, point, ny, service, was, ..."
1,f,n,'i really like this buffet restaurant in Marsh...,i really like this buffet restaurant in marsha...,"[i, really, like, this, buffet, restaurant, in..."
2,f,n,"'After I went shopping with some of my friend,...",after i went shopping with some of my friend w...,"[after, i, went, shopping, with, some, of, my,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...,olive oil garden was very disappointing i expe...,"[olive, oil, garden, was, very, disappointing,..."
4,f,n,'The Seven Heaven restaurant was never known f...,the seven heaven restaurant was never known fo...,"[the, seven, heaven, restaurant, was, never, k..."
5,f,n,'I went to XYZ restaurant and had a terrible e...,i went to xyz restaurant and had a terrible ex...,"[i, went, to, xyz, restaurant, and, had, a, te..."
6,f,n,'I went to ABC restaurant two days ago and I h...,i went to abc restaurant two days ago and i ha...,"[i, went, to, abc, restaurant, two, days, ago,..."
7,f,n,'I went to the Chilis on Erie Blvd and had the...,i went to the chilis on erie blvd and had the ...,"[i, went, to, the, chilis, on, erie, blvd, and..."
8,f,n,'OMG. This restaurant is horrible. The recepti...,omg this restaurant is horrible the receptioni...,"[omg, this, restaurant, is, horrible, the, rec..."
9,f,n,"'Yesterday, I went to a casino-restaurant call...",yesterday i went to a casinorestaurant called ...,"[yesterday, i, went, to, a, casinorestaurant, ..."


Lemmatization

In [11]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')  # Download the WordNet dataset

# Instantiate the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Example lemmatization function
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Apply lemmatization to the 'cleaned_review' column
df['lemmatized_review'] = df['cleaned_review'].apply(lemmatize_text)
df.head(20)

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,lie,sentiment,review,cleaned_review,tokenized_review,lemmatized_review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very...",mikes pizza high point ny service was very slo...,"[mikes, pizza, high, point, ny, service, was, ...","[mike, pizza, high, point, ny, service, wa, ve..."
1,f,n,'i really like this buffet restaurant in Marsh...,i really like this buffet restaurant in marsha...,"[i, really, like, this, buffet, restaurant, in...","[i, really, like, this, buffet, restaurant, in..."
2,f,n,"'After I went shopping with some of my friend,...",after i went shopping with some of my friend w...,"[after, i, went, shopping, with, some, of, my,...","[after, i, went, shopping, with, some, of, my,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...,olive oil garden was very disappointing i expe...,"[olive, oil, garden, was, very, disappointing,...","[olive, oil, garden, wa, very, disappointing, ..."
4,f,n,'The Seven Heaven restaurant was never known f...,the seven heaven restaurant was never known fo...,"[the, seven, heaven, restaurant, was, never, k...","[the, seven, heaven, restaurant, wa, never, kn..."
5,f,n,'I went to XYZ restaurant and had a terrible e...,i went to xyz restaurant and had a terrible ex...,"[i, went, to, xyz, restaurant, and, had, a, te...","[i, went, to, xyz, restaurant, and, had, a, te..."
6,f,n,'I went to ABC restaurant two days ago and I h...,i went to abc restaurant two days ago and i ha...,"[i, went, to, abc, restaurant, two, days, ago,...","[i, went, to, abc, restaurant, two, day, ago, ..."
7,f,n,'I went to the Chilis on Erie Blvd and had the...,i went to the chilis on erie blvd and had the ...,"[i, went, to, the, chilis, on, erie, blvd, and...","[i, went, to, the, chili, on, erie, blvd, and,..."
8,f,n,'OMG. This restaurant is horrible. The recepti...,omg this restaurant is horrible the receptioni...,"[omg, this, restaurant, is, horrible, the, rec...","[omg, this, restaurant, is, horrible, the, rec..."
9,f,n,"'Yesterday, I went to a casino-restaurant call...",yesterday i went to a casinorestaurant called ...,"[yesterday, i, went, to, a, casinorestaurant, ...","[yesterday, i, went, to, a, casinorestaurant, ..."


# MULTINOMIAL NAIVE BAYES MODEL

## Model 1: MNB Model with Tf-Idf vector for sentiment analysis

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Concatenate tokens or lemmas into a single string for vectorization
df['processed_text'] = df['tokenized_review'].apply(lambda tokens: ' '.join(tokens))
#df.head(20)

# Split the data into training and testing sets where 70% of the data is used for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['sentiment'], test_size=0.3, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Train the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9285714285714286

Classification Report:
               precision    recall  f1-score   support

           n       1.00      0.88      0.93        16
           p       0.86      1.00      0.92        12

    accuracy                           0.93        28
   macro avg       0.93      0.94      0.93        28
weighted avg       0.94      0.93      0.93        28


Confusion Matrix:
 [[14  2]
 [ 0 12]]


### Error Analysis

In [50]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 100)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (2 instances):
   Actual Predicted  \
0       n         p   
33      n         p   

                                                                                                   Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they wou...  
33  this place used to be great i cant believe its current state instead of the cool dimlylit lounge...  


In [None]:
#The incorrect predictions show that the model was gave false positives because of
#a few positive words in the reviews like 'very', 'high' and 'great'

### Feature Analysis

In [117]:
# Get the feature names from the TfidfVectorizer
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Get the log probabilities from the MNB model
log_probs_tfidf = mnb_model.feature_log_prob_

# Identify the top features for binary classification
top_feature_indices_tfidf = log_probs_tfidf[1].argsort()[-min(10, log_probs_tfidf.shape[1]):][::-1]

# Make sure the indices are within the bounds of the feature names
top_feature_indices_tfidf = [i for i in top_feature_indices_tfidf if i < len(feature_names_tfidf)]

# Get the top features
top_features_tfidf = [feature_names_tfidf[i] for i in top_feature_indices_tfidf]

# Print the top 10 features
print("Top 10 features (MNB with TF-IDF Vectorizer):", top_features_tfidf)


Top 10 features (MNB with TF-IDF Vectorizer): ['clerks', 'suitable', 'unique']


In [None]:
#these features give information regarding the context of the reviews which is used for
#sentiment classification

## Model 2: MNB Model with the count vectorizer for sentiment analysis

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# CountVectorizer with various parameters
count_vectorizer = CountVectorizer(
    lowercase=True,      # Convert all characters to lowercase
    stop_words='english', # Remove common English stopwords
    max_df=0.95,           # Ignore terms that appear in more than 95% of documents (to remove very common terms)
    min_df=2,              # Ignore terms that appear in fewer than 2 documents (to remove rare terms)
    max_features=None,    # No limit on the number of features (terms)
    ngram_range=(1, 1)     # Consider only unigrams (single words)
)

# Fit and transform the training data
X_train_count = count_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_count = count_vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_count, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_count)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8571428571428571

Classification Report:
               precision    recall  f1-score   support

           n       1.00      0.75      0.86        16
           p       0.75      1.00      0.86        12

    accuracy                           0.86        28
   macro avg       0.88      0.88      0.86        28
weighted avg       0.89      0.86      0.86        28


Confusion Matrix:
 [[12  4]
 [ 0 12]]


### Error Analysis

In [53]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (4 instances):
   Actual Predicted  \
0       n         p   
26      n         p   
33      n         p   
11      n         p   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
26  this diner was not at all up to par ive been to many diners and get eggs benedict sometimes there was nacho cheese on my eggs and a plateful of watery runny eggs and it smelled like smoke and ther...  
33  this place used to be great i cant believe its current state instead of the cool dimlylit lounge that i was used to i was in a cheap smelly bar the music has no soul the bartender is mean this pla...  
11  i once went to 

In [None]:
#Here again due to the complex nature of the sentences the model fails to classify correctly

### Feature Analysis

In [115]:
# Get the feature names from the CountVectorizer
feature_names_count = count_vectorizer.get_feature_names_out()

# Get the log probabilities from the MNB model
log_probs_count = mnb_model.feature_log_prob_

# Identify the top features for binary classification
top_feature_indices_count = log_probs_count[1].argsort()[-min(10, log_probs_count.shape[1]):][::-1]
top_features_count = [feature_names_count[i] for i in top_feature_indices_count if i < len(feature_names_count)]

# Print the top 10 features
print("Top 10 features (MNB with Count Vectorizer):", top_features_count)


Top 10 features (MNB with Count Vectorizer): ['clerks', 'suitable', 'unique']


In [None]:
#these features provide some kind of contextual information but are not that insightful

## Model 3: MNB Model using Binary vectorizer for sentiment analysis

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Binary Vectorization using CountVectorizer with binary=True
binary_vectorizer = CountVectorizer(binary=True)

# Fit and transform the training data
X_train_binary = binary_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_binary = binary_vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_binary, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_binary)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8214285714285714

Classification Report:
               precision    recall  f1-score   support

           n       0.92      0.75      0.83        16
           p       0.73      0.92      0.81        12

    accuracy                           0.82        28
   macro avg       0.83      0.83      0.82        28
weighted avg       0.84      0.82      0.82        28


Confusion Matrix:
 [[12  4]
 [ 1 11]]


### Error Analysis

In [55]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (5 instances):
   Actual Predicted  \
55      p         n   
0       n         p   
26      n         p   
33      n         p   
11      n         p   

                                                                                                                                                                                                       Text  
55  two days ago i went to the rooftop restaurant in nyc that served brunch it was one of the best brunch that i have ever had the view from the table was serene and i could see both the the hudson ri...  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
26  this diner was not at all up to par ive been to many diners and get eggs benedict sometimes there was nacho cheese on my eggs and a plateful of watery runny eggs and it smelled like smoke and ther..

In [None]:
#Again here the sentences are complex. For the 1st incorrect prediction,
#just because the author gives a subtle dissapointing experiance regarding the
#surrounding, he/she still gives a positive remark about the resturant

### Feature Analysis

In [111]:
# Get the feature names from the Binary Vectorizer
feature_names_binary = binary_vectorizer.get_feature_names_out()

# Get the log probabilities from the MNB model
log_probs_binary = mnb_model.feature_log_prob_

# Identify the top features for binary classification
top_feature_indices_binary = log_probs_binary[1].argsort()[-min(10, log_probs_binary.shape[1]):][::-1]

# Filter indices to avoid out-of-bounds error
top_feature_indices_binary = top_feature_indices_binary[top_feature_indices_binary < len(feature_names_binary)]

# Get the top features
top_features_binary = [feature_names_binary[i] for i in top_feature_indices_binary]

# Print the top 10 features
print("Top 10 features (MNB with Binary Vectorizer):", top_features_binary)


Top 10 features (MNB with Binary Vectorizer): ['clerks', 'suitable', 'unique']


In [None]:
#this does not give much insights as to how the model made decisions regarding the classification decision
#these are neutral words

## Model 4: MNB Model using Tf-Idf vector for authenticity analysis

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets where 70% of the data is used for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Train the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.5357142857142857

Classification Report:
               precision    recall  f1-score   support

           f       0.56      0.60      0.58        15
           t       0.50      0.46      0.48        13

    accuracy                           0.54        28
   macro avg       0.53      0.53      0.53        28
weighted avg       0.53      0.54      0.53        28


Confusion Matrix:
 [[9 6]
 [7 6]]


### Error Analysis

In [57]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (13 instances):
   Actual Predicted  \
0       f         t   
39      t         f   
10      f         t   
44      t         f   
35      t         f   
62      f         t   
28      t         f   
15      f         t   
68      f         t   
30      t         f   
33      t         f   
66      f         t   
69      t         f   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
39  this restaurant is quite popular recently went there with two of my friends at pm really long queue we waited for almost minutes to be seated seats were narrow it was too easy to hear clearly what...  
10  last weekend 

In [None]:
#If the tone of the review is a little strong then the review is being wrongly classified for authenticity

### Feature Analysis

In [110]:
# Get the feature names from the TF-IDF Vectorizer
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Get the log probabilities from the MNB model
log_probs_tfidf = mnb_model.feature_log_prob_

# Identify the top features for binary classification
top_feature_indices_tfidf = log_probs_tfidf[1].argsort()[-min(10, log_probs_tfidf.shape[1]):][::-1]

# Filter indices to avoid out-of-bounds error
top_feature_indices_tfidf = top_feature_indices_tfidf[top_feature_indices_tfidf < len(feature_names_tfidf)]

# Get the top features
top_features_tfidf = [feature_names_tfidf[i] for i in top_feature_indices_tfidf]

# Print the top 10 features
print("Top 10 features (MNB with TF-IDF Vectorizer):", top_features_tfidf)


Top 10 features (MNB with TF-IDF Vectorizer): ['clerks', 'suitable', 'unique']


In [None]:
#these are the most frequently occuring words in the text which are used by the model for classification of the authenticity

## Model 5: MNB Model using Count Vectorizer for authenticity analysis

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets where 70% of the data is used for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# CountVectorizer with various parameters
count_vectorizer = CountVectorizer(
    lowercase=True,      # Convert all characters to lowercase
    stop_words='english', # Remove common English stopwords
    max_df=0.95,           # Ignore terms that appear in more than 95% of documents (to remove very common terms)
    min_df=2,              # Ignore terms that appear in fewer than 2 documents (to remove rare terms)
    max_features=None,    # No limit on the number of features (terms)
    ngram_range=(1, 1)     # Consider only unigrams (single words)
)

# Fit and transform the training data
X_train_count = count_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_count = count_vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_count, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_count)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.5714285714285714

Classification Report:
               precision    recall  f1-score   support

           f       0.64      0.47      0.54        15
           t       0.53      0.69      0.60        13

    accuracy                           0.57        28
   macro avg       0.58      0.58      0.57        28
weighted avg       0.59      0.57      0.57        28


Confusion Matrix:
 [[7 8]
 [4 9]]


### Error Analysis

In [59]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (12 instances):
   Actual Predicted  \
55      f         t   
39      t         f   
67      f         t   
10      f         t   
62      f         t   
28      t         f   
49      f         t   
15      f         t   
68      f         t   
33      t         f   
66      f         t   
69      t         f   

                                                                                                                                                                                                       Text  
55  two days ago i went to the rooftop restaurant in nyc that served brunch it was one of the best brunch that i have ever had the view from the table was serene and i could see both the the hudson ri...  
39  this restaurant is quite popular recently went there with two of my friends at pm really long queue we waited for almost minutes to be seated seats were narrow it was too easy to hear clearly what...  
67  its hard to pick a favorite dining e

In [None]:
#same observation is made here. If the review is written in an exaggerated way then it is being classified
#wrongly

### Feature Analysis

In [103]:
# Get the feature names from the CountVectorizer
feature_names_count = count_vectorizer.get_feature_names_out()

# Get the log probabilities from the MNB model
log_probs_count = mnb_model.feature_log_prob_



# Identify the top features for binary classification
top_feature_indices_count = log_probs_count[1].argsort()[-min(10, log_probs_count.shape[1]):][::-1]

# Ensure that the indices are within bounds
top_feature_indices_count = [idx for idx in top_feature_indices_count if idx < len(feature_names_count)]

# Retrieve the top features
top_features_count = [feature_names_count[i] for i in top_feature_indices_count]

# Print the top 10 features
print("Top 10 features (MNB with Count Vectorizer):", top_features_count)




Top 10 features (MNB with Count Vectorizer): ['clerks', 'suitable', 'unique']


In [None]:
#these words could be considered as the context words used by the model to identify the authenticity of the text

## Model 6: MNB Model with Bigram vectorizer for authenticity Analysis

In [104]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets where 70% of the data is used for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# Bigram Vectorization
#I am using the bigram vectorizer for authenticity analysis so that the context of the data is stored
#context I believe is important to classify fake and true reviews
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train_bigram = bigram_vectorizer.fit_transform(X_train)
X_test_bigram = bigram_vectorizer.transform(X_test)

# Multinomial Naive Bayes model
mnb_model = MultinomialNB()

# Train the MNB model on bigram vectors
mnb_model.fit(X_train_bigram, y_train)

# Make predictions on the test set
y_pred = mnb_model.predict(X_test_bigram)

# Evaluate the MNB model
print("Accuracy (MNB with Bigram Vectorizer):", accuracy_score(y_test, y_pred))
print("\nClassification Report (MNB with Bigram Vectorizer):\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix (MNB with Bigram Vectorizer):\n", confusion_matrix(y_test, y_pred))

Accuracy (MNB with Bigram Vectorizer): 0.42857142857142855

Classification Report (MNB with Bigram Vectorizer):
               precision    recall  f1-score   support

           f       0.47      0.53      0.50        15
           t       0.36      0.31      0.33        13

    accuracy                           0.43        28
   macro avg       0.42      0.42      0.42        28
weighted avg       0.42      0.43      0.42        28


Confusion Matrix (MNB with Bigram Vectorizer):
 [[8 7]
 [9 4]]


### Error Analysis

In [105]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (16 instances):
   Actual Predicted  \
72      t         f   
39      t         f   
67      f         t   
10      f         t   
44      t         f   
35      t         f   
62      f         t   
28      t         f   
49      f         t   
68      f         t   
78      t         f   
30      t         f   
33      t         f   
11      f         t   
66      f         t   
69      t         f   

                                                                                                                                                                                                       Text  
72  stronghearts cafe is the best the owners have a great ethic and the food is to die for i had a pumpkin espresso milkshake named after albert einstein and it was only winning the food though vegan ...  
39  this restaurant is quite popular recently went there with two of my friends at pm really long queue we waited for almost minutes to be seated seats we

In [106]:
#again due to the complex sentence structures and mis-interpretation
#of sublities the reviews are being wrongly classified

### Feature Analysis

In [107]:
# Get feature names from the bigram vectorizer's vocabulary
feature_names_bigram = bigram_vectorizer.get_feature_names_out()

# Get probabilities from the trained MNB model
probabilities_bigram = mnb_model.feature_log_prob_

# Identify the top features for authenticity classification
top_feature_indices_bigram = probabilities_bigram[1].argsort()[-min(10, len(feature_names_bigram)):][::-1]
top_features_bigram = [feature_names_bigram[i] for i in top_feature_indices_bigram]

print("Top 10 features (MNB with Bigram Vectorizer):", top_features_bigram)


Top 10 features (MNB with Bigram Vectorizer): ['the food', 'and the', 'the place', 'of the', 'food was', 'in the', 'the best', 'from the', 'the worst', 'the restaurant']


In [108]:
#The top 10 bigrams used by the model for authenticity classification indicates that it gives more significance
#for those bigrams which have a mention about 'food' in them. The SVM model on the other hand uses ambience information as well
#but the MNB model uses mostly the information reagarding the food

# SUPPORT VECTOR MACHINE

## Model 7: SVM Model for sentiment analysis using tf-idf model

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['sentiment'], test_size=0.3, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model = SVC(kernel='linear')  # Linear kernel is commonly used for text classification

# Train the SVM model
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8928571428571429

Classification Report:
               precision    recall  f1-score   support

           n       0.93      0.88      0.90        16
           p       0.85      0.92      0.88        12

    accuracy                           0.89        28
   macro avg       0.89      0.90      0.89        28
weighted avg       0.90      0.89      0.89        28


Confusion Matrix:
 [[14  2]
 [ 1 11]]


### Error Analysis

In [63]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (3 instances):
   Actual Predicted  \
0       n         p   
78      p         n   
33      n         p   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
78  i went to cruise dinner in nyc with spirit cruises it was by far the best dining experience i have ever had in my life the cruise kicked off by taking us uptown on the hudson river there was a bar...  
33  this place used to be great i cant believe its current state instead of the cool dimlylit lounge that i was used to i was in a cheap smelly bar the music has no soul the bartender is mean this pla...  


In [None]:
#as we can see for the same dataset and the same vectorizer, tf-idf,
#if the model used was MNB, then there are lesser incorrect predictions than
#if the same was built using SVM.

#here in the 3rd instance, because of the negative words, the review is wrongly
#classified

### Feature Analysis

In [94]:
# Get feature names from the TF-IDF Vectorizer's vocabulary
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Get coefficients from the trained SVM model
coefficients_tfidf = svm_model.coef_.toarray()[0]

# Identify the top features for sentiment classification
top_feature_indices_tfidf = coefficients_tfidf.argsort()[-min(10, len(feature_names_tfidf)):][::-1]
top_features_tfidf = [feature_names_tfidf[i] for i in top_feature_indices_tfidf]

print("Top 10 features (Linear Kernel, TF-IDF Vectorizer):", top_features_tfidf)


Top 10 features (Linear Kernel, TF-IDF Vectorizer): ['is', 'you', 'good', 'have', 'restaurant', 'environment', 'up', 'been', 'each', 'people']


In [None]:
#sentiment analysis model uses most frequently appearing context words like
#resturant, environment, people to categorize into different sentiment classes

In [29]:
# Support Vector Machine (SVM) model with RBF kernel
svm_model_rbf = SVC(kernel='rbf', gamma='auto')  # Adjust gamma based on your data

# Train the SVM model with RBF kernel
svm_model_rbf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_rbf = svm_model_rbf.predict(X_test_tfidf)

# Evaluate the model with RBF kernel
print("Accuracy (RBF Kernel):", accuracy_score(y_test, y_pred_rbf))
print("\nClassification Report (RBF Kernel):\n", classification_report(y_test, y_pred_rbf))
print("\nConfusion Matrix (RBF Kernel):\n", confusion_matrix(y_test, y_pred_rbf))

Accuracy (RBF Kernel): 0.42857142857142855

Classification Report (RBF Kernel):
               precision    recall  f1-score   support

           n       0.00      0.00      0.00        16
           p       0.43      1.00      0.60        12

    accuracy                           0.43        28
   macro avg       0.21      0.50      0.30        28
weighted avg       0.18      0.43      0.26        28


Confusion Matrix (RBF Kernel):
 [[ 0 16]
 [ 0 12]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#As we can see SVM Model with kernel='linear' parameter is the better than setting that parameter to any other value
#This indicates that there is a linear relationship between the features and labels and it fits best for the given dataset

## Model 8: SVM Model for sentiment analysis using count vectorizer

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model_linear = SVC(kernel='linear')

# Train the SVM model with linear kernel
svm_model_linear.fit(X_train_count, y_train)

# Make predictions on the test set
y_pred_linear = svm_model_linear.predict(X_test_count)

# Evaluate the model with linear kernel and Count Vectorizer
print("Accuracy (Linear Kernel, Count Vectorizer):", accuracy_score(y_test, y_pred_linear))
print("\nClassification Report (Linear Kernel, Count Vectorizer):\n", classification_report(y_test, y_pred_linear))
print("\nConfusion Matrix (Linear Kernel, Count Vectorizer):\n", confusion_matrix(y_test, y_pred_linear))

Accuracy (Linear Kernel, Count Vectorizer): 0.8571428571428571

Classification Report (Linear Kernel, Count Vectorizer):
               precision    recall  f1-score   support

           n       0.88      0.88      0.88        16
           p       0.83      0.83      0.83        12

    accuracy                           0.86        28
   macro avg       0.85      0.85      0.85        28
weighted avg       0.86      0.86      0.86        28


Confusion Matrix (Linear Kernel, Count Vectorizer):
 [[14  2]
 [ 2 10]]


### Error Analysis

In [65]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (3 instances):
   Actual Predicted  \
0       n         p   
78      p         n   
33      n         p   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
78  i went to cruise dinner in nyc with spirit cruises it was by far the best dining experience i have ever had in my life the cruise kicked off by taking us uptown on the hudson river there was a bar...  
33  this place used to be great i cant believe its current state instead of the cool dimlylit lounge that i was used to i was in a cheap smelly bar the music has no soul the bartender is mean this pla...  


In [None]:
#this is similar to the MNB model

### Feature Analysis

In [93]:
# Get feature names from the Count Vectorizer's vocabulary
feature_names_count = count_vectorizer.get_feature_names_out()

# Get coefficients from the trained SVM model
coefficients_count = svm_model_linear.coef_.toarray()[0]

# Identify the top features for binary classification
top_feature_indices_count = coefficients_count.argsort()[-min(10, len(feature_names_count)):][::-1]
top_features_count = [feature_names_count[i] for i in top_feature_indices_count]

print("Top 10 features (Linear Kernel, Count Vectorizer):", top_features_count)


Top 10 features (Linear Kernel, Count Vectorizer): ['indian', 'is', 'their', 'was', 'on', 'cheese', 'up', 'you', 'environment', 'been']


In [None]:
#the specifics of the text gives more insights regarding the class of sentiment
#the review belongs to

## Model 9: SVM Model for sentiment analysis using Bigram vectorizer

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Binary Vectorization
binary_vectorizer = CountVectorizer(binary=True)
X_train_binary = binary_vectorizer.fit_transform(X_train)
X_test_binary = binary_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model = SVC(kernel='linear')

# Train the SVM model with linear kernel on binary vectors
svm_model.fit(X_train_binary, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_binary)

# Evaluate the SVM model
print("Accuracy (SVM with Binary Vectorizer):", accuracy_score(y_test, y_pred))
print("\nClassification Report (SVM with Binary Vectorizer):\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix (SVM with Binary Vectorizer):\n", confusion_matrix(y_test, y_pred))


Accuracy (SVM with Binary Vectorizer): 0.7857142857142857

Classification Report (SVM with Binary Vectorizer):
               precision    recall  f1-score   support

           n       0.86      0.75      0.80        16
           p       0.71      0.83      0.77        12

    accuracy                           0.79        28
   macro avg       0.79      0.79      0.78        28
weighted avg       0.80      0.79      0.79        28


Confusion Matrix (SVM with Binary Vectorizer):
 [[12  4]
 [ 2 10]]


### Error Analysis

In [67]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (6 instances):
   Actual Predicted  \
55      p         n   
0       n         p   
10      n         p   
62      p         n   
33      n         p   
11      n         p   

                                                                                                                                                                                                       Text  
55  two days ago i went to the rooftop restaurant in nyc that served brunch it was one of the best brunch that i have ever had the view from the table was serene and i could see both the the hudson ri...  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
10  last weekend i went to a place called ratastic and i wasnt shocked to be in the company of numerous rats i would expect someone to shut the place down but it seems like everyo

In [68]:
#there are far more incorrect predictions
#complex structure of the sentences makes it difficult for the model to classify

### Feature Analysis

In [92]:
# Get feature names from the Binary Vectorizer's vocabulary
feature_names_binary = binary_vectorizer.get_feature_names_out()

# Get coefficients from the trained SVM model
coefficients_binary = svm_model.coef_.toarray()[0]

# Identify the top features for binary classification
top_feature_indices_binary = coefficients_binary.argsort()[-min(10, len(feature_names_binary)):][::-1]
top_features_binary = [feature_names_binary[i] for i in top_feature_indices_binary]

print("Top 10 features (Linear Kernel, Binary Vectorizer):", top_features_binary)


Top 10 features (Linear Kernel, Binary Vectorizer): ['is', 'you', 'good', 'have', 'restaurant', 'environment', 'up', 'been', 'each', 'people']


In [None]:
#the obvious positive words like 'good' are influencing the model's classification decision
#heavily. Most other words give insights regarding the context which led to such a positive or negative review.

## Model 10: SVM Model for authenticity analysis using Tf-Idf vectorizer

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model = SVC(kernel='linear')  # Linear kernel is commonly used for text classification

# Train the SVM model
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.5714285714285714

Classification Report:
               precision    recall  f1-score   support

           f       0.62      0.53      0.57        15
           t       0.53      0.62      0.57        13

    accuracy                           0.57        28
   macro avg       0.57      0.57      0.57        28
weighted avg       0.58      0.57      0.57        28


Confusion Matrix:
 [[8 7]
 [5 8]]


### Error Analysis

In [70]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (12 instances):
   Actual Predicted  \
0       f         t   
39      t         f   
10      f         t   
44      t         f   
35      t         f   
62      f         t   
28      t         f   
49      f         t   
15      f         t   
68      f         t   
66      f         t   
69      t         f   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
39  this restaurant is quite popular recently went there with two of my friends at pm really long queue we waited for almost minutes to be seated seats were narrow it was too easy to hear clearly what...  
10  last weekend i went to a place calle

In [None]:
#the model gives multiple incorrect predictions because of the complex structure of the sentence

### Feature Analysis

In [91]:
# Use the TF-IDF Vectorizer to transform both training and testing data
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the SVM model with linear kernel on TF-IDF vectors
svm_model.fit(X_train_tfidf, y_train)

# Get coefficients from the trained SVM model
coefficients_tfidf = svm_model.coef_.toarray()[0]

# Identify the top features for binary classification
top_feature_indices_tfidf = coefficients_tfidf.argsort()[-min(10, len(feature_names_tfidf)):][::-1]
top_features_tfidf = [feature_names_tfidf[i] for i in top_feature_indices_tfidf]

print("Top 10 features (Linear Kernel, TF-IDF Vectorizer):", top_features_tfidf)


Top 10 features (Linear Kernel, TF-IDF Vectorizer): ['is', 'you', 'good', 'have', 'restaurant', 'environment', 'up', 'been', 'each', 'people']


In [None]:
#Again here words which are important for understanding the context of the text are
#used to classify by the model

## Model 11: SVM Model for authenticity analysis using the Count vectorizer





In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model_linear = SVC(kernel='linear')

# Train the SVM model with linear kernel
svm_model_linear.fit(X_train_count, y_train)

# Make predictions on the test set
y_pred_linear = svm_model_linear.predict(X_test_count)

# Evaluate the model with linear kernel and Count Vectorizer
print("Accuracy (Linear Kernel, Count Vectorizer):", accuracy_score(y_test, y_pred_linear))
print("\nClassification Report (Linear Kernel, Count Vectorizer):\n", classification_report(y_test, y_pred_linear))
print("\nConfusion Matrix (Linear Kernel, Count Vectorizer):\n", confusion_matrix(y_test, y_pred_linear))

Accuracy (Linear Kernel, Count Vectorizer): 0.5

Classification Report (Linear Kernel, Count Vectorizer):
               precision    recall  f1-score   support

           f       0.55      0.40      0.46        15
           t       0.47      0.62      0.53        13

    accuracy                           0.50        28
   macro avg       0.51      0.51      0.50        28
weighted avg       0.51      0.50      0.49        28


Confusion Matrix (Linear Kernel, Count Vectorizer):
 [[6 9]
 [5 8]]


### Error Analysis

In [72]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (12 instances):
   Actual Predicted  \
0       f         t   
39      t         f   
10      f         t   
44      t         f   
35      t         f   
62      f         t   
28      t         f   
49      f         t   
15      f         t   
68      f         t   
66      f         t   
69      t         f   

                                                                                                                                                                                                       Text  
0   mikes pizza high point ny service was very slow and the quality was low you would think they would know at least how to make good pizza not stick to premade dishes like stuffed pasta or a salad yo...  
39  this restaurant is quite popular recently went there with two of my friends at pm really long queue we waited for almost minutes to be seated seats were narrow it was too easy to hear clearly what...  
10  last weekend i went to a place calle

In [None]:
#authenticity clasification gives multiple incorrect predictions irrespective of the model used

### Feature Analysis

In [82]:
# Extract feature names from Count Vectorizer
feature_names_count = count_vectorizer.get_feature_names_out()

# Get coefficients from the SVM model with linear kernel
coefficients_linear = svm_model_linear.coef_.toarray()

# Identify the top features for binary classification
top_feature_indices_linear = coefficients_linear[0].argsort()[-10:][::-1]
top_features_linear = [feature_names_count[i] for i in top_feature_indices_linear]

print("Top 10 features (Linear Kernel, Count Vectorizer):", top_features_linear)


Top 10 features (Linear Kernel, Count Vectorizer): ['indian', 'is', 'their', 'was', 'on', 'cheese', 'up', 'you', 'environment', 'been']


In [None]:
#the occurance of words like 'indian', 'cheese' shows that the model considers
#specifics of the reviews before classfying. Also words like 'environment' speaks about the overall
#ambeiance of the resturant and that is always considered by an user before giving an authentic review
#other worsd like 'is', 'was', 'on' are occuring here because of how frequently they appear in the dataset

## Model 12: SVM Model for authenticity analysis using the bigram vectorizer

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['lie'], test_size=0.3, random_state=42)

# Bigram Vectorization
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train_bigram = bigram_vectorizer.fit_transform(X_train)
X_test_bigram = bigram_vectorizer.transform(X_test)

# Support Vector Machine (SVM) model with a linear kernel
svm_model = SVC(kernel='linear')

# Train the SVM model with linear kernel on bigram vectors
svm_model.fit(X_train_bigram, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_bigram)

# Evaluate the model
print("Accuracy (Bigram Vectorizer):", accuracy_score(y_test, y_pred))
print("\nClassification Report (Bigram Vectorizer):\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix (Bigram Vectorizer):\n", confusion_matrix(y_test, y_pred))

Accuracy (Bigram Vectorizer): 0.42857142857142855

Classification Report (Bigram Vectorizer):
               precision    recall  f1-score   support

           f       0.40      0.13      0.20        15
           t       0.43      0.77      0.56        13

    accuracy                           0.43        28
   macro avg       0.42      0.45      0.38        28
weighted avg       0.42      0.43      0.37        28


Confusion Matrix (Bigram Vectorizer):
 [[ 2 13]
 [ 3 10]]


### Error Analysis

In [74]:
# Create a DataFrame to store the actual labels, predicted labels, and corresponding texts
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Text': X_test})

# Add a column to indicate whether the prediction was correct
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

# Filter the DataFrame to show only instances where the prediction was incorrect
incorrect_predictions = error_analysis_df[error_analysis_df['Correct'] == False]

# Increase the maximum width of displayed columns in pandas
pd.set_option('display.max_colwidth', 200)

# Display all available incorrect predictions for further analysis
print(f"All Incorrect Predictions ({len(incorrect_predictions)} instances):")
print(incorrect_predictions[['Actual', 'Predicted', 'Text']])

All Incorrect Predictions (16 instances):
   Actual Predicted  \
22      f         t   
72      t         f   
0       f         t   
67      f         t   
10      f         t   
44      t         f   
62      f         t   
12      f         t   
4       f         t   
18      f         t   
49      f         t   
15      f         t   
68      f         t   
78      t         f   
11      f         t   
66      f         t   

                                                                                                                                                                                                       Text  
22  i recently ate at a restaurant called white castle and it was a dine in i had to wait minutes before the waiter came to my table to take my order even though it was not busy i had to wait another ...  
72  stronghearts cafe is the best the owners have a great ethic and the food is to die for i had a pumpkin espresso milkshake named after albert einstein 

In [None]:
#bigram vectorizer with svm model gives the most number of incorrect predictions

### Feature Analysis

In [79]:
# Extract feature names from bigram vectorizer
feature_names = bigram_vectorizer.get_feature_names_out()

# Get coefficients from the SVM model
coefficients = svm_model.coef_.toarray()

# Identify the top features for the binary classification
top_feature_indices = coefficients[0].argsort()[-10:][::-1]
top_features = [feature_names[i] for i in top_feature_indices]

print("Top 10 features:", top_features)


Top 10 features: ['restaurant have', 'the environment', 'at this', 'best restaurant', 'need to', 'friends the', 'makes me', 'is so', 'of people', 'people the']


In [None]:
#These features indicate that the model takes all these bigrams as the most important
#to classify based on the authenticity. This indicates that the context words in the text
#are considered for the classification like 'friends the', 'of people'.