In [1]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import csv
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Load the preprocessed data set and report the raw class balance.
df = pd.read_csv('/input/preprocessed.csv')
print(df.label.value_counts())

# Rebalance to a fixed 2:1 ratio: keep the first 6450 rows with label == 1
# and the first 3225 rows with label == 0.
pos_df = df.loc[df.label == 1, :][:6450]
neg_df = df.loc[df.label == 0, :][:3225]
df = pd.concat([pos_df, neg_df], ignore_index=True)

# Shuffle with a fixed seed. Without it the row order changes on every run,
# which makes the seeded train/test split further down non-reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.label.value_counts())

#Bag of Words
def bow(train, test):
    """Build bag-of-words count matrices for the train and test documents.

    The vocabulary is learned from ``train`` only; ``test`` is projected onto
    that same vocabulary.

    Parameters
    ----------
    train, test : iterable of str
        Documents to vectorize.

    Returns
    -------
    train_data, test_data
        Token-count matrices produced by ``CountVectorizer``.
    feature_names : list
        Vocabulary terms in column order.
    """
    vectorizer = CountVectorizer(analyzer='word')
    train_data = vectorizer.fit_transform(train)
    test_data = vectorizer.transform(test)
    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its replacement when available and fall back on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return train_data, test_data, feature_names

# Feature selection chi2 method
def ch2(x_train, y_train, x_test, feature_names, k=100):
    """Keep the ``k`` columns with the highest chi-square score.

    The selector is fitted on the training matrix and labels only, then the
    same column subset is applied to the test matrix.

    Parameters
    ----------
    x_train, x_test
        Feature matrices sharing the same column layout.
    y_train
        Training labels used to score the columns.
    feature_names : sequence
        Column names of ``x_train``, indexable by column position.
    k : int or "all", default 100
        Number of columns to keep (forwarded to ``SelectKBest``).

    Returns
    -------
    x_train_features, x_test_features
        The reduced matrices.
    feature_indices
        Positions of the kept columns in the original matrices.
    selected_features : list
        Names of the kept columns.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    selector = SelectKBest(chi2, k=k)
    x_train_features = selector.fit_transform(x_train, y_train)
    x_test_features = selector.transform(x_test)
    # Compute the support once and reuse it for the name lookup.
    feature_indices = selector.get_support(indices=True)
    selected_features = [feature_names[i] for i in feature_indices]
    return x_train_features, x_test_features, feature_indices, selected_features

# Feature selection mutual_info_classif
def mutual_info(x_train, y_train, x_test, feature_names, k=100):
    """Keep the ``k`` columns with the highest mutual-information score.

    The selector is fitted on the training matrix and labels only, then the
    same column subset is applied to the test matrix.

    Parameters
    ----------
    x_train, x_test
        Feature matrices sharing the same column layout.
    y_train
        Training labels used to score the columns.
    feature_names : sequence
        Column names of ``x_train``, indexable by column position.
    k : int or "all", default 100
        Number of columns to keep (forwarded to ``SelectKBest``).

    Returns
    -------
    x_train_features, x_test_features
        The reduced matrices.
    feature_indices
        Positions of the kept columns in the original matrices.
    selected_features : list
        Names of the kept columns.
    """
    selector = SelectKBest(mutual_info_classif, k=k)
    x_train_features = selector.fit_transform(x_train, y_train)
    x_test_features = selector.transform(x_test)
    # Compute the support once and reuse it for the name lookup.
    feature_indices = selector.get_support(indices=True)
    selected_features = [feature_names[i] for i in feature_indices]
    return x_train_features, x_test_features, feature_indices, selected_features

#TFIDF VECTORIZATION
def tfidf(train, test):
    """Apply TF-IDF weighting to a pair of count matrices.

    The transformer is fitted on ``train`` and then reused, unchanged, to
    transform ``test``. Returns ``(train_tfidf, test_tfidf)``.
    """
    idf_weighter = TfidfTransformer()
    weighted_train = idf_weighter.fit_transform(train)
    return weighted_train, idf_weighter.transform(test)

# NO FEATURE SELECTION
def nofeatureselection(x_train, x_test, y_train, k=100):
    """TF-IDF vectorize raw documents and keep the ``k`` best columns.

    NOTE(review): despite its name, this helper does perform feature
    selection. It runs ``SelectKBest`` with the library's default score
    function (``f_classif`` according to the scikit-learn documentation) on
    the TF-IDF matrix. Pass ``k="all"`` to keep every column and get a truly
    unselected baseline; the default of 100 preserves the existing behaviour.

    Parameters
    ----------
    x_train, x_test : iterable of str
        Documents; the vectorizer is fitted on ``x_train`` only.
    y_train
        Training labels used to score the columns.
    k : int or "all", default 100
        Number of columns to keep (forwarded to ``SelectKBest``).

    Returns
    -------
    x_train_features, x_test_features
        The TF-IDF matrices restricted to the kept columns.
    feature_names : list
        Vocabulary terms of the kept columns.
    """
    vectorizer = TfidfVectorizer(use_idf=True,
                                 analyzer='word',
                                 strip_accents='unicode')
    tfidf_train = vectorizer.fit_transform(x_train)
    tfidf_test = vectorizer.transform(x_test)
    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its replacement when available and fall back on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = vectorizer.get_feature_names()
    feat = SelectKBest(k=k)
    x_train_features = feat.fit_transform(tfidf_train, y_train)
    x_test_features = feat.transform(tfidf_test)
    feature_names = [vocabulary[i] for i in feat.get_support(indices=True)]
    return x_train_features, x_test_features, feature_names

#MULTINOMIAL NAIVE BAYES
def Multinomial(train_tfidf, y_train, test_tfidf, y_test):
    """Fit a Multinomial Naive Bayes model and report its test performance.

    Prints the classification report followed by the confusion matrix, then
    returns ``(y_test, predictions)``.
    """
    classifier = MultinomialNB().fit(train_tfidf, y_train)
    predicted = classifier.predict(test_tfidf)
    # Report first, confusion matrix second.
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(y_test, predicted))
    return y_test, predicted

def svm(train_tfidf, y_train, test_tfidf, y_test):
    """Fit a linear-kernel SVC (C=1.0) and report its test performance.

    Prints the classification report followed by the confusion matrix, then
    returns ``(y_test, predictions)``.
    """
    linear_svc = SVC(C=1.0, kernel='linear')
    linear_svc.fit(train_tfidf, y_train)
    y_pred = linear_svc.predict(test_tfidf)

    report = metrics.classification_report(y_test, y_pred)
    print(report)
    matrix = metrics.confusion_matrix(y_test, y_pred)
    print(matrix)
    return y_test, y_pred

def randomforest(train_tfidf, y_train, test_tfidf, y_test, random_state=42):
    """Fit a random forest and report its test performance.

    Prints the classification report followed by the confusion matrix.

    Parameters
    ----------
    train_tfidf, test_tfidf
        Training and test feature matrices.
    y_train, y_test
        Training and test labels.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier``. Forest construction is
        randomized, so a fixed seed keeps the reported metrics identical from
        run to run. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    expected, predicted
        ``y_test`` and the model's predictions for ``test_tfidf``.
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train_tfidf, y_train)
    expected = y_test
    predicted = clf.predict(test_tfidf)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    return expected, predicted

def naivebayes_withfs(x_train, x_test, y_train, y_test):
    """Multinomial Naive Bayes on chi-square-selected bag-of-words features.

    Pipeline: bow() counts -> ch2() column selection -> tfidf() weighting ->
    Multinomial(). Prints a heading and the selected terms, then returns the
    ``(expected, predicted)`` pair from Multinomial().
    """
    # Feature extraction: raw documents to token counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: keep the columns ch2() ranks highest.
    train_kept, test_kept, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(train_kept, test_kept)

    print("Results of Multinomial Naive Bayes Classifier with Feature Selection")
    print("Features:\n", selected_features)
    return Multinomial(train_tfidf, y_train, test_tfidf, y_test)

def naivebayes_withfs2(x_train, x_test, y_train, y_test):
    """Multinomial Naive Bayes on mutual-information-selected bag-of-words features.

    Pipeline: bow() counts -> mutual_info() column selection -> tfidf()
    weighting -> Multinomial(). Prints a heading and the selected terms, then
    returns the ``(expected, predicted)`` pair from Multinomial().
    """
    # Feature extraction: raw documents to token counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: keep the columns mutual_info() ranks highest.
    train_kept, test_kept, _, selected_features = mutual_info(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(train_kept, test_kept)

    print("Results of Multinomial Naive Bayes Classifier with Feature Selection")
    print("Features:\n", selected_features)
    return Multinomial(train_tfidf, y_train, test_tfidf, y_test)

def naivebayes_nfs(x_train, x_test, y_train, y_test):
    """Multinomial Naive Bayes on the features returned by nofeatureselection().

    Prints a heading and the feature names, then returns the
    ``(expected, predicted)`` pair from Multinomial().
    """
    train_matrix, test_matrix, kept_terms = nofeatureselection(x_train, x_test, y_train)

    print("Results of Multinomial Naive Bayes Classifier without Feature Selection")
    print("Features:\n", kept_terms)
    return Multinomial(train_matrix, y_train, test_matrix, y_test)


def svm_wfs(x_train, x_test, y_train, y_test):
    """SVM on chi-square-selected bag-of-words features.

    Pipeline: bow() counts -> ch2() column selection -> tfidf() weighting ->
    svm(). Prints a heading and the selected terms, then returns the
    ``(expected, predicted)`` pair from svm().
    """
    # Feature extraction: raw documents to token counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: keep the columns ch2() ranks highest.
    train_kept, test_kept, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(train_kept, test_kept)

    print("Results of SVM with Feature Selection")
    print("Features:\n", selected_features)
    return svm(train_tfidf, y_train, test_tfidf, y_test)

def svm_wfs2(x_train, x_test, y_train, y_test):
    """SVM on mutual-information-selected bag-of-words features.

    Pipeline: bow() counts -> mutual_info() column selection -> tfidf()
    weighting -> svm(). Prints a heading and the selected terms, then returns
    the ``(expected, predicted)`` pair from svm().
    """
    # Feature extraction: raw documents to token counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: keep the columns mutual_info() ranks highest.
    train_kept, test_kept, _, selected_features = mutual_info(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(train_kept, test_kept)

    print("Results of SVM with Feature Selection")
    print("Features:\n", selected_features)
    return svm(train_tfidf, y_train, test_tfidf, y_test)

def svm_nfs(x_train, x_test, y_train, y_test):
    """SVM on the features returned by nofeatureselection().

    Prints a heading and the feature names, then returns the
    ``(expected, predicted)`` pair from svm().
    """
    train_matrix, test_matrix, kept_terms = nofeatureselection(x_train, x_test, y_train)

    print("Results of SVM Classifier without Feature Selection")
    print("Features:\n", kept_terms)
    return svm(train_matrix, y_train, test_matrix, y_test)

# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION - CHI SQUARE
def rf_wfs(x_train, x_test, y_train, y_test):
    """Random forest on chi-square-selected bag-of-words features.

    Pipeline: bow() counts -> ch2() column selection -> tfidf() weighting ->
    randomforest(). Prints a heading and the selected terms, then returns the
    ``(expected, predicted)`` pair from randomforest().
    """
    # Feature extraction: raw documents to token counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: keep the columns ch2() ranks highest.
    train_kept, test_kept, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(train_kept, test_kept)

    print("Results of Random Forest Classifier with chi2 Feature Selection")
    print("Features:\n", selected_features)
    return randomforest(train_tfidf, y_train, test_tfidf, y_test)

# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION -- MUTUAL INFO
def rf_wfs2(x_train, x_test, y_train, y_test):
    """Random forest on mutual-information-selected bag-of-words features.

    Pipeline: bow() counts -> mutual_info() column selection -> tfidf()
    weighting -> randomforest(). Prints a heading and the selected terms.

    Returns
    -------
    expected, predicted
        The pair returned by randomforest().
    """
    # Feature extraction: raw documents to token counts.
    x_train, x_test, feature_names = bow(x_train, x_test)
    # Feature selection: keep the columns mutual_info() ranks highest.
    x_train_features, x_test_features, feature_indices, selected_features = mutual_info(x_train, y_train, x_test, feature_names)
    # TF-IDF weighting of the reduced count matrices.
    train_tfidf, test_tfidf = tfidf(x_train_features, x_test_features)
    # The heading previously said "chi2", which mislabelled this run's output.
    print("Results of Random Forest Classifier with mutual info Feature Selection")
    print("Features:\n", selected_features)
    expectedrf_wfs, predictedrf_wfs = randomforest(train_tfidf, y_train, test_tfidf, y_test)
    return expectedrf_wfs, predictedrf_wfs

# RANDOM FOREST WITHOUT FEATURE SELECTION
def rf_nfs(x_train, x_test, y_train, y_test):
    """Random forest on the features returned by nofeatureselection().

    Prints a heading and the feature names.

    Returns
    -------
    expected, predicted
        The pair returned by randomforest().
    """
    x_train_features, x_test_features, feature_names = nofeatureselection(x_train, x_test, y_train)
    # The heading previously said "SVM Classifier", which mislabelled this run's output.
    print("Results of Random Forest Classifier without Feature Selection")
    print("Features:\n", feature_names)
    expectedrf_nfs, predictedrf_nfs = randomforest(x_train_features, y_train, x_test_features, y_test)
    return expectedrf_nfs, predictedrf_nfs

# Inputs come from the 'content' column, targets from the 'label' column.
x, y = df['content'], df['label']

# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=42,
)


1    12207
0     3225
Name: label, dtype: int64
1    6450
0    3225
Name: label, dtype: int64


In [2]:
# MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION - CHI SQUARE
# Runs bag-of-words -> chi-square selection -> TF-IDF -> MultinomialNB on the shared split
# and keeps the true and predicted test labels.
expectednb_wfs, predictednb_wfs = naivebayes_withfs(x_train, x_test, y_train, y_test)

Results of Multinomial Naive Bayes Classifier with Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'bug', 'call', 'card', 'carpet', 'change', 'charge', 'check', 'com', 'come', 'comfortable', 'complain', 'condition', 'could', 'curtain', 'dirty', 'disappoint', 'disgust', 'door', 'email', 'even', 'excellent', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'guests', 'hear', 'helpful', 'horrible', 'key', 'know', 'leave', 'like', 'lock', 'look', 'lovely', 'manager', 'money', 'move', 'never', 'nice', 'night', 'noise', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'phone', 'picture', 'pm', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shower', 'sleep', 'smell', 'smoke', 'someone', 'stain', 'star', 'terrible', 'think', 'tiny', 'toilet', 'towel', 'try', 'us', 'view', 'wait', 'wall', 'water', 'window', 'work', 'worse',

In [3]:
# MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION - MUTUAL INFO
# NOTE(review): these are the same variable names the chi-square Naive Bayes cell assigns,
# so running this cell replaces the chi-square results held in them.
expectednb_wfs, predictednb_wfs = naivebayes_withfs2(x_train, x_test, y_train, y_test)

Results of Multinomial Naive Bayes Classifier with Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'breakfast', 'call', 'canal', 'carpet', 'charge', 'check', 'clean', 'come', 'comfortable', 'complain', 'could', 'delicious', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'helpful', 'highly', 'horrible', 'key', 'leave', 'like', 'location', 'lock', 'look', 'love', 'lovely', 'manager', 'modern', 'money', 'move', 'never', 'nice', 'night', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'picture', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'shower', 'sleep', 'smell', 'someone', 'spa', 'spacious', 'staff', 'stain', 'star', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'view', 'wait', 'walk', 'wall', 'water', 'wonder

In [4]:
# MULTINOMIAL NAIVE BAYES WITHOUT FEATURE SELECTION
# NOTE(review): the nofeatureselection() helper behind this call still applies
# SelectKBest(k=100) to the TF-IDF matrix, so this is not a fully unselected baseline.
expectednb_nfs, predictednb_nfs = naivebayes_nfs(x_train, x_test, y_train, y_test)

Results of Multinomial Naive Bayes Classifier without Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'bar', 'beautiful', 'best', 'book', 'break', 'breakfast', 'buffet', 'call', 'canal', 'carpet', 'charge', 'city', 'clean', 'comfortable', 'complain', 'curtain', 'definitely', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'food', 'friendly', 'give', 'good', 'great', 'helpful', 'highly', 'horrible', 'leave', 'locate', 'location', 'lock', 'look', 'loud', 'love', 'lovely', 'modern', 'move', 'never', 'nice', 'night', 'old', 'open', 'paper', 'pay', 'perfect', 'picture', 'poor', 'put', 'quiet', 'receptionist', 'recommend', 'refund', 'refuse', 'restaurant', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'sleep', 'smell', 'someone', 'spacious', 'staff', 'stain', 'star', 'station', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'unfriendly', 'unhelpful', 'view', 'visit', 'walk', 'wall', 'water', 'we

In [5]:
# SVM WITH FEATURE SELECTION - CHI SQUARE
# Runs bag-of-words -> chi-square selection -> TF-IDF -> linear SVC on the shared split
# and keeps the true and predicted test labels.
expectedsvm_wfs, predictedsvm_wfs = svm_wfs(x_train, x_test, y_train, y_test)

Results of SVM with Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'bug', 'call', 'card', 'carpet', 'change', 'charge', 'check', 'com', 'come', 'comfortable', 'complain', 'condition', 'could', 'curtain', 'dirty', 'disappoint', 'disgust', 'door', 'email', 'even', 'excellent', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'guests', 'hear', 'helpful', 'horrible', 'key', 'know', 'leave', 'like', 'lock', 'look', 'lovely', 'manager', 'money', 'move', 'never', 'nice', 'night', 'noise', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'phone', 'picture', 'pm', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shower', 'sleep', 'smell', 'smoke', 'someone', 'stain', 'star', 'terrible', 'think', 'tiny', 'toilet', 'towel', 'try', 'us', 'view', 'wait', 'wall', 'water', 'window', 'work', 'worse', 'worst']
              precisi

In [6]:
# SVM WITH FEATURE SELECTION - MUTUAL INFO
# NOTE(review): these are the same variable names the chi-square SVM cell assigns,
# so running this cell replaces the chi-square results held in them.
expectedsvm_wfs, predictedsvm_wfs = svm_wfs2(x_train, x_test, y_train, y_test)

Results of SVM with Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'breakfast', 'call', 'canal', 'carpet', 'charge', 'check', 'clean', 'come', 'comfortable', 'complain', 'could', 'delicious', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'helpful', 'highly', 'horrible', 'key', 'leave', 'like', 'location', 'lock', 'look', 'love', 'lovely', 'manager', 'modern', 'money', 'move', 'never', 'nice', 'night', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'picture', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'shower', 'sleep', 'smell', 'someone', 'spa', 'spacious', 'staff', 'stain', 'star', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'view', 'wait', 'walk', 'wall', 'water', 'wonderful', 'work', 'worse', 'worst']

In [7]:
# SVM WITHOUT FEATURE SELECTION
# NOTE(review): the nofeatureselection() helper behind this call still applies
# SelectKBest(k=100) to the TF-IDF matrix, so this is not a fully unselected baseline.
expectedsvm_nfs, predictedsvm_nfs = svm_nfs(x_train, x_test, y_train, y_test)

Results of SVM Classifier without Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'bar', 'beautiful', 'best', 'book', 'break', 'breakfast', 'buffet', 'call', 'canal', 'carpet', 'charge', 'city', 'clean', 'comfortable', 'complain', 'curtain', 'definitely', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'food', 'friendly', 'give', 'good', 'great', 'helpful', 'highly', 'horrible', 'leave', 'locate', 'location', 'lock', 'look', 'loud', 'love', 'lovely', 'modern', 'move', 'never', 'nice', 'night', 'old', 'open', 'paper', 'pay', 'perfect', 'picture', 'poor', 'put', 'quiet', 'receptionist', 'recommend', 'refund', 'refuse', 'restaurant', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'sleep', 'smell', 'someone', 'spacious', 'staff', 'stain', 'star', 'station', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'unfriendly', 'unhelpful', 'view', 'visit', 'walk', 'wall', 'water', 'website', 'well', 'won

In [8]:
# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION - CHI SQUARE
# Runs bag-of-words -> chi-square selection -> TF-IDF -> random forest on the shared split
# and keeps the true and predicted test labels.
expectedrf_wfs, predictedrf_wfs = rf_wfs(x_train, x_test, y_train, y_test)

Results of Random Forest Classifier with chi2 Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'bug', 'call', 'card', 'carpet', 'change', 'charge', 'check', 'com', 'come', 'comfortable', 'complain', 'condition', 'could', 'curtain', 'dirty', 'disappoint', 'disgust', 'door', 'email', 'even', 'excellent', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'guests', 'hear', 'helpful', 'horrible', 'key', 'know', 'leave', 'like', 'lock', 'look', 'lovely', 'manager', 'money', 'move', 'never', 'nice', 'night', 'noise', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'phone', 'picture', 'pm', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shower', 'sleep', 'smell', 'smoke', 'someone', 'stain', 'star', 'terrible', 'think', 'tiny', 'toilet', 'towel', 'try', 'us', 'view', 'wait', 'wall', 'water', 'window', 'work', 'worse', 'wor

In [9]:
# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION - MUTUAL INFO
# NOTE(review): these are the same variable names the chi-square random forest cell assigns,
# so running this cell replaces the chi-square results held in them.
expectedrf_wfs, predictedrf_wfs = rf_wfs2(x_train, x_test, y_train, y_test)

Results of Random Forest Classifier with chi2 Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'beautiful', 'bed', 'book', 'break', 'breakfast', 'call', 'canal', 'carpet', 'charge', 'check', 'clean', 'come', 'comfortable', 'complain', 'could', 'delicious', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'first', 'floor', 'friendly', 'give', 'great', 'helpful', 'highly', 'horrible', 'key', 'leave', 'like', 'location', 'lock', 'look', 'love', 'lovely', 'manager', 'modern', 'money', 'move', 'never', 'nice', 'night', 'nothing', 'old', 'one', 'open', 'paper', 'pay', 'people', 'perfect', 'picture', 'poor', 'put', 'quiet', 'reception', 'receptionist', 'refund', 'refuse', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'shower', 'sleep', 'smell', 'someone', 'spa', 'spacious', 'staff', 'stain', 'star', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'view', 'wait', 'walk', 'wall', 'water', 'wonderful',

In [10]:
# RANDOM FOREST CLASSIFIER WITHOUT FEATURE SELECTION
# NOTE(review): the nofeatureselection() helper behind this call still applies
# SelectKBest(k=100) to the TF-IDF matrix, so this is not a fully unselected baseline.
expectedrf_nfs, predictedrf_nfs = rf_nfs(x_train, x_test, y_train, y_test)

Results of SVM Classifier without Feature Selection
Features:
 ['air', 'amaze', 'another', 'avoid', 'awful', 'bad', 'bar', 'beautiful', 'best', 'book', 'break', 'breakfast', 'buffet', 'call', 'canal', 'carpet', 'charge', 'city', 'clean', 'comfortable', 'complain', 'curtain', 'definitely', 'dirty', 'disgust', 'distance', 'door', 'easy', 'enjoy', 'even', 'excellent', 'fantastic', 'filthy', 'food', 'friendly', 'give', 'good', 'great', 'helpful', 'highly', 'horrible', 'leave', 'locate', 'location', 'lock', 'look', 'loud', 'love', 'lovely', 'modern', 'move', 'never', 'nice', 'night', 'old', 'open', 'paper', 'pay', 'perfect', 'picture', 'poor', 'put', 'quiet', 'receptionist', 'recommend', 'refund', 'refuse', 'restaurant', 'restaurants', 'room', 'rude', 'seem', 'sheet', 'shop', 'sleep', 'smell', 'someone', 'spacious', 'staff', 'stain', 'star', 'station', 'terrible', 'thank', 'tiny', 'toilet', 'towel', 'unfriendly', 'unhelpful', 'view', 'visit', 'walk', 'wall', 'water', 'website', 'well', 'won