In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import csv
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
import itertools

In [None]:
# Load the preprocessed dataset and print a quick overview of it.
df = pd.read_csv('/../preprocessed.csv')
# Print the shape explicitly: a bare `df.shape` that is not the last
# expression of a cell is evaluated and then silently discarded.
print(df.shape)
print(df.label.value_counts())
print(df.head())

In [None]:
# Select up to 6450 positive and up to 3225 negative reviews.
# NOTE(review): labels are compared against the strings "1"/"0"; if
# `read_csv` inferred a numeric dtype for `label`, both masks select nothing.
# Confirm the column really holds strings.
pos_df = df.loc[df.label == "1", :][:6450]
neg_df = df.loc[df.label == "0", :][:3225]
df = pd.concat([pos_df, neg_df], ignore_index=True)
# Shuffle with a fixed seed so the row order, and therefore the seeded
# train/test split further down, is identical on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Print the shape explicitly; a bare `df.shape` mid-cell displays nothing.
print(df.shape)
print(df.label.value_counts())

In [None]:
# Data distribution: share of each label in the sampled dataset.
label_counts = df.label.value_counts()
# `value_counts()` orders its rows by frequency, so the captions and colours
# are derived from its index instead of assuming the positive class is first.
class_captions = {"1": "1 (Positive)", "0": "0 (Negative)"}
class_colors = {"1": "yellowgreen", "0": "lightskyblue"}
pie_label = [class_captions.get(str(value), str(value)) for value in label_counts.index]
pie_colors = [class_colors.get(str(value), "lightgrey") for value in label_counts.index]
plt.pie(label_counts, labels=pie_label, colors=pie_colors, startangle=90, autopct='%.1f%%')
plt.title("Label distribution")
plt.show()

# Feature Extraction

In [None]:
#Bag of Words
def bow(train, test):
    """Build bag-of-words count matrices for the train and test documents.

    The vocabulary is learned from ``train`` only and then applied to ``test``.

    Args:
        train: Iterable of training documents (strings).
        test: Iterable of test documents (strings).

    Returns:
        Tuple ``(train_data, test_data, feature_names)`` where the first two
        are the count matrices produced by ``CountVectorizer`` and
        ``feature_names`` is the list of vocabulary terms, one per column.
    """
    vectorizer = CountVectorizer(analyzer='word')
    train_data = vectorizer.fit_transform(train)
    test_data = vectorizer.transform(test)
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return train_data, test_data, feature_names

# Feature Selection

In [None]:
# Feature selection according to chi2 method
def ch2(x_train, y_train, x_test, feature_names, k=10):
    """Keep the ``k`` features with the highest chi-square score.

    The selector is fitted on the training data only and then applied to the
    test data.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Test feature matrix with the same columns as ``x_train``.
        feature_names: Sequence of column names, indexable by column position.
        k: Number of features to keep (defaults to 10, the value that was
            previously hard-coded).

    Returns:
        Tuple ``(x_train_features, x_test_features, feature_indices,
        selected_features)``: the reduced matrices, the indices of the kept
        columns and their names.
    """
    # Named `selector` so the local does not shadow this function's own name.
    selector = SelectKBest(chi2, k=k)
    x_train_features = selector.fit_transform(x_train, y_train)
    x_test_features = selector.transform(x_test)
    feature_indices = selector.get_support(indices=True)
    # Reuse the indices computed above rather than querying the selector twice.
    selected_features = [feature_names[i] for i in feature_indices]
    return x_train_features, x_test_features, feature_indices, selected_features

In [None]:
# Feature selection according to mutual_info_classif
def mutual_info(x_train, y_train, x_test, feature_names, k=10):
    """Keep the ``k`` features with the highest mutual-information score.

    The selector is fitted on the training data only and then applied to the
    test data.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Test feature matrix with the same columns as ``x_train``.
        feature_names: Sequence of column names, indexable by column position.
        k: Number of features to keep (defaults to 10, the value that was
            previously hard-coded).

    Returns:
        Tuple ``(x_train_features, x_test_features, feature_indices,
        selected_features)``: the reduced matrices, the indices of the kept
        columns and their names.
    """
    selector = SelectKBest(mutual_info_classif, k=k)
    x_train_features = selector.fit_transform(x_train, y_train)
    x_test_features = selector.transform(x_test)
    feature_indices = selector.get_support(indices=True)
    # Reuse the indices computed above rather than querying the selector twice.
    selected_features = [feature_names[i] for i in feature_indices]
    return x_train_features, x_test_features, feature_indices, selected_features

# TF-IDF Vectorization

In [None]:
#TFIDF VECTORIZATION
def tfidf(train, test):
    """Re-weight count matrices with TF-IDF.

    The transformer is fitted on ``train`` and the fitted weights are then
    applied to ``test``.

    Args:
        train: Training count matrix.
        test: Test count matrix with the same columns as ``train``.

    Returns:
        Tuple ``(train_tfidf, test_tfidf)`` of the transformed matrices.
    """
    weigher = TfidfTransformer()
    # The tuple is evaluated left to right, so fitting happens before `test`
    # is transformed.
    return weigher.fit_transform(train), weigher.transform(test)

In [None]:
# NO FEATURE SELECTION
def nofeatureselection(x_train, x_test, y_train, k=10):
    """TF-IDF vectorize raw documents without a chi2 / mutual-info selector.

    Despite its name, this still reduces the TF-IDF matrix to ``k`` columns
    using ``SelectKBest`` with its default scoring function. Pass ``k="all"``
    to keep every feature.

    Args:
        x_train: Iterable of training documents (strings).
        x_test: Iterable of test documents (strings).
        y_train: Training labels, used to score the features.
        k: Number of features to keep, or ``"all"`` (defaults to 10).

    Returns:
        Tuple ``(x_train_features, x_test_features, feature_names)``: the
        reduced train/test matrices and the names of the kept columns.
    """
    # TF-IDF on the raw text; the vocabulary is learned from the training set only.
    vectorizer = TfidfVectorizer(use_idf=True,
                                 analyzer='word',
                                 strip_accents='unicode')
    tfidf_train = vectorizer.fit_transform(x_train)
    tfidf_test = vectorizer.transform(x_test)
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        all_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        all_feature_names = vectorizer.get_feature_names()
    # The keyword is lowercase `k`; the former `K=10` raised a TypeError.
    ftrs = SelectKBest(k=k)
    x_train_features = ftrs.fit_transform(tfidf_train, y_train)
    x_test_features = ftrs.transform(tfidf_test)
    feature_names = [all_feature_names[i] for i in ftrs.get_support(indices=True)]
    return x_train_features, x_test_features, feature_names

# Machine Learning Classification Algorithms

In [None]:
#MULTINOMIAL NAIVE BAYES
def Multinomial(train_tfidf, y_train, test_tfidf, y_test):
    """Fit a Multinomial Naive Bayes model and print its evaluation.

    Args:
        train_tfidf: Training feature matrix.
        y_train: Training labels.
        test_tfidf: Test feature matrix.
        y_test: True labels of the test set.

    Returns:
        Tuple ``(expected, predicted)``: ``y_test`` unchanged and the labels
        predicted for ``test_tfidf``.
    """
    classifier = MultinomialNB()
    classifier.fit(train_tfidf, y_train)
    predicted = classifier.predict(test_tfidf)
    # Classification report first, then the confusion matrix.
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(y_test, predicted))
    return y_test, predicted

In [None]:
def svm(train_tfidf, y_train, test_tfidf, y_test):
    """Fit a linear-kernel SVC (C=1.0) and print its evaluation.

    Args:
        train_tfidf: Training feature matrix.
        y_train: Training labels.
        test_tfidf: Test feature matrix.
        y_test: True labels of the test set.

    Returns:
        Tuple ``(expected, predicted)``: ``y_test`` unchanged and the labels
        predicted for ``test_tfidf``.
    """
    linear_svc = SVC(kernel='linear', C=1.0)
    linear_svc.fit(train_tfidf, y_train)
    predicted = linear_svc.predict(test_tfidf)
    report_text = metrics.classification_report(y_test, predicted)
    print(report_text)
    print(metrics.confusion_matrix(y_test, predicted))
    return y_test, predicted

In [None]:
def randomforest(train_tfidf, y_train, test_tfidf, y_test, random_state=42):
    """Fit a random forest classifier and print its evaluation.

    Args:
        train_tfidf: Training feature matrix.
        y_train: Training labels.
        test_tfidf: Test feature matrix.
        y_test: True labels of the test set.
        random_state: Seed forwarded to ``RandomForestClassifier`` so repeated
            runs give the same forest; pass ``None`` for an unseeded forest.

    Returns:
        Tuple ``(expected, predicted)``: ``y_test`` unchanged and the labels
        predicted for ``test_tfidf``.
    """
    # An unseeded forest gives different metrics on every run, which makes the
    # comparison between pipelines irreproducible.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train_tfidf, y_train)
    expected = y_test
    predicted = clf.predict(test_tfidf)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    return expected, predicted

In [None]:
 # MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION METHOD CHI SQUARE
def naivebayes_withfs(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> chi-square selection -> TF-IDF -> Multinomial NB.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``Multinomial``.
    """
    # Feature extraction: raw term counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: chi-square (the column indices are not needed here).
    train_selected, test_selected, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(train_selected, test_selected)
    print("Results of Multinomial Naive Bayes Classifier with Feature Selection")
    print("Features:\n", selected_features)
    # Model: Multinomial Naive Bayes.
    return Multinomial(train_tfidf, y_train, test_tfidf, y_test)

In [None]:
# MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION METHOD MUTUAL INFORMATION
def naivebayes_withfs2(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> mutual-information selection -> TF-IDF -> Multinomial NB.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``Multinomial``.
    """
    # Feature extraction: raw term counts.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: mutual information (the column indices are not needed here).
    train_selected, test_selected, _, selected_features = mutual_info(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(train_selected, test_selected)
    print("Results of Multinomial Naive Bayes Classifier with Feature Selection")
    print("Features:\n", selected_features)
    # Model: Multinomial Naive Bayes.
    return Multinomial(train_tfidf, y_train, test_tfidf, y_test)

In [None]:
# MULTINOMIAL NAIVE BAYES WITHOUT FEATURE SELECTION
def naivebayes_nfs(x_train, x_test, y_train, y_test):
    """Run the ``nofeatureselection`` TF-IDF step followed by Multinomial NB.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``Multinomial``.
    """
    # TF-IDF vectorization of the raw documents.
    prepared = nofeatureselection(x_train, x_test, y_train)
    train_matrix, test_matrix, feature_names = prepared
    print("Results of Multinomial Naive Bayes Classifier without Feature Selection")
    print("Features:\n", feature_names)
    # Model: Multinomial Naive Bayes.
    return Multinomial(train_matrix, y_train, test_matrix, y_test)

In [None]:
# SVM WITH FEATURE SELECTION METHOD CHI SQUARE
def svm_wfs(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> chi-square selection -> TF-IDF -> SVM.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``svm``.
    """
    # Feature extraction: bag of words.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: chi-square (the column indices are not needed here).
    train_selected, test_selected, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(train_selected, test_selected)
    print("Results of SVM with Feature Selection")
    print("Features:\n", selected_features)
    # Model: SVM.
    return svm(train_tfidf, y_train, test_tfidf, y_test)

In [None]:
# SVM WITH FEATURE SELECTION METHOD MUTUAL INFORMATION
def svm_wfs2(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> mutual-information selection -> TF-IDF -> SVM.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``svm``.
    """
    # Feature extraction: bag of words.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: mutual information (the column indices are not needed here).
    train_selected, test_selected, _, selected_features = mutual_info(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(train_selected, test_selected)
    print("Results of SVM with Feature Selection")
    print("Features:\n", selected_features)
    # Model: SVM.
    return svm(train_tfidf, y_train, test_tfidf, y_test)

In [None]:
# SVM WITHOUT FEATURE SELECTION
def svm_nfs(x_train, x_test, y_train, y_test):
    """Run the ``nofeatureselection`` TF-IDF step followed by the SVM model.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``svm``.
    """
    # TF-IDF vectorization of the raw documents.
    prepared = nofeatureselection(x_train, x_test, y_train)
    train_matrix, test_matrix, feature_names = prepared
    print("Results of SVM Classifier without Feature Selection")
    print("Features:\n", feature_names)
    # Model: SVM.
    return svm(train_matrix, y_train, test_matrix, y_test)

In [None]:
# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION METHOD CHI SQUARE
def rf_wfs(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> chi-square selection -> TF-IDF -> random forest.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``randomforest``.
    """
    # Feature extraction: bag of words.
    train_counts, test_counts, vocabulary = bow(x_train, x_test)
    # Feature selection: chi-square (the column indices are not needed here).
    train_selected, test_selected, _, selected_features = ch2(
        train_counts, y_train, test_counts, vocabulary
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(train_selected, test_selected)
    print("Results of Random Forest Classifier with chi2 Feature Selection")
    print("Features:\n", selected_features)
    # Model: random forest classifier.
    return randomforest(train_tfidf, y_train, test_tfidf, y_test)

In [None]:
# RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION METHOD MUTUAL INFORMATION
def rf_wfs2(x_train, x_test, y_train, y_test):
    """Run bag-of-words -> mutual-information selection -> TF-IDF -> random forest.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``randomforest``.
    """
    # Feature extraction: bag of words.
    x_train, x_test, feature_names = bow(x_train, x_test)
    # Feature selection: mutual information (the column indices are not needed here).
    x_train_features, x_test_features, _, selected_features = mutual_info(
        x_train, y_train, x_test, feature_names
    )
    # TF-IDF weighting of the selected columns.
    train_tfidf, test_tfidf = tfidf(x_train_features, x_test_features)
    # The header used to say "chi2", which mislabelled this pipeline's output.
    print("Results of Random Forest Classifier with Mutual Information Feature Selection")
    print("Features:\n", selected_features)
    # Model: random forest classifier.
    expectedrf_wfs, predictedrf_wfs = randomforest(train_tfidf, y_train, test_tfidf, y_test)
    return expectedrf_wfs, predictedrf_wfs

In [None]:
# RANDOM FOREST WITHOUT FEATURE SELECTION
def rf_nfs(x_train, x_test, y_train, y_test):
    """Run the ``nofeatureselection`` TF-IDF step followed by the random forest.

    Args:
        x_train: Training documents.
        x_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ``(expected, predicted)`` pair returned by ``randomforest``.
    """
    # TF-IDF vectorization of the raw documents.
    x_train_features, x_test_features, feature_names = nofeatureselection(x_train, x_test, y_train)
    # The header used to say "SVM Classifier", copied from the SVM pipeline.
    print("Results of Random Forest Classifier without Feature Selection")
    print("Features:\n", feature_names)
    # Model: random forest classifier.
    expectedrf_nfs, predictedrf_nfs = randomforest(x_train_features, y_train, x_test_features, y_test)
    return expectedrf_nfs, predictedrf_nfs

In [None]:
# Model input is the `content` column, the target is the `label` column.
x, y = df['content'], df['label']
# 80/20 train/test split with a fixed `random_state`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=42,
)
print(df.head())

MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION METHOD CHI SQUARE

In [None]:
# Multinomial Naive Bayes on chi-square-selected features.
expectednb_wfs, predictednb_wfs = naivebayes_withfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

MULTINOMIAL NAIVE BAYES WITH FEATURE SELECTION METHOD MUTUAL INFO

In [None]:
# Multinomial Naive Bayes on mutual-information-selected features.
expectednb_wfs2, predictednb_wfs2 = naivebayes_withfs2(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

MULTINOMIAL NAIVE BAYES WITHOUT FEATURE SELECTION

In [None]:
# Multinomial Naive Bayes on the `nofeatureselection` TF-IDF features.
expectednb_nfs, predictednb_nfs = naivebayes_nfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

SVM WITH FEATURE SELECTION METHOD CHI SQUARE

In [None]:
# SVM on chi-square-selected features.
expectedsvm_wfs, predictedsvm_wfs = svm_wfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

SVM WITH FEATURE SELECTION METHOD MUTUAL INFORMATION

In [None]:
# SVM on mutual-information-selected features.
expectedsvm_wfs2, predictedsvm_wfs2 = svm_wfs2(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

SVM WITHOUT FEATURE SELECTION

In [None]:
# SVM on the `nofeatureselection` TF-IDF features.
expectedsvm_nfs, predictedsvm_nfs = svm_nfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION METHOD CHI SQUARE

In [None]:
# Random forest on chi-square-selected features.
expectedrf_wfs, predictedrf_wfs = rf_wfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

RANDOM FOREST CLASSIFIER WITH FEATURE SELECTION METHOD MUTUAL INFORMATION

In [None]:
# Random forest on mutual-information-selected features.
expectedrf_wfs2, predictedrf_wfs2 = rf_wfs2(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

RANDOM FOREST CLASSIFIER WITHOUT FEATURE SELECTION

In [None]:
# Random forest on the `nofeatureselection` TF-IDF features.
expectedrf_nfs, predictedrf_nfs = rf_nfs(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)