In [1]:
import sklearn
import os 
import pandas as pd
import numpy as np
from statistics import mean

**IMPORTING FILES**

In [2]:
def filereader(path):
    """Read the first line of every file stored one folder level below ``path``.

    Parameters
    ----------
    path : str
        Directory whose sub-folders contain the text files. A trailing path
        separator is accepted but not required.

    Returns
    -------
    list of str
        The result of ``readline()`` for each file, visiting folders and files
        in sorted name order so the output is stable across runs and platforms.
    """
    list_of_comments = []
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        # Stray files next to the sub-folders cannot be listed, so skip them.
        if not os.path.isdir(folder_path):
            continue
        for filename in sorted(os.listdir(folder_path)):
            # The context manager closes each handle instead of leaking it.
            with open(os.path.join(folder_path, filename), 'r') as handle:
                list_of_comments.append(handle.readline())
    return list_of_comments

In [21]:
# Ott et al. (2011) - MTurk Hotel dataset
# All four review folders sit under the same root, so the root is written once.
op_spam_root = "C:\\Users\\alexa\\Desktop\\op_spam_dataset\\"
neg_pol_truthful = filereader(op_spam_root + "negative_polarity\\truthful_from_Web\\")
neg_pol_deceptive = filereader(op_spam_root + "negative_polarity\\deceptive_from_MTurk\\")
pos_pol_truthful = filereader(op_spam_root + "positive_polarity\\truthful_from_TripAdvisor\\")
pos_pol_deceptive = filereader(op_spam_root + "positive_polarity\\deceptive_from_MTurk\\")

In [22]:
from statistics import stdev, mean

# Character length of every truthful negative review.
newlist = [len(review) for review in neg_pol_truthful]

# Mean and sample standard deviation of those lengths.
print(mean(newlist))
print(stdev(newlist))

964.32
543.616685129216


In [3]:
# Mukherjee et al. (2013) - Yelp Hotel dataset
df = pd.read_csv(r'C:\Users\alexa\Desktop\YelpReviews.csv')
# Shuffle with a fixed seed: the truthful subsample below is "the first 780 rows
# after shuffling", so an unseeded shuffle would give different data on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


def _review_texts(frame):
    """Return the 'reviewContent' column of ``frame`` as a list of strings,
    with non-breaking spaces (U+00A0) replaced by ordinary spaces."""
    return [str(review).replace(u'\xa0', u' ') for review in frame['reviewContent']]


# creating deceptive lists (rating > 2 is treated as positive, rating < 3 as negative)
deceptive = df[df['flagged']=='Y']
pos_dec = deceptive[deceptive['rating'] > 2]
neg_dec = deceptive[deceptive['rating'] < 3]
pos_pol_deceptive = _review_texts(pos_dec)
neg_pol_deceptive = _review_texts(neg_dec)

# creating truthful lists
# First 780 rows of the shuffled frame, i.e. a random subsample.
# NOTE(review): 780 is presumably the number of deceptive reviews (balanced dataset) - confirm.
truthful = df[df['flagged']=='N'][:780]
pos_tru = truthful[truthful['rating'] > 2]
neg_tru = truthful[truthful['rating'] < 3]
pos_pol_truthful = _review_texts(pos_tru)
neg_pol_truthful = _review_texts(neg_tru)

In [9]:
# Number of reviews in each of the four polarity/veracity lists, one per line.
for reviews in (neg_pol_deceptive, neg_pol_truthful, pos_pol_deceptive, pos_pol_truthful):
    print(len(reviews))

311
271
489
529


In [8]:
# Rayana (2015) - Yelp Restaurant dataset

#importing content
# Read as bytes and decode per line; the context manager closes the file afterwards.
with open("reviewContent", "rb") as content_file:
    content = content_file.readlines()
newlist = []
for raw_line in content[:5000]:  # only the first 5000 lines are used
    splitted = raw_line.decode("utf-8").split('\t')
    # str.replace returns a new string, so the cleaned text must be stored back;
    # calling it without assignment left the non-breaking spaces in place.
    splitted[3] = splitted[3].replace(u'\xa0', u' ')
    newlist += [splitted]

#importing labels
with open("metadata", "rb") as metadata_file:
    content2 = metadata_file.readlines()
newlist2 = [raw_line.decode("utf-8").split('\t') for raw_line in content2[:5000]]

#creating dataframe
# Keep only tab-separated field 3 of the content file (the review text).
df = pd.DataFrame(newlist)
df = df.drop(columns=[0,1,2])  # keyword form: the positional axis argument was removed in pandas 2.0
df.columns = ['content']

# Keep fields 2 and 3 of the metadata file (rating and label).
df2 = pd.DataFrame(newlist2)
df2 = df2.drop(columns=[0,1,4])
df2.columns = ['rating', 'label']
# Joined on the row index. NOTE(review): assumes both files list the same reviews in the same order - confirm.
df = df.join(df2)
# Seeded shuffle so the [:800] subsamples below are reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.astype({"rating": float})

#creating deceptive lists
deceptive = df.loc[df['label'] == '-1'][:800] # shuffled before, so a random subsample
pos_dec = deceptive[deceptive['rating'] > 2]
neg_dec = deceptive[deceptive['rating'] < 3]
pos_pol_deceptive = [str(review) for review in pos_dec['content']]
neg_pol_deceptive = [str(review) for review in neg_dec['content']]

# creating truthful lists
truthful = df.loc[df['label'] == '1'][:800]
pos_tru = truthful[truthful['rating'] > 2]
neg_tru = truthful[truthful['rating'] < 3]
pos_pol_truthful = [str(review) for review in pos_tru['content']]
neg_pol_truthful = [str(review) for review in neg_tru['content']]

**PREPROCESSING**

In [221]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from random import shuffle

In [222]:
def preprocessing(text, pos_tags=False):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps: sentence/word tokenisation, lower-casing, English stopword removal,
    removal of tokens shorter than three characters, WordNet lemmatisation and,
    optionally, POS tagging.

    Parameters
    ----------
    text : str
        Raw review text.
    pos_tags : bool, default False
        When True every token is emitted as ``token_TAG`` using ``nltk.pos_tag``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # tokenize into words and lower-case immediately: the stopword list is
    # lower-case, so filtering before lower-casing let "The", "And", ... through
    tokens = [word.lower()
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    # remove stopwords (a set gives constant-time membership tests)
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three characters
    tokens = [word for word in tokens if len(word) >= 3]

    # lemmatizing
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    # includes POS_tags
    if pos_tags:
        tokens = ['_'.join(tagged) for tagged in nltk.pos_tag(tokens)]

    return ' '.join(tokens)

def preprocesser(list_of_texts):
    """Apply ``preprocessing`` (default options) to each text and return the results as a new list."""
    return [preprocessing(text) for text in list_of_texts]

In [223]:
# Creating X1: the preprocessed text of each of the four review lists
X1_neg_truthful = list(preprocesser(neg_pol_truthful))
X1_neg_deceptive = list(preprocesser(neg_pol_deceptive))
X1_pos_truthful = list(preprocesser(pos_pol_truthful))
X1_pos_deceptive = list(preprocesser(pos_pol_deceptive))

In [224]:
# Number of preprocessed reviews per group, one per line.
for processed in (X1_neg_truthful, X1_pos_truthful, X1_neg_deceptive, X1_pos_deceptive):
    print(len(processed))

243
557
319
481


**CREATING X2**

In [225]:
# Pair every preprocessed review with its polarity flag: 1 = negative, 0 = positive.
negpoltruth = list(zip([1] * len(X1_neg_truthful), X1_neg_truthful))
pospoltruth = list(zip([0] * len(X1_pos_truthful), X1_pos_truthful))
negpoldecep = list(zip([1] * len(X1_neg_deceptive), X1_neg_deceptive))
pospoldecep = list(zip([0] * len(X1_pos_deceptive), X1_pos_deceptive))

In [226]:
# Both lists use the same group order: neg truthful, pos truthful, neg deceptive, pos deceptive.
all_files_labeled = [*negpoltruth, *pospoltruth, *negpoldecep, *pospoldecep]
# Raw (un-preprocessed) review texts, used for the surface statistics below.
all_files = [*neg_pol_truthful, *pos_pol_truthful, *neg_pol_deceptive, *pos_pol_deceptive]

In [227]:
# First element of every (polarity, text) pair.
polarity_values = [polarity for polarity, _ in all_files_labeled]

In [228]:
# Whitespace-separated token count of each raw review.
no_words = list(map(len, map(str.split, all_files)))

In [229]:
# Number of distinct whitespace-separated tokens in each raw review.
no_unique_words = [len(set(words)) for words in map(str.split, all_files)]

In [230]:
# Sentences are delimited by '. ', so n separators yield n + 1 pieces.
no_sentences = [review.count('. ') + 1 for review in all_files]

In [231]:
# For each raw review: mean token count over its '. '-delimited sentences.
avg_no_words_per_sentence = [
    mean([len(sentence.split()) for sentence in review.split('. ')])
    for review in all_files
]

In [232]:
# Mean number of characters per whitespace-separated token in each raw review
# (the variable keeps its historical name because a DataFrame column uses it).
no_digits_per_word = []
for review in all_files:
    word_lengths = [len(word) for word in review.split()]
    # statistics.mean raises StatisticsError on an empty sequence, so a review
    # with no tokens (empty or whitespace-only) is recorded as 0 instead.
    no_digits_per_word += [mean(word_lengths) if word_lengths else 0]

In [233]:
# One row per review; the columns are the per-review statistics computed above.
feature_columns = dict(
    polarity=polarity_values,
    no_words=no_words,
    no_unique_words=no_unique_words,
    no_sentences=no_sentences,
    avg_no_words_per_sentence=avg_no_words_per_sentence,
    no_digits_per_word=no_digits_per_word,
)
DFX2 = pd.DataFrame(feature_columns)
DFX2.head()

Unnamed: 0,polarity,no_words,no_unique_words,no_sentences,avg_no_words_per_sentence,no_digits_per_word
0,1,173,116,9,19.222222,4.416185
1,1,39,37,3,13.0,5.25641
2,1,188,126,17,11.058824,4.239362
3,1,69,56,3,23.0,4.768116
4,1,242,164,20,12.1,4.541322


In [234]:
#creating X2
X2 = DFX2.to_numpy()

# The rows of X2 follow the group order neg truthful, pos truthful, neg deceptive,
# pos deceptive, so cutting at the cumulative group sizes recovers each group.
# The sizes are taken from the X1 lists, which works for every dataset.
group_sizes = [len(X1_neg_truthful), len(X1_pos_truthful), len(X1_neg_deceptive)]
X2_neg_truthful, X2_pos_truthful, X2_neg_deceptive, X2_pos_deceptive = np.split(
    X2, np.cumsum(group_sizes)
)


In [235]:
# combining X1 and X2 and labelling data: 0 for truthful, 1 for deceptive
# Each entry is (preprocessed text, statistics row, label).
corpus_neg_truthful = [(text, stats, 0) for text, stats in zip(X1_neg_truthful, X2_neg_truthful)]
corpus_neg_deceptive = [(text, stats, 1) for text, stats in zip(X1_neg_deceptive, X2_neg_deceptive)]
corpus_pos_truthful = [(text, stats, 0) for text, stats in zip(X1_pos_truthful, X2_pos_truthful)]
corpus_pos_deceptive = [(text, stats, 1) for text, stats in zip(X1_pos_deceptive, X2_pos_deceptive)]

In [236]:
# Single list holding all four groups; it is shuffled later, before training.
corpus = [*corpus_neg_truthful, *corpus_neg_deceptive, *corpus_pos_truthful, *corpus_pos_deceptive]

**CO-TRAINING**

In [237]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from co_training.sklearn_cotraining.classifiers import *
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from statistics import mean, stdev
from datetime import datetime

In [238]:
# shuffle data and unpack X1 and X2
from random import shuffle
shuffle(corpus)  # in place: re-running this cell reshuffles again
X1 = [text for text, _, _ in corpus]
# Rows are converted to plain lists first and then stacked into one 2-D array.
X2 = np.array([stats.tolist() for _, stats, _ in corpus]) #formatting
labels_y = [label for _, _, label in corpus]

In [241]:
def main_cotraining_CV(ratio, vectorizertype, ngram, nsplits, X1=None,
                       results_path='Co_Training_results_dataset3.txt'):
    """
    Main function with cross-validation included.

    Vectorises the text view, keeps the 1000 features with the highest chi-square
    score, then runs K-fold cross-validation in which a fraction of the training
    labels is hidden (set to -1) and a CoTrainingClassifier is fitted on the two
    views (text features and the global ``X2``) for each of SVM, Naive Bayes and
    Random Forest. Averaged metrics for class 1 are appended to ``results_path``.

    Parameters:
        ratio (share of training labels to remove) = float (0.80 / 0.90 / 0.95);
            also selects the hyper-parameters of the base classifiers
        vectorizertype (word counts or TF-IDF values) = CountVectorizer / TfidfVectorizer
        ngram (upper n-gram bound) = int (1=unigram, 2=unigrams+bigrams)
        nsplits (cross validation splits) = int (number of splits)
        X1 (preprocessed texts) = list of str; when omitted the module-level ``X1``
            is looked up at call time
        results_path (report file, opened in append mode) = str

    Reads the module-level ``X2`` (second view) and ``labels_y`` (0/1 labels),
    which must be aligned row by row with ``X1``. Returns nothing.
    """
    # Resolve the default at call time. A definition-time default (X1=X1) keeps the
    # list that existed when the cell was run, which goes out of step with X2 and
    # labels_y as soon as the corpus is reshuffled without redefining the function.
    if X1 is None:
        X1 = globals()['X1']
    y = np.asarray(labels_y)
    if len(X1) != len(y):
        raise ValueError('X1 and labels_y must have the same length')

    # Document-term matrix as a dense array (min_df/max_df prune rare and very common terms).
    vectorizer = vectorizertype(min_df=2, max_df=0.5, ngram_range=(1,ngram))
    X = vectorizer.fit_transform(X1).toarray()

    # feature selection: indices of the 1000 highest chi-square scores
    # NOTE(review): scores are computed on all rows with all labels, i.e. before the
    # folds are made and before labels are hidden - confirm this leakage is acceptable.
    F, _ = chi2(X, y)
    top_columns = np.argsort(F)[::-1][:1000]
    X1_features = X[:, top_columns]

    # random splits X1a and X1b: shuffle the columns in place, then cut in two halves.
    # The halves are the alternative pair of views ("pick X1a or X1" below); the fold
    # loop currently uses the full text matrix together with X2.
    # array_split tolerates an odd number of columns, where hsplit would raise.
    np.random.shuffle(X1_features.T)
    X1a, X1b = np.array_split(X1_features, 2, axis=1)

    # initialize base classifiers; hyper-parameters depend on the ratio, with a
    # fallback for any other value
    svm_c = {0.95: 4, 0.90: 9, 0.80: 4}.get(ratio, 9)
    nb_alpha = {0.95: 0.2, 0.90: 0.7, 0.80: 0.4}.get(ratio, 0.7)
    rf_params = {
        0.95: dict(max_depth=120, min_samples_split=30),
        0.90: dict(max_depth=90, min_samples_split=40),
        0.80: dict(max_depth=140, min_samples_split=30),
    }.get(ratio, dict(max_depth=30, min_samples_split=20))
    base_classifiers = [
        ('SVM', SVC(probability=True, kernel='rbf', gamma='scale', C=svm_c)),
        ('NB', MultinomialNB(alpha=nb_alpha)),
        ('RF', RandomForestClassifier(n_estimators=150, **rf_params)),
    ]

    # initialize results dictionary: per model, one value per fold
    results = {
        name: {'accuracy': [], '1': {'precision': [], 'recall': [], 'f1-score': []}}
        for name, _ in base_classifiers
    }

    # KFold crossvalidation
    kf = KFold(n_splits=nsplits, shuffle=False)
    for train_index, test_index in kf.split(X1_features):
        X1_train, X1_test = X1_features[train_index], X1_features[test_index]     # pick X1a or X1
        X2_train, X2_test = X2[train_index], X2[test_index]                       # pick X1b or X2
        y_test = y[test_index]

        # removing labels: the same seeded mask generator is recreated for every fold
        rng = np.random.RandomState(42)
        y_train = np.copy(y[train_index])
        y_train[rng.rand(len(y_train)) < ratio] = -1

        for name, base_clf in base_classifiers:
            # train model
            model = CoTrainingClassifier(base_clf)
            # Each model gets its own copy of the partially labelled vector, so a fit
            # that writes pseudo-labels into its argument cannot change the starting
            # point of the next model.
            # NOTE(review): sklearn_cotraining's fit appears to do exactly that - confirm.
            model.fit(X1_train, X2_train, y_train.copy())

            # evaluate model
            y_pred = model.predict(X1_test, X2_test)
            report = classification_report(y_test, y_pred, output_dict=True)
            results[name]['accuracy'] += [report['accuracy']]
            for metric in ('precision', 'recall', 'f1-score'):
                results[name]['1'][metric] += [report['1'][metric]]

    # Build the whole report first so a failure while aggregating cannot leave a
    # half-written entry in the file.
    lines = ['___ratio:' + str(ratio) + '|ngram:' + str(ngram) + '|vectorizer:'+ str(vectorizertype) + '___\n']
    for name, _ in base_classifiers:
        scores = results[name]
        lines += [
            name + ':\n',
            'accuracy: ' + str(round(mean(scores['accuracy']),2)) + '\n',
            'stddev: ' + str(stdev(scores['accuracy'])) + '\n',
            'precision: ' + str(round(mean(scores['1']['precision']),2)) + '\n',
            'recall: ' + str(round(mean(scores['1']['recall']),2)) + '\n',
            'F1 :' + str(round(mean(scores['1']['f1-score']),2)) + '\n\n',
        ]
    lines.append(str(datetime.now()) + '\n\n\n')

    with open(results_path, 'a') as file:
        file.writelines(lines)

In [242]:
# Every vectorizer x ratio combination; the printed counter (4..9) marks each finished run.
run_number = 4
for vectorizer_class in (CountVectorizer, TfidfVectorizer):
    for unlabeled_ratio in (0.80, 0.90, 0.95):
        main_cotraining_CV(ratio=unlabeled_ratio, vectorizertype=vectorizer_class, ngram=2, nsplits=5)
        print(run_number)
        run_number += 1

4
5
6
7
8
9


In [240]:
# CountVectorizer runs only, one per ratio. Each call appends another entry to the results file.
for run_number, unlabeled_ratio in enumerate((0.80, 0.90, 0.95), start=4):
    main_cotraining_CV(ratio=unlabeled_ratio, vectorizertype=CountVectorizer, ngram=2, nsplits=5)
    print(run_number)

4
5
6
