In [1]:
import sklearn
import os 
import pandas as pd
import numpy as np

**IMPORTING FILES**

In [2]:
def filereader(path):
    """Collect the first line of every file located one folder level below `path`.

    Parameters:
        path (str): directory whose sub-folders contain the review files.
            A trailing path separator is accepted but not required.

    Returns:
        list of str: one entry per file, holding that file's first line exactly
        as returned by ``readline()`` (a trailing newline is kept).
        Entries follow ``os.listdir`` order.
    """
    list_of_comments = []
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        # Skip stray files that sit next to the sub-folders; listing them would raise.
        if not os.path.isdir(folder_path):
            continue
        for filename in os.listdir(folder_path):
            # `with` closes every handle (previously one handle leaked per file).
            with open(os.path.join(folder_path, filename), 'r') as handle:
                list_of_comments.append(handle.readline())
    return list_of_comments

In [4]:
# Ott et al. (2011) - MTurk Hotel dataset
# Single dataset root; set the OP_SPAM_DIR environment variable to point at another copy
# instead of editing four hard-coded paths.
OP_SPAM_DIR = os.environ.get("OP_SPAM_DIR", "C:\\Users\\alexa\\Desktop\\op_spam_dataset")

# The trailing "" makes os.path.join end each path with a separator, which
# filereader's path handling relies on.
neg_pol_truthful = filereader(os.path.join(OP_SPAM_DIR, "negative_polarity", "truthful_from_Web", ""))
neg_pol_deceptive = filereader(os.path.join(OP_SPAM_DIR, "negative_polarity", "deceptive_from_MTurk", ""))
pos_pol_truthful = filereader(os.path.join(OP_SPAM_DIR, "positive_polarity", "truthful_from_TripAdvisor", ""))
pos_pol_deceptive = filereader(os.path.join(OP_SPAM_DIR, "positive_polarity", "deceptive_from_MTurk", ""))

In [3]:
# Mukherjee et al. (2013) - Yelp Hotel dataset
df = pd.read_csv(r'C:\Users\alexa\Desktop\YelpReviews.csv')
# Shuffle with a fixed seed so the truthful subset taken below is the same on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# creating deceptive lists (flagged == 'Y'); rating > 2 goes to the positive list, rating < 3 to the negative list
deceptive = df[df['flagged'] == 'Y']
pos_dec = deceptive[deceptive['rating'] > 2]
neg_dec = deceptive[deceptive['rating'] < 3]
# str() guards against non-string cells; non-breaking spaces become plain spaces
pos_pol_deceptive = [str(review).replace(u'\xa0', u' ') for review in pos_dec['reviewContent']]
neg_pol_deceptive = [str(review).replace(u'\xa0', u' ') for review in neg_dec['reviewContent']]

# creating truthful lists (flagged == 'N'); the first 780 rows of the shuffled frame
# form the random selection for a balanced dataset
truthful = df[df['flagged'] == 'N'][:780]
pos_tru = truthful[truthful['rating'] > 2]
neg_tru = truthful[truthful['rating'] < 3]
pos_pol_truthful = [str(review).replace(u'\xa0', u' ') for review in pos_tru['reviewContent']]
neg_pol_truthful = [str(review).replace(u'\xa0', u' ') for review in neg_tru['reviewContent']]

In [11]:
# Rayana (2015) - Yelp Restaurant dataset

#importing content
# Only the first 5000 lines are used. Each line is tab-separated; field 3 is
# the text that ends up in the 'content' column below.
with open("reviewContent", "rb") as content_file:
    content = content_file.readlines()
newlist = []
for raw_line in content[:5000]:
    splitted = raw_line.decode("utf-8").split('\t')
    # str.replace returns a new string, so the result must be stored back;
    # the previous bare call left the non-breaking spaces in place.
    splitted[3] = splitted[3].replace(u'\xa0', u' ')
    newlist += [splitted]

#importing labels
with open("metadata", "rb") as metadata_file:
    content2 = metadata_file.readlines()
newlist2 = [raw_line.decode("utf-8").split('\t') for raw_line in content2[:5000]]

#creating dataframe
df = pd.DataFrame(newlist)
# axis is passed by keyword: pandas 2 no longer accepts it positionally
df = df.drop([0, 1, 2], axis=1)
df.columns = ['content']

df2 = pd.DataFrame(newlist2)
df2 = df2.drop([0, 1, 4], axis=1)
df2.columns = ['rating', 'label']
# rows are matched by position (index); assumes both files list reviews in the same order - TODO confirm
df = df.join(df2)
# shuffle with a fixed seed and reset index, so the [:800] selections below are repeatable
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.astype({"rating": float})

#creating deceptive lists (label '-1'): at most 800 rows of the shuffled frame
deceptive = df.loc[df['label'] == '-1'][:800]
pos_dec = deceptive[deceptive['rating'] > 2]
neg_dec = deceptive[deceptive['rating'] < 3]
pos_pol_deceptive = [str(review) for review in pos_dec['content']]
neg_pol_deceptive = [str(review) for review in neg_dec['content']]

# creating truthful lists (label '1'): at most 800 rows of the shuffled frame
truthful = df.loc[df['label'] == '1'][:800]
pos_tru = truthful[truthful['rating'] > 2]
neg_tru = truthful[truthful['rating'] < 3]
pos_pol_truthful = [str(review) for review in pos_tru['content']]
neg_pol_truthful = [str(review) for review in neg_tru['content']]

**PREPROCESSING**

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from random import shuffle

In [13]:
def preprocessing(text, pos_tags=False):
    """Normalise one review into a single space-separated string of tokens.

    Steps, in order: tokenize, lowercase, drop English stopwords, drop tokens
    shorter than three characters, lemmatize, and optionally attach POS tags.

    Parameters:
        text (str): raw review text.
        pos_tags (bool): when True, every token is emitted as ``token_TAG``
            using ``nltk.pos_tag`` on the already-normalised tokens.

    Returns:
        str: the processed tokens joined by single spaces ('' if none remain).
    """
    # tokenize into words and remove capitalization
    tokens = [word.lower()
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    # remove stopwords -- this runs after lowercasing because NLTK's English
    # stopword list is lowercase; filtering first let "The", "And", ... through.
    # A set gives constant-time membership tests.
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three characters
    tokens = [word for word in tokens if len(word) >= 3]

    # lemmatizing
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    # includes POS_tags
    if pos_tags:
        tokens = ['_'.join(tagged) for tagged in nltk.pos_tag(tokens)]

    return ' '.join(tokens)

def preprocesser(list_of_texts):
    """Apply `preprocessing` to every text and return the results as a new list."""
    return [preprocessing(raw_text) for raw_text in list_of_texts]

In [14]:
# Pair every preprocessed review with its class label.
TRUTHFUL, DECEPTIVE = 0, 1  # 0 for truthful, 1 for deceptive
corpus_neg_truthful = [(text, TRUTHFUL) for text in preprocesser(neg_pol_truthful)]
corpus_neg_deceptive = [(text, DECEPTIVE) for text in preprocesser(neg_pol_deceptive)]
corpus_pos_truthful = [(text, TRUTHFUL) for text in preprocesser(pos_pol_truthful)]
corpus_pos_deceptive = [(text, DECEPTIVE) for text in preprocesser(pos_pol_deceptive)]

In [15]:
# shuffle data instances
from random import seed, shuffle
corpus = corpus_neg_truthful + corpus_neg_deceptive + corpus_pos_truthful + corpus_pos_deceptive
# Fixed seed: the cross-validation uses KFold(shuffle=False), so this order
# decides the folds and has to be repeatable between runs.
seed(42)
shuffle(corpus)
# split the (review, label) pairs back into two parallel lists
reviews = [review for review, label in corpus]
labels_y = [label for review, label in corpus]

**LABEL PROPAGATION**

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from statistics import mean, stdev
from datetime import datetime

In [17]:
def _knn_neighbors_for_ratio(ratio):
    """Return the (LabelPropagation, LabelSpreading) n_neighbors pair for `ratio`."""
    # Settings for the three ratios used in the experiments; any other ratio
    # falls back to 200 neighbours for both models.
    per_ratio = {0.95: (10, 30), 0.90: (200, 200), 0.80: (50, 150)}
    return per_ratio.get(ratio, (200, 200))


def _record_fold_scores(results, y_true, y_pred):
    """Append one fold's accuracy and class-'1' precision/recall/F1 to `results`."""
    report = classification_report(y_true, y_pred, output_dict=True)
    results['accuracy'].append(report['accuracy'])
    for metric in ('precision', 'recall', 'f1-score'):
        results['1'][metric].append(report['1'][metric])


def _write_model_summary(out, title, results):
    """Write the across-fold mean scores (and accuracy stddev) of one model to `out`."""
    out.write(title + ':\n')
    out.write('accuracy: ' + str(round(mean(results['accuracy']), 2)) + '\n')
    out.write('stddev: ' + str(stdev(results['accuracy'])) + '\n')
    out.write('precision: ' + str(round(mean(results['1']['precision']), 2)) + '\n')
    out.write('recall: ' + str(round(mean(results['1']['recall']), 2)) + '\n')
    out.write('F1 :' + str(round(mean(results['1']['f1-score']), 2)) + '\n\n')


def main_labelX_CV(ratio, vectorizertype, ngram, nsplits,
                   n_features=1000,
                   results_path='Label_Prop_Spread_results_dataset3.txt'):
    """
    Main function with cross-validation included.

    Vectorizes the module-level `reviews`, keeps the `n_features` columns with
    the highest chi-square score against `labels_y`, then cross-validates
    LabelPropagation and LabelSpreading and appends a summary to `results_path`.

    Parameters:
        ratio (fraction of training labels to hide) = float (0.80 / 0.90 / 0.95)
        vectorizertype (word counts or TF-IDF values) = CountVectorizer / TfidfVectorizer
        ngram (largest n-gram size) = int (1=unigram, 2=unigrams and bigrams)
        nsplits (cross validation splits) = int (number of splits, at least 2)
        n_features (columns kept by the chi-square ranking) = int, default 1000
        results_path (text file the summary is appended to) = str

    Returns:
        None. The scores are written to `results_path` only.
    """
    # initialize results dictionary
    LP_results = {'accuracy': [], '1': {'precision': [], 'recall': [], 'f1-score': []}}
    LS_results = {'accuracy': [], '1': {'precision': [], 'recall': [], 'f1-score': []}}

    # vectorize the corpus (kept sparse)
    vectorizer = vectorizertype(min_df=2, max_df=0.5, ngram_range=(1, ngram))
    X_sparse = vectorizer.fit_transform(reviews)
    y = np.asarray(labels_y)

    # feature selection: rank columns by chi-square score, best first.
    # Scoring the sparse matrix avoids building a dense frame of the whole
    # vocabulary and no longer needs vectorizer.get_feature_names(), which
    # scikit-learn 1.2 removed.
    # NOTE(review): the scores use the labels of every sample, before the folds
    # are split and before labels are hidden, so held-out labels influence the
    # selected features -- confirm this is acceptable for the reported numbers.
    scores, _pvalues = chi2(X_sparse, y)
    top_columns = np.argsort(scores)[::-1][:n_features].copy()
    X = X_sparse[:, top_columns].toarray()

    lp_neighbors, ls_neighbors = _knn_neighbors_for_ratio(ratio)

    # KFold crossvalidation
    kf = KFold(n_splits=nsplits, shuffle=False)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # removing labels: each training label is hidden (-1) when its random
        # draw falls below `ratio`; the generator is re-seeded for every fold
        rng = np.random.RandomState(42)
        random_unlabeled_points = rng.rand(len(y_train)) < ratio
        labels = np.copy(y_train)
        labels[random_unlabeled_points] = -1

        # train models
        label_prop_model = LabelPropagation(kernel='knn', n_neighbors=lp_neighbors)
        label_spread_model = LabelSpreading(kernel='knn', n_neighbors=ls_neighbors)
        label_prop_model.fit(X_train, labels)
        label_spread_model.fit(X_train, labels)

        # evaluate models on the held-out fold
        _record_fold_scores(LP_results, y_test, label_prop_model.predict(X_test))
        _record_fold_scores(LS_results, y_test, label_spread_model.predict(X_test))

    # append (never overwrite) so successive runs accumulate in one file
    with open(results_path, 'a') as file:
        file.write('___ratio:' + str(ratio) + '|ngram:' + str(ngram) + '|vectorizer:' + str(vectorizertype) + '___\n')
        _write_model_summary(file, 'Label Propagation', LP_results)
        _write_model_summary(file, 'Label Spreading', LS_results)
        file.write(str(datetime.now()) + '\n\n\n')

In [10]:
# Run every (ratio, vectorizer, n-gram size) combination in turn; the printed
# counter marks which run has just finished.
experiment_grid = [
    (0.80, CountVectorizer, 1),
    (0.90, CountVectorizer, 1),
    (0.95, CountVectorizer, 1),
    (0.80, CountVectorizer, 2),
    (0.90, CountVectorizer, 2),
    (0.95, CountVectorizer, 2),
    (0.80, TfidfVectorizer, 2),
    (0.90, TfidfVectorizer, 2),
    (0.95, TfidfVectorizer, 2),
]
for run_number, (ratio, vectorizertype, ngram) in enumerate(experiment_grid, start=1):
    main_labelX_CV(ratio=ratio, vectorizertype=vectorizertype, ngram=ngram, nsplits=5)
    print(run_number)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))


3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))


6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer


9


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Same nine experiments as a nested loop: for each (vectorizer, n-gram size)
# pair, run the three label-removal ratios; the printed counter marks progress.
run_number = 0
for vectorizertype, ngram in ((CountVectorizer, 1), (CountVectorizer, 2), (TfidfVectorizer, 2)):
    for ratio in (0.80, 0.90, 0.95):
        main_labelX_CV(ratio=ratio, vectorizertype=vectorizertype, ngram=ngram, nsplits=5)
        run_number += 1
        print(run_number)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  self.label_distributions_ /= normalizer
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))


3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  self.label_distributions_ /= normalizer
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))


6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8


  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  self.label_distributions_ /= normalizer


9


  _warn_prf(average, modifier, msg_start, len(result))
