In [3]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
import multiprocessing


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


np.random.seed(0)

# this line tells jupyter notebook to put the plots in the notebook rather than saving them to file.
%matplotlib inline

# this line makes plots prettier on mac retina screens. If you don't have one it shouldn't do anything.
%config InlineBackend.figure_format = 'retina'

cores = multiprocessing.cpu_count()

# Hotel Reviews: What is real?

<h1>Problem Statement</h1>
When someone goes on to plan a stay, they tend to use online reviews to base their decision; However, there are usually a range of opinions so how can we tell if a certain review is fake/real or just plain bad luck? Thus the problem statement is to detect fradulent reviews for people are surfing hotel booking sites. 

<h1>Data Source</h1>
1600 Labelled Records, 800 True, 800 Fradulent reviews, plain text review (some generated by MTurk) <br>
512k Unlabelled Records, with numerical Ratings, Positive and Negative labels

<h1>Prior Research</h1>
Using Bi-grams (2-word combinations), detection rate is at ~80% 


<h1>Proposed Method(s)</h1>
-Preprocess the data as according to the best practices (mentioned in Empath, Stanford, 2016) <br>
-train_test_split(train, test)We only train our model from the trainsplit corpus. <br> 
-Decide on which Vocabulary to use (Unlabelled, or with labelled data)[Unlabelled] <br>
-Decide on which train data to input into the doc2vec class (unlablled, labelled. [Unlabelled] <br>
-Create additional Text Features: <strike>TF-IDF</strike>, <i>unigram and bigram stopword removal (minor improvement)</i>, Probabilistic Context-Free Grammar (but tends to be bad with paras with many sentences), <strike>LIWC (Linguistic tagging-word count features)</strike>Can we replicate the individual features?, GloVE, Consistency,<i> Sentiment-Detection (decent results)</i>, <i>LDA, topic-modelling (marginal improvement)</i>, <i>Empath topic modelling </i><br>
-TFIDF good with LDA, LSA <br>
-Determine the classifiers to build the model on top of (Logreg, SVM,   GaussianNB , Decision Trees creation), Word Level Features <br>
-Consider PCA/K-best? <br>
-Determine which semisupervised learning algorithm to implement. there are inbuilt python classes for semisupervised learning (LabelPropagation, LabelSpreading) or self-developed label propagation functions. 
<br>
-Currently, use of other features such as social network topology, or timestamps, or rating behavior will not be examined. 

<h1>Risks and Assumptions</h1>
Risks: There are quite many NLP libraries available (NLTK, gensim, etc.) which means we have to try substanial number of libraries to assess their results.

Each run of the algorithm takes up a significant amount of time, thus more time is needed so a CUDA-specific libray (TensorFlow) could be used.However this would mean even more time spent on on-boarding. 

There is a assumption that there actually exists some pattern or trend that exists in both truthful and fake reviews. 

<h1>Specific Aim</h1>
To aim to obtain between 71% ~ 74% accuracy overall, with a good f1 score: As it is not very useful to customers if there are a lot of false positive, the noise to signal ratio would overwhelm customers who would then fail to accept the system's recommendation of fake review.

In [4]:
df = pd.read_csv('./input/deceptive-opinion.csv')

In [5]:
df.head(4)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...


In [6]:
df['hotel'].unique()

array(['conrad', 'hyatt', 'omni', 'fairmont', 'sheraton', 'knickerbocker',
       'homewood', 'swissotel', 'ambassador', 'affinia', 'hardrock',
       'talbott', 'hilton', 'james', 'monaco', 'sofitel', 'palmer',
       'intercontinental', 'allegro', 'amalfi'], dtype=object)

In [7]:
df['source'].unique()

array(['TripAdvisor', 'MTurk', 'Web'], dtype=object)

In [8]:
# pd.DataFrame(map(len,df['text'])).describe()
pd.DataFrame([len(x) for x in df['text']]).describe()

Unnamed: 0,0
count,1600.0
mean,806.39125
std,467.260647
min,151.0
25%,487.0
50%,700.0
75%,987.5
max,4159.0


In [9]:
df['deceptive'].value_counts()

truthful     800
deceptive    800
Name: deceptive, dtype: int64

In [10]:
df.loc[1,'text']

'Triple A rate with upgrade to view room was less than $200 which also included breakfast vouchers. Had a great view of river, lake, Wrigley Bldg. & Tribune Bldg. Most major restaurants, Shopping, Sightseeing attractions within walking distance. Large room with a very comfortable bed. \n'

In [11]:
df2 = pd.read_csv('./input/Hotel_Reviews.csv')
data2 = df2[~df2['lat'].isnull()]
data2 = data2.reset_index(drop=True)

In [12]:
print(data2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512470 entries, 0 to 512469
Data columns (total 17 columns):
Hotel_Address                                 512470 non-null object
Additional_Number_of_Scoring                  512470 non-null int64
Review_Date                                   512470 non-null object
Average_Score                                 512470 non-null float64
Hotel_Name                                    512470 non-null object
Reviewer_Nationality                          512470 non-null object
Negative_Review                               512470 non-null object
Review_Total_Negative_Word_Counts             512470 non-null int64
Total_Number_of_Reviews                       512470 non-null int64
Positive_Review                               512470 non-null object
Review_Total_Positive_Word_Counts             512470 non-null int64
Total_Number_of_Reviews_Reviewer_Has_Given    512470 non-null int64
Reviewer_Score                                512470 non-null flo

In [13]:
data2.head(3).T

Unnamed: 0,0,1,2
Hotel_Address,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...
Additional_Number_of_Scoring,194,194,194
Review_Date,8/3/2017,8/3/2017,7/31/2017
Average_Score,7.7,7.7,7.7
Hotel_Name,Hotel Arena,Hotel Arena,Hotel Arena
Reviewer_Nationality,Russia,Ireland,Australia
Negative_Review,I am so angry that i made this post available...,No Negative,Rooms are nice but for elderly a bit difficul...
Review_Total_Negative_Word_Counts,397,0,42
Total_Number_of_Reviews,1403,1403,1403
Positive_Review,Only the park outside of the hotel was beauti...,No real complaints the hotel was great great ...,Location was good and staff were ok It is cut...


In [14]:
all_unlabelled = []
for i,line in enumerate(data2['Negative_Review']):
    inp = ""
    if (np.random.randint(0,2) == 0):
        inp = data2.loc[i,'Negative_Review'] + ' ' + data2.loc[i,'Positive_Review']
    else:
        inp = data2.loc[i,'Positive_Review'] + ' ' + data2.loc[i,'Negative_Review']
    all_unlabelled.append(inp)

data2['All_Text'] = pd.Series(all_unlabelled)


In [15]:
data2[['All_Text','Negative_Review','Positive_Review']].head(10)

Unnamed: 0,All_Text,Negative_Review,Positive_Review
0,I am so angry that i made this post available...,I am so angry that i made this post available...,Only the park outside of the hotel was beauti...
1,No real complaints the hotel was great great ...,No Negative,No real complaints the hotel was great great ...
2,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...
3,My room was dirty and I was afraid to walk ba...,My room was dirty and I was afraid to walk ba...,Great location in nice surroundings the bar a...
4,Amazing location and building Romantic settin...,You When I booked with your company on line y...,Amazing location and building Romantic setting
5,Good restaurant with modern design great chil...,Backyard of the hotel is total mess shouldn t...,Good restaurant with modern design great chil...
6,The room is spacious and bright The hotel is ...,Cleaner did not change our sheet and duvet ev...,The room is spacious and bright The hotel is ...
7,Good location Set in a lovely park friendly s...,Apart from the price for the brekfast Everyth...,Good location Set in a lovely park friendly s...
8,No Positive Even though the pictures show ver...,Even though the pictures show very clean room...,No Positive
9,The room was big enough and the bed is good T...,The aircondition makes so much noise and its ...,The room was big enough and the bed is good T...


In [16]:
# create word2vec, para2vec for both labelled, nonlabelled data

In [17]:
import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
from wikipedia import search,page
import multiprocessing
import collections
import re
import warnings

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings(action='ignore')

assert gensim.models.doc2vec.FAST_VERSION > -1

In [18]:
global_stemmer = PorterStemmer()

class StemmingHelper(object):
    word_lookup = {}
    
    @classmethod
    def stem(cls, word):
        stemmed = global_stemmer.stem(word)
        
        #Update word lookup
        if stemmed not in cls.word_lookup:
            cls.word_lookup[stemmed] = {}
        cls.word_lookup[stemmed][word] = (
            cls.word_lookup[stemmed].get(word,0) + 1
        )
        return stemmed
    
    @classmethod
    def original_form(cls, word):
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                      key=lambda x: cls.word_lookup[word][x])
        else: 
            return word

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

In [19]:
# sentences = list(filter(None,re.split('[\n.]',wikipage.content)))
# # sentences = list(filter(None, sentences))
# sentences2 = [list(filter(None,re.split('[ \t,]',s))) for s in sentences]
stop_words = pd.read_csv('./input/stopwords.csv',names=['stop'])
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
all_stop_set = set(stop_words.stop.append(new_stop,ignore_index=True))
# sentences3 = [[c for c in s if c not in stopwords]  for s in sentences2]
# sentences4 = [[c for c in s]  for s in sentences2]
# for s in sentences3:
#     for c in s:
#         StemmingHelper.stem(c)

In [20]:
min_count = 2
size = 50
window = 4

In [21]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)
#Truth = 1, Fake = 0

In [23]:
stop_words = pd.read_csv('./input/stopwords.csv',names=['stop'])
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
all_stop_set = set(stop_words.stop.append(new_stop,ignore_index=True))

In [24]:
#generate combinations of all_stop_words, bigrams
#generate bigram
bigram_stops=[]
for a in all_stop_set:
    for b in all_stop_set:
        bigram_stop = a+"_"+b
        bigram_stops.append(bigram_stop)
all_stop_set = all_stop_set.union(bigram_stops)

In [26]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

In [27]:
def read_corpus(sentdf, tokens_only=False):
    for i, line in enumerate(sentdf):
        if tokens_only:
            yield list(gensim.utils.tokenize(line))
        else:
            yield gensim.models.doc2vec.TaggedDocument(list(gensim.utils.tokenize(line)),[i])

In [None]:
true_corpus = list(read_corpus(truedf, tokens_only=True))
fake_corpus = list(read_corpus(fakedf, tokens_only=True))
unlabelled_corpus = list(read_corpus(data2['All_Text'], tokens_only=True))

# def to_list(doc):
#     return [t.text for t in doc]
# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]
# unlabelled_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(data2['All_Text'])]

In [None]:
# Save the embeddings for 

In [None]:
phrase_list = []

phrases = Phrases(unlabelled_corpus)
bigram = Phraser(phrases)
unlabelled_corpus = [bigram[s] for s in unlabelled_corpus]
true_corpus = [bigram[s] for s in true_corpus]
fake_corpus = [bigram[s] for s in fake_corpus]

phrases = Phrases(sentence_stream)
trigram = Phraser(phrases)
unlabelled_corpus = [trigram[s] for s in unlabelled_corpus]
true_corpus = [trigram[s] for s in true_corpus]
fake_corpus = [trigram[s] for s in fake_corpus]

def filter_stream(df, stopword_set):
    sentence_stream = []
    for s in df:
        s2 = [c  for c in s if c not in stopword_set] 
        sentence_stream.append(s2)
    return sentence_stream

unlabelled_corpus = filter_stream(unlabelled_corpus, all_stop_set)
true_corpus = filter_stream(true_corpus, all_stop_set)
fake_corpus = filter_stream(fake_corpus, all_stop_set)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def gen_vader_vec(df):
    sia = SentimentIntensityAnalyzer()
    compound = []
    neg = []
    neu = []
    pos = []
    
    for ind,s in enumerate(df):
        if ind % 100 == 0 : 
            print(ind)
        sent = sia.polarity_scores(' '.join(s))
        compound.append(sent['compound'])
        neg.append(sent['neg'])
        neu.append(sent['neu'])
        pos.append(sent['pos'])
    vader_sent = pd.DataFrame({'compound':compound, 'neg':neg, 'neu':neu, 'pos':pos})
    return vader_sent

unlabelled_vader_vec = gen_vader_feat(unlabelled_corpus)
true_vader_vec = gen_vader_feat(true_corpus)
fake_vader_vec = gen_vader_feat(fake_corpus)

In [None]:
from empath import Empath
lexicon = Empath()

def gen_empath_vec(df,lex):
    lexicon_results = pd.DataFrame(columns=lex.cats)
    for ind, s in enumerate(df):
        lexicon_results = lexicon_results.append(pd.Series([np.nan]), ignore_index=True)
        results = (lex.analyze(s))
        if (ind % 100 == 0):
            print(ind)
        for k in results.keys():
            lexicon_results[k].iloc[ind] = results[k]

    lexicon_results.drop(columns=[0],inplace=True)
    return lexicon_results

unlabelled_emp_vec = gen_empath_vec(unlabelled_corpus, lexicon)
true_emp_vec = gen_empath_vec(true_corpus, lexicon)
fake_emp_vec = gen_empath_vec(fake_corpus, lexicon)

In [24]:
temp_arr = []
ctr = 0
for ind, s in enumerate(unlabelled_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
unlabelled_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(true_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
true_corpus = temp_arr.copy()
temp_arr = []
for ind, s in enumerate(fake_corpus):
    temp_arr.append(gensim.models.doc2vec.TaggedDocument(s,[ctr]))
    ctr += 1
fake_corpus = temp_arr.copy()
del(temp_arr)

In [25]:
true_corpus

[TaggedDocument(words=['We', 'stayed', 'for', 'a', 'one', 'night', 'getaway', 'with', 'family', 'on', 'a', 'thursday', 'Triple', 'AAA', 'rate', 'of', 'was', 'a', 'steal', 'th_floor', 'room', 'complete', 'with', 'in', 'plasma', 'TV', 'bose', 'stereo', 'voss', 'and', 'evian', 'water', 'and', 'gorgeous', 'bathroom', 'no', 'tub', 'but', 'was', 'fine', 'for', 'us', 'Concierge', 'was', 'very', 'helpful', 'You_cannot', 'beat', 'this', 'location', 'Only', 'flaw', 'was', 'breakfast', 'was', 'pricey', 'and', 'service', 'was', 'very', 'very', 'slow', 'hours', 'for', 'four', 'kids', 'and', 'four_adults', 'on', 'a', 'friday', 'morning', 'even_though', 'there', 'were', 'only', 'two', 'other', 'tables', 'in', 'the', 'restaurant', 'Food', 'was', 'very', 'good', 'so', 'it', 'was', 'worth', 'the', 'wait', 'I', 'would', 'return', 'in', 'a', 'heartbeat', 'A', 'gem', 'in', 'chicago'], tags=[512470]),
 TaggedDocument(words=['Triple', 'A', 'rate', 'with', 'upgrade', 'to', 'view', 'room', 'was', 'less_than', 

In [26]:
unlabelled_index = list(range(len(unlabelled_corpus)))
np.random.shuffle(unlabelled_index)
kratio = 3
rand_unlabelled_corpus = [unlabelled_corpus[a] for a in unlabelled_index[:(len(true_corpus) + len(fake_corpus))*kratio]]

In [27]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=30, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(unlabelled_corpus)

In [28]:
turk_model.train(unlabelled_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

59619392

In [29]:
# ranks = []
# second_ranks = []
# for doc_id in range(len(fake_corpus)):
#     inferred_vector = turk_model.infer_vector(true_corpus[doc_id].words)
#     sims = turk_model.docvecs.most_similar([inferred_vector], topn=len(turk_model.docvecs))
#     rank = [docid for docid, sim in sims].index(doc_id)
#     ranks.append(rank)
    
#     second_ranks.append(sims[1])
    
# collections.Counter(ranks)

In [30]:
# print('Document ({}): «{}»\n'.format(doc_id, ' '.join(fake_corpus[doc_id].words)))
# print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % turk_model)
# for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
#     print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(true_corpus[sims[index][0]].words)))

In [31]:
true_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_corpus])
fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in fake_corpus])
rand_unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in rand_unlabelled_corpus])
unlabelled_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in unlabelled_corpus])

In [32]:
rand_unlabelled_vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.182161,0.127414,-0.258718,0.136064,0.040059,-0.270429,-0.233951,0.064347,-0.110299,0.238742,...,-0.127566,-0.058818,0.007708,-0.309235,-0.045261,0.159964,0.08551,-0.245836,-0.092757,0.148976
1,0.464405,0.020287,-0.153139,0.287114,-0.067047,-0.070573,0.050843,-0.185877,-0.114911,-0.262483,...,0.266384,-0.115207,0.205449,-0.230744,-0.296168,-0.007712,-0.186347,-0.191616,-0.140301,-0.002148
2,-0.10188,-0.078978,-0.239864,0.118205,0.027745,-0.16048,-0.195724,0.190249,0.159676,0.220301,...,0.120407,-0.216437,-0.159057,-0.189892,0.129861,0.059319,0.135825,-0.084024,-0.062895,0.060556
3,0.043078,0.165078,-0.19978,0.006671,-0.123814,-0.604992,0.118186,0.55839,0.14637,-0.64815,...,-0.097205,-0.060684,0.239619,-0.672383,-0.479075,-0.024389,0.200901,0.002955,-0.04103,0.124559
4,0.060076,0.200378,0.1338,-0.147105,-0.073055,-0.208019,-0.08232,0.026966,0.37969,-0.029548,...,-0.080645,-0.159408,-0.31851,-0.320421,0.279262,-0.114116,0.226733,0.450745,-0.112956,-0.078592


In [33]:
all_X = pd.concat([true_vec, fake_vec], axis=0)
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, train_size=0.75, random_state=8)

# Perform baseline Supervised Learning 

In [34]:
def summary_report(y, y_pred, model):
    conmat = confusion_matrix(y, y_pred, labels=model.classes_)
    # converts np.matrix format matrix to a dataframe and adds index and column names
    conmat= pd.DataFrame(conmat, columns=model.classes_, index=model.classes_)
    print(conmat)
    print(classification_report(y, y_pred))

In [35]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
# logreg_cv.fit(X_train, y_train)
# logreg_cv_pred = logreg_cv.predict(X_test)
# summary_report(y_test, logreg_cv_pred, logreg_cv)
gnb = GaussianNB()
dtree = DecisionTreeClassifier()
svm = svm.SVC(kernel='rbf')
xg = XGBClassifier()

In [36]:
print('Gaussian NB:')
scorelist = cross_val_score(gnb, X_train, y_train, cv=5, scoring='f1',n_jobs=-1)
print(scorelist, np.mean(scorelist))
print('DecisionTree')
scorelist = cross_val_score(dtree, X_train, y_train, cv=5, scoring='f1', n_jobs=-1) 
print(scorelist, np.mean(scorelist))
print('Logistics Regression:')
scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
print(scorelist, np.mean(scorelist))
print('SVM:')
scorelist =cross_val_score(svm, X_train, y_train, cv=5, scoring='f1', n_jobs=-1) 
print(scorelist, np.mean(scorelist))
print('XGB Default:')
scorelist = cross_val_score(xg, X_train, y_train, cv=5, scoring='f1',n_jobs=-1)
print(scorelist, np.mean(scorelist))

Gaussian NB:
[0.71544715 0.66938776 0.65843621 0.671875   0.66938776] 0.6769067757334792
DecisionTree
[0.53043478 0.57959184 0.56809339 0.52941176 0.56431535] 0.5543694243920749
Logistics Regression:
[0.7295082  0.64978903 0.69491525 0.7007874  0.66122449] 0.6872448743730372
SVM:
[0.73076923 0.66935484 0.66923077 0.68181818 0.69201521] 0.6886376459306669
XGB Default:
[0.6695279  0.63934426 0.65060241 0.69019608 0.66386555] 0.6627072387158408


In [37]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV

# knn_params = {
#     'n_neighbors':[1,3,7,15,21,30,40],
#     'weights':['uniform','distance'],
#     'metric':['euclidean','manhattan']
# }
# knn_gridsearch = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, verbose=1,n_jobs=-1)
# # knn_gridsearch.fit(X_train, y_train)
# # y_knn_pred = knn_gridsearch.predict(X_test)
# # summary_report(y_test, y_knn_pred, knn_gridsearch)
# # print(knn_gridsearch.best_params_)
# print(cross_val_score(knn_gridsearch, all_X, all_y, cv=5, scoring='f1',n_jobs=-1))

In [38]:
# from imblearn.over_sampling import SMOTE
# from imblearn.combine import SMOTEENN
# smoteenn = SMOTEENN(random_state=8, smote=SMOTE(random_state=8, k_neighbors=40, n_jobs=-1), n_jobs=-1)
# X_train_smn, y_train_smn = smoteenn.fit_sample(X_train, y_train)
# knn_gridsearch = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, verbose=1,n_jobs=-1)
# knn_gridsearch.fit(X_train_smn, y_train_smn)
# y_smn_pred = knn_gridsearch.predict(X_test)
# summary_report(y_test, y_smn_pred, knn_gridsearch)
# print(knn_gridsearch.best_params_)

In [39]:
# xg = XGBClassifier()
# xg.fit(X_train, y_train)
# y_xgsmn_pred = xg.predict(X_test)
# summary_report(y_test, y_xgsmn_pred, xg)
# print(cross_val_score(xg, all_X, all_y, cv=5, scoring='f1',n_jobs=-1))

# xg_params = {
#     'learning_rate':[0.05],
#     'colsample_bytree': np.linspace(0.15,0.85,10),
#     'max_depth': range(4,20,1),
#     'min_child_weight': range(1,10),
#     'gamma': np.linspace(0.01,20.0,5)
# }
# xg_grid = GridSearchCV(xg, xg_params, n_jobs=-1, cv=5, scoring='f1')
# xg_grid.fit(all_X, all_y)
# print (xg_grid.best_params_)


In [40]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier

# Start of Semi-Supervised Learning

In [41]:
# we pseudo label a bunch of unlabelled data

def pseudo_label(trained_model, labelled_X, labelled_y, unlabelled, sample_rate):
    unlabelled_index = list(range(len(unlabelled)))
    np.random.shuffle(unlabelled_index)
    end_index = int(sample_rate * float(len(unlabelled)))
    rand_unlabelled = pd.DataFrame([unlabelled.iloc[a,:] for a in unlabelled_index[:end_index]]).copy()
#     rand_unlabelled.reset_index(inplace=True,drop=True)
#     print(rand_unlabelled.head(10))
    all_data = pd.concat([labelled_X, rand_unlabelled], axis=0, ignore_index=True)
    p_labels = pd.DataFrame(trained_model.predict(rand_unlabelled))
    all_labels = pd.concat([labelled_y, p_labels], axis=0, ignore_index=True)
    
    trained_model.fit(all_data, all_labels)
    return trained_model, all_data , all_labels
#SMOTE BOOST
logreg_cv.fit(X_train, y_train)
newlr, newX_train, newy_train = pseudo_label(logreg_cv, X_train, y_train, rand_unlabelled_vec, 1.0)
y_lr_pred = newlr.predict(X_test)
summary_report(y_test, y_lr_pred, newlr)
newy_train.info()

     0    1
0  137   67
1   65  131
             precision    recall  f1-score   support

          0       0.68      0.67      0.67       204
          1       0.66      0.67      0.66       196

avg / total       0.67      0.67      0.67       400

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 1 columns):
0    6000 non-null int64
dtypes: int64(1)
memory usage: 47.0 KB


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from scipy import sparse as sp

newX_train = pd.concat([X_train, rand_unlabelled_vec],axis=0)
newy_train = pd.concat([y_train, pd.Series([-1] *rand_unlabelled_vec.shape[0])],axis=0)
# newX_train_spread = newX_train.copy()
# newy_train_spread = newy_train.copy()

pseudo_params = {
    'kernel': ['rbf'],
    'gamma' : range(10,100,10)
}
label_prop_gridsearch = GridSearchCV(LabelPropagation(), pseudo_params, n_jobs=-1)
# label_spread_gridsearch = GridSearchCV(LabelSpreading(), pseudo_params)
label_prop_gridsearch.fit(newX_train, newy_train)
# label_spread_gridsearch.fit(newX_train_spread, newy_train_spread)

y_prop_grid_pred = label_prop_gridsearch.best_estimator_.predict(X_test)
# y_spread_grid_pred = label_spread_gridsearch.best_estimator_.predict(X_test)
summary_report(y_test, y_prop_grid_pred, label_prop_gridsearch.best_estimator_)
# summary_report(y_test, y_spread_grid_pred, label_spread_gridsearch.best_estimator_)

     0   1
0  118  86
1  118  78
             precision    recall  f1-score   support

          0       0.50      0.58      0.54       204
          1       0.48      0.40      0.43       196

avg / total       0.49      0.49      0.49       400



In [43]:
# pseudo_params2 = {
#     'kernel': ['knn'],
#     'alpha' : np.linspace(0.01,0.99,20)
# }
# label_prop_gridsearch = GridSearchCV(LabelPropagation(), pseudo_params2)
# label_spread_gridsearch = GridSearchCV(LabelSpreading(), pseudo_params2)
# label_prop_gridsearch.fit(newX_train, newy_train)
# label_spread_gridsearch.fit(newX_train_spread, newy_train_spread)

# y_prop_grid_pred = label_prop_gridsearch.best_estimator_.predict(X_test)
# y_spread_grid_pred = label_spread_gridsearch.best_estimator_.predict(X_test)
# summary_report(y_test, y_prop_grid_pred, label_prop_gridsearch.best_estimator_)
# summary_report(y_test, y_spread_grid_pred, label_spread_gridsearch.best_estimator_)

In [44]:
# # label_prop_model = LabelPropagation(kernel='rbf',gamma=10,n_jobs=-1)
# # label_spread_model = LabelSpreading(kernel='rbf',gamma=10,n_jobs=-1)
# # label_prop_model.fit(newX_train, newy_train)
# # label_spread_model.fit(newX_train_spread, newy_train_spread)
# y_prop_pred = label_prop_model.predict(X_test)
# y_spread_pred = label_spread_model.predict(X_test)

# summary_report(y_test, y_prop_pred, label_prop_model)
# summary_report(y_test, y_spread_pred, label_spread_model)