In [2]:
import numpy as np
import scipy.stats as stats
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import gensim as gs 
from gensim import corpora, models, similarities
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,make_scorer, recall_score,precision_score,fbeta_score

import gensim,logging
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec, Phrases
from gensim.models.phrases import Phraser
from wikipedia import search,page
import multiprocessing
import collections
import re
import warnings
import spacy
import nltk
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings(action='ignore')

assert gensim.models.doc2vec.FAST_VERSION > -1

np.random.seed(0)
cores = multiprocessing.cpu_count()



In [3]:
df = pd.read_csv('../input/deceptive-opinion.csv')

stop_words = pd.read_csv('../input/stopwords.csv',names=['stop'])
new_stop = stop_words.stop.map(lambda x: str.capitalize(x))
all_stop_set = set(stop_words.stop.append(new_stop,ignore_index=True))


In [4]:
#generate combinations of all_stop_words, bigrams
#generate bigram
bigram_stops=[]
for a in all_stop_set:
    for b in all_stop_set:
        bigram_stop = a+"_"+b
        bigram_stops.append(bigram_stop)

In [5]:
all_stop_set = all_stop_set.union(bigram_stops)


In [6]:
def read_corpus(sentdf, tokens_only=False):
    for i, line in enumerate(sentdf):
        if tokens_only:
            yield list(gensim.utils.tokenize(line))
        else:
            yield gensim.models.doc2vec.TaggedDocument(list(gensim.utils.tokenize(line)),[i])

In [7]:
truedf = df[df.deceptive=='truthful'].loc[:,'text']
fakedf = df[df.deceptive=='deceptive'].loc[:,'text']
truedfy = df[df.deceptive=='truthful'].loc[:,'deceptive']
fakedfy = df[df.deceptive=='deceptive'].loc[:,'deceptive']
truedfy.replace({'truthful':1},inplace=True)
fakedfy.replace({'deceptive':0},inplace=True)

In [8]:
truedf.iloc[189]

'I stayed at The Talbott for 3 nights on business and was very pleased. The staff was friendly as can be immediately confirming the lore of the midwest. I was upgraded to a suite which was bigger than my apartment and certainly more luxurious. The free wi-fi came in handy as I needed to work remotely while there. Everything from the comfort of the bed to the staff and location made this a great stay. Oh, and I got to workout at the huge Equinox right next door for free. \n'

In [9]:
from spacy.tokenizer import Tokenizer
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

# prefix_re = re.compile(r'''^[\[\("']''')
# suffix_re = re.compile(r'''[\]\)"']$''')
# infix_re = re.compile(r'''[-~]''')
# simple_url_re = re.compile(r'''^https?://''')

# nlp.tokenizer = Tokenizer(nlp.vocab, 
#                           prefix_search = prefix_re.search,
#                           suffix_search = suffix_re.search,
#                           infix_finditer = infix_re.finditer,
#                           token_match = simple_url_re.match)

In [10]:
true_corpus = list(read_corpus(truedf,tokens_only=True))
fake_corpus = list(read_corpus(fakedf,tokens_only=True))
# def to_list(doc):
#     return [t.text for t in doc]
# true_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(truedf)]
# fake_corpus = [to_list(nlp(s))[:-1] for ind,s in enumerate(fakedf)]

In [11]:
phrase_list = []

sentence_stream = [s for s in true_corpus + fake_corpus]
phrases = Phrases(sentence_stream)
bigram = Phraser(phrases)
sentence_stream = [bigram[s] for s in sentence_stream]
true_corpus = [bigram[s] for s in true_corpus]
fake_corpus = [bigram[s] for s in fake_corpus]

phrases = Phrases(sentence_stream)
trigram = Phraser(phrases)
sentence_stream = [trigram[s] for s in sentence_stream]
true_corpus = [trigram[s] for s in true_corpus]
fake_corpus = [trigram[s] for s in fake_corpus]

def filter_stream(df, stopword_set):
    sentence_stream = []
    for s in df:
        s2 = [c  for c in s if c not in stopword_set] 
        sentence_stream.append(s2)
    return sentence_stream

# sentence_stream2 = []
# for s in sentence_stream:
#     s2 = [c  for c in s if c not in all_stop_set] 
#     sentence_stream2.append(s2)

sentence_stream2 = filter_stream(sentence_stream, all_stop_set)
true_corpus = filter_stream(true_corpus, all_stop_set)
fake_corpus = filter_stream(fake_corpus, all_stop_set)

2018-01-25 16:59:03,203 : INFO : collecting all words and their counts
2018-01-25 16:59:03,205 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-25 16:59:03,659 : INFO : collected 96727 word types from a corpus of 239406 words (unigram + bigrams) and 1600 sentences
2018-01-25 16:59:03,661 : INFO : using 96727 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-01-25 16:59:03,663 : INFO : source_vocab length 96727
2018-01-25 16:59:04,798 : INFO : Phraser built with 828 828 phrasegrams
2018-01-25 16:59:06,011 : INFO : collecting all words and their counts
2018-01-25 16:59:06,013 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-01-25 16:59:06,430 : INFO : collected 104954 word types from a corpus of 220371 words (unigram + bigrams) and 1600 sentences
2018-01-25 16:59:06,432 : INFO : using 104954 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2018-01-2

In [14]:
true_corpus[:4]

[['We_stayed',
  'one_night',
  'getaway',
  'family',
  'thursday',
  'Triple',
  'AAA',
  'rate',
  'steal',
  'th_floor',
  'room',
  'complete_with',
  'plasma',
  'TV',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'us',
  'Concierge',
  'very_helpful',
  'beat',
  'location',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'very_slow',
  'hours',
  'four',
  'kids',
  'four',
  'adults',
  'friday',
  'morning',
  'even_though',
  'two',
  'tables',
  'restaurant',
  'Food',
  'very_good',
  'worth',
  'wait',
  'return',
  'heartbeat',
  'gem',
  'chicago'],
 ['Triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less_than',
  'also',
  'included',
  'breakfast',
  'vouchers',
  'a_great_view',
  'river',
  'lake',
  'Wrigley',
  'Bldg',
  'Tribune',
  'Bldg',
  'major',
  'restaurants',
  'Shopping',
  'Sightseeing',
  'attractions',
  'within_walking_distance',
  'Large',
  'room',
  'very_comfortable',
  'bed'],
 

In [12]:
pickle.dump(true_corpus, open('../input/true_corpus_clean.p','wb'))
pickle.dump(fake_corpus, open('../input/fake_corpus_clean.p','wb'))

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound = []
neg = []
neu = []
pos = []
# raw_corpus = true_corpus + fake_corpus
for s in sentence_stream2:
    sent = sia.polarity_scores(' '.join(s))
    compound.append(sent['compound'])
    neg.append(sent['neg'])
    neu.append(sent['neu'])
    pos.append(sent['pos'])

vader_sent = pd.DataFrame({'compound':compound, 'neg':neg, 'neu':neu, 'pos':pos})


In [16]:
from empath import Empath
lexicon = Empath()

lexicon_results = pd.DataFrame(columns=lexicon.cats)
for ind, s in enumerate(sentence_stream2):
    lexicon_results = lexicon_results.append(pd.Series([np.nan]), ignore_index=True)
    results = (lexicon.analyze(s))
    if (ind % 100 == 0):
        print(ind)
    for k in results.keys():
        lexicon_results[k].iloc[ind] = results[k]

lexicon_results.drop(columns=[0],inplace=True)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [17]:
lexicon_results.head(7)

Unnamed: 0,help,office,dance,money,wedding,domestic_work,sleep,medical_emergency,cold,hate,...,weapon,children,monster,ocean,giving,contentment,writing,rural,positive_emotion,musical
0,0.0,1.0,0.0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,1.0,1.0,2.0,3.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,1.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0
5,1.0,4.0,0.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
6,2.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim import corpora, models, similarities

# unique_dict = corpora.Dictionary(sentence_stream2)
# dict_corpus = [dictionary.doc2bow(s) for s in sentence_stream2]
# tfidf = models.TfidfModel(dict_corpus)
# dict_corpus_tfidf = tfidf[dict_corpus]
# index = similarities.MatrixSimilarity(dict_corpus_tfidf)
# sims = index[dict_corpus_tfidf]
# for ind,a in enumerate(dict_corpus_tfidf):
#     print('1: ' + str(len(dict_corpus_tfidf[ind])))
#     print('2: ' + str(len(sentence_stream[ind])))

In [19]:
true_fake_corpus = []
for ind,s in enumerate(sentence_stream2):
    true_fake_corpus.append(gensim.models.doc2vec.TaggedDocument(s,[ind]))


In [20]:
dictionary = corpora.Dictionary(sentence_stream2)
corpus_bow = [dictionary.doc2bow(s) for s in sentence_stream2]

2018-01-25 17:08:28,239 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-01-25 17:08:28,400 : INFO : built Dictionary(11790 unique tokens: ['We_stayed', 'one_night', 'getaway', 'family', 'thursday']...) from 1600 documents (total 109981 corpus positions)


In [21]:
len(corpus_bow)

1600

In [20]:
num_topics = 100
chunksize = 400
passes = 5

model = models.LdaModel(corpus_bow[:100], id2word=dictionary, num_topics=num_topics,alpha = 'auto',eta='auto',random_state=0, chunksize=chunksize, passes=passes)

# model.update(corpus_bow[500:len(corpus_bow)])

2018-01-25 16:24:20,529 : INFO : using autotuned alpha, starting with [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
2018-01-25 16:24:20,534 : INFO : using serial LDA version on this node
2018-01-25 16:24:24,971 : INFO : running online (multi-pass) LDA training, 100 topics, 5 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence th

2018-01-25 16:24:33,832 : INFO : optimized alpha [0.008854324802392559, 0.009118050713554862, 0.00894319120190614, 0.00954522474640571, 0.009574472879825168, 0.00912434890063259, 0.00903658704806465, 0.009387100105339236, 0.00930748151727069, 0.009577586200050671, 0.0087715064862365, 0.008858281505989517, 0.00948760806283971, 0.010233253000079885, 0.009210316936522505, 0.009777175676536807, 0.00930140413149752, 0.009125706491368419, 0.01053498883866705, 0.009581953280806155, 0.009036333309267452, 0.010058206283983314, 0.009114797042171852, 0.008771506527073846, 0.0087715064862365, 0.011469263924666465, 0.009299193950250749, 0.009727956566821683, 0.009270537786852292, 0.008858083128203183, 0.00912376554342443, 0.009124179357920807, 0.0087715064862365, 0.010102282370207613, 0.008858393689077534, 0.00920771338163465, 0.009487863211762091, 0.009029145867357953, 0.008859111724894756, 0.009398135734070908, 0.00885731418322809, 0.0087715064862365, 0.009399765177641212, 0.008945070373147988, 0

In [21]:
topic_dists = np.zeros([len(sentence_stream2),num_topics])
    
for i,item in enumerate(corpus_bow):       
    dists = model.get_document_topics(item)        
    indices = list(dict(dists).keys())        
    vals = list(dict(dists).values())        
    topic_dists[i,indices] = vals

topic_dists = pd.DataFrame(topic_dists, columns = ['topic'+str(a) for a in range(num_topics)])

In [22]:
topic_dists.head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic90,topic91,topic92,topic93,topic94,topic95,topic96,topic97,topic98,topic99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.967205,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.971327,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
turk_model = gensim.models.doc2vec.Doc2Vec(dm=0, size=100,min_count=1, window=5,workers=cores, seed=8, negative=5)
turk_model.build_vocab(true_fake_corpus)

2018-01-25 17:08:28,613 : INFO : collecting all words and their counts
2018-01-25 17:08:28,616 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-01-25 17:08:28,643 : INFO : collected 11790 word types and 1600 unique tags from a corpus of 1600 examples and 109981 words
2018-01-25 17:08:28,645 : INFO : Loading a fresh vocabulary
2018-01-25 17:08:28,670 : INFO : min_count=1 retains 11790 unique words (100% of original 11790, drops 0)
2018-01-25 17:08:28,672 : INFO : min_count=1 leaves 109981 word corpus (100% of original 109981, drops 0)
2018-01-25 17:08:28,754 : INFO : deleting the raw counts dictionary of 11790 items
2018-01-25 17:08:28,757 : INFO : sample=0.001 downsamples 27 most-common words
2018-01-25 17:08:28,759 : INFO : downsampling leaves estimated 104543 word corpus (95.1% of prior 109981)
2018-01-25 17:08:28,761 : INFO : estimated required memory for 11790 words and 100 dimensions: 15967000 bytes
2018-01-25 17:08:28,802 : INFO : resetting lay

In [23]:
turk_model.train(true_fake_corpus, total_examples=turk_model.corpus_count, epochs=turk_model.iter)

2018-01-25 17:08:28,962 : INFO : training model with 8 workers on 11790 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2018-01-25 17:08:29,608 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-01-25 17:08:29,628 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-01-25 17:08:29,643 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-01-25 17:08:29,665 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-01-25 17:08:29,672 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-01-25 17:08:29,676 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-01-25 17:08:29,680 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-01-25 17:08:29,683 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-01-25 17:08:29,685 : INFO : training on 549905 raw words (530533 effective words) took 0.7s, 743384 effective words/s

530533

In [25]:
from sklearn.preprocessing import StandardScaler
true_fake_vec = pd.DataFrame([turk_model.infer_vector(s.words) for s in true_fake_corpus])
# vader_sent = vader_sent.apply(np.square)
true_fake_vec2 = pd.concat([true_fake_vec, vader_sent, lexicon_results], axis=1)

# ss = StandardScaler()
# true_fake_vec_all = ss.fit_transform(true_fake_vec)

In [27]:
all_y = pd.concat([truedfy, fakedfy], axis= 0)
X_train, X_test, y_train, y_test = train_test_split(true_fake_vec, all_y, train_size=0.75, random_state=8)
Xv_train, Xv_test, yv_train, yv_test = train_test_split(vader_sent, all_y,train_size=0.75, random_state=8)
Xe_train, Xe_test, ye_train, ye_test = train_test_split(lexicon_results, all_y, train_size=0.75, random_state=8)
# Xlda_train, Xlda_test, ylda_train, ylda_test = train_test_split(topic_dists, all_y, train_size=0.75, random_state=8)
Xa_train, Xa_test, ya_train, ya_test = train_test_split(true_fake_vec2, all_y,train_size=0.75, random_state=8)

In [28]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

def run_test(X, y, cv_val, scoring):
    gnb = GaussianNB()
    dtree = DecisionTreeClassifier()
    svm2 = svm.SVC(random_state=8)
    xg = XGBClassifier()
    # logreg_cv = linear_model.LogisticRegressionCV(Cs=100, cv=5, penalty='l1',scoring='accuracy',solver='liblinear',n_jobs=-1)
    print('Gaussian NB:')
    scorelist = cross_val_score(gnb, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('DecisionTree')
    scorelist = cross_val_score(dtree, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('SVM:')
    scorelist = cross_val_score(svm2, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    print('XGB Default:')
    scorelist = cross_val_score(xg, X, y, cv=cv_val, scoring=scoring,n_jobs=-1)
    print(scorelist, np.mean(scorelist))
    # print('Logistics Regression:')
    # scorelist = cross_val_score(logreg_cv, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    # print(scorelist, np.mean(scorelist))

In [28]:
run_test(Xe_train, ye_train, 5, 'f1')

Gaussian NB:
[0.54634146 0.53968254 0.69343066 0.43850267 0.5049505 ] 0.5445815657755554
DecisionTree
[0.63709677 0.63453815 0.55645161 0.57758621 0.58634538] 0.5984036256259744
SVM:
[0.74789916 0.65546218 0.73684211 0.70995671 0.65822785] 0.7016776015717897
XGB Default:
[0.76422764 0.66666667 0.68273092 0.65437788 0.66949153] 0.6874989276491859


In [29]:
run_test(Xv_train, yv_train, 5, 'f1')

Gaussian NB:
[0.59385666 0.57342657 0.63272727 0.58992806 0.60839161] 0.5996660334779028
DecisionTree
[0.52       0.42060086 0.47161572 0.50622407 0.56153846] 0.49599582136432385
SVM:
[0.6402439  0.62420382 0.64126984 0.64353312 0.64797508] 0.6394451532549856
XGB Default:
[0.55833333 0.5021645  0.5511811  0.52542373 0.56903766] 0.541228064715473


In [30]:
run_test(X_train, y_train, 5, 'f1')

Gaussian NB:
[0.609319   0.55970149 0.67375887 0.57553957 0.63481229] 0.6106262418472109
DecisionTree
[0.59414226 0.47058824 0.53877551 0.47533632 0.47863248] 0.5114949612829718
SVM:
[0.66850829 0.67036011 0.67036011 0.67036011 0.66852368] 0.6696224593166027
XGB Default:
[0.49392713 0.55737705 0.47457627 0.53941909 0.48535565] 0.5101310363090672


In [31]:
run_test(Xlda_train, ylda_train, 5, 'f1')

Gaussian NB:
[0.58015267 0.56573705 0.61643836 0.40236686 0.59778598] 0.5524961842956082
DecisionTree
[0.50209205 0.5982906  0.5483871  0.52589641 0.62761506] 0.5604562444756265
SVM:
[0.66850829 0.67036011 0.67036011 0.67036011 0.66852368] 0.6696224593166027
XGB Default:
[0.64615385 0.58436214 0.60162602 0.61904762 0.64957265] 0.6201524541903946


In [29]:
run_test(Xa_train, ya_train, 5, 'f1')

Gaussian NB:
[0.57021277 0.58091286 0.67153285 0.55506608 0.6124031 ] 0.5980255311627326
DecisionTree
[0.55793991 0.52252252 0.54098361 0.59349593 0.53556485] 0.5501013663517649
SVM:
[0.74893617 0.64912281 0.73170732 0.71861472 0.64102564] 0.6978813307887679
XGB Default:
[0.72268908 0.60082305 0.64777328 0.68085106 0.62337662] 0.6551026174912759


In [30]:
svm_clf = svm.SVC(random_state=8)
svm_params = {
    "kernel":['rbf','linear'],
    'C':[0.1,0.2,0.4,0.6,0.8,1,10],
    'gamma': np.logspace(-1,1,9)
}
scorer = make_scorer(fbeta_score,beta=0.5)
svm_gs = GridSearchCV(svm_clf, svm_params, cv=5, scoring=scorer, n_jobs=-1)
svm_gs.fit(Xa_train,ya_train)
best_clf = svm_gs.best_estimator_
print(best_clf)
best_pred = best_clf.predict(Xa_test)

SVC(C=0.2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=8, shrinking=True,
  tol=0.001, verbose=False)


In [31]:
performance = {'accuracy': accuracy_score(best_pred,y_test),
                'recall': recall_score(best_pred,y_test),
                'precision': precision_score(best_pred,y_test)}
print(performance)

{'accuracy': 0.6975, 'recall': 0.7027027027027027, 'precision': 0.6632653061224489}
