In [1]:
import collections
import collections.abc
import re
import shlex
import subprocess
import time
from pprint import pprint
from weakref import proxy

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.model_selection import KFold
# Does not display warnings
import warnings
warnings.filterwarnings("ignore")



### Download Required Data

#### Download all NLTK packages

In [0]:
# Runs the nltk.downloader module with the `all` target.
# NOTE(review): the only NLTK corpus read in the visible cells is
# nltk.corpus.stopwords, so a narrower download may be enough - confirm.
!python -m nltk.downloader all

#### Download External Dependencies

In [5]:
# Download gdrive bash command
!wget -c https://docs.google.com/uc?id=0B3X9GlR6EmbnQ0FtZmJJUXEyRTA&export=download
!cp 'uc?id=0B3X9GlR6EmbnQ0FtZmJJUXEyRTA' /usr/local/bin/gdrive
!chmod a+x /usr/local/bin/gdrive

--2019-02-25 08:47:47--  https://docs.google.com/uc?id=0B3X9GlR6EmbnQ0FtZmJJUXEyRTA
Resolving docs.google.com (docs.google.com)... 74.125.203.100, 74.125.203.102, 74.125.203.138, ...
Connecting to docs.google.com (docs.google.com)|74.125.203.100|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-48-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/9r6j7n8pbmg5b991p4lgqmr38b6qjqiu/1551081600000/15876260727594163214/*/0B3X9GlR6EmbnQ0FtZmJJUXEyRTA [following]
--2019-02-25 08:47:48--  https://doc-08-48-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/9r6j7n8pbmg5b991p4lgqmr38b6qjqiu/1551081600000/15876260727594163214/*/0B3X9GlR6EmbnQ0FtZmJJUXEyRTA
Resolving doc-08-48-docs.googleusercontent.com (doc-08-48-docs.googleusercontent.com)... 64.233.189.132, 2404:6800:4008:c07::84
Connecting to doc-08-48-docs.googleusercontent.com (doc-08-48-docs.googleusercontent.com)|64.233.189.132|:443... con

In [6]:
# Download Word2Vec Vectors and Verify using any gmail account
# `gdrive download` fetches the Drive file with the given id (the recorded
# output shows it is GoogleNews-vectors-negative300.bin.gz, 1.6 GB) and prompts
# for an OAuth verification code; gunzip then unpacks the archive in place.
!gdrive download 0B7XkCwpI5KDYNlNUTTlSS21pQmM
!gunzip GoogleNews-vectors-negative300.bin.gz

Authentication needed
Go to the following url in your browser:
https://accounts.google.com/o/oauth2/auth?access_type=offline&client_id=367116221053-7n0vf5akeru7on6o2fjinrecpdoe99eg.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=state

Enter verification code: <redacted>
Downloading GoogleNews-vectors-negative300.bin.gz -> GoogleNews-vectors-negative300.bin.gz
Downloaded 0B7XkCwpI5KDYNlNUTTlSS21pQmM at 135.3 MB/s, total 1.6 GB


In [7]:
# Download SVMperf
# Fetches the Linux 64-bit archive, decompresses it, then unpacks the tar.
# The recorded output lists LICENSE.txt, svm_perf_learn and svm_perf_classify,
# which land in the current working directory.
!wget http://download.joachims.org/svm_perf/current/svm_perf_linux64.tar.gz
!gunzip svm_perf_linux64.tar.gz
!tar -xvf svm_perf_linux64.tar

--2019-02-25 08:49:19--  http://download.joachims.org/svm_perf/current/svm_perf_linux64.tar.gz
Resolving download.joachims.org (download.joachims.org)... 81.88.42.187, 81.88.34.174
Connecting to download.joachims.org (download.joachims.org)|81.88.42.187|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://osmot.cs.cornell.edu/svm_perf/current/svm_perf_linux64.tar.gz [following]
--2019-02-25 08:49:20--  http://osmot.cs.cornell.edu/svm_perf/current/svm_perf_linux64.tar.gz
Resolving osmot.cs.cornell.edu (osmot.cs.cornell.edu)... 128.253.51.182
Connecting to osmot.cs.cornell.edu (osmot.cs.cornell.edu)|128.253.51.182|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 150144 (147K) [application/x-gzip]
Saving to: ‘svm_perf_linux64.tar.gz’


2019-02-25 08:49:21 (257 KB/s) - ‘svm_perf_linux64.tar.gz’ saved [150144/150144]

LICENSE.txt
svm_perf_learn
svm_perf_classify


### Load Data

In [0]:
# Upload Dataset
# Colab-only cell: it imports google.colab and keeps the result of
# files.upload() in `uploaded`.
# NOTE(review): the load cells read '../joshi_data/train' and
# '../joshi_data/test'; confirm the uploaded files end up at those paths.
from google.colab import files
uploaded = files.upload()

In [5]:
# Load Dataset to pandas Dataframe
original_data = pd.read_csv('../joshi_data/train', delimiter='\t', names=['text', 'label'])
# Encode the class names as a flat 0/1 array (sarcasm -> 1, philosophy -> 0).
# A 1-D numpy array is what the training cells need: they index the labels with
# the integer position arrays produced by KFold (`labels[train]`), which does
# not work on a one-column DataFrame. Any other class name maps to NaN.
labels = original_data['label'].map({'sarcasm': 1, 'philosophy': 0}).to_numpy()

In [7]:
# Test Dataset
test_data = pd.read_csv('../joshi_data/test', delimiter='\t', names=['text', 'label'])
# Same encoding as the training labels: sarcasm -> 1, philosophy -> 0, returned
# as a flat numpy array so it can be indexed by position. Any other class name
# maps to NaN.
labels_test = test_data['label'].map({'sarcasm': 1, 'philosophy': 0}).to_numpy()

In [0]:
# Shuffle if required
data_shuffle = original_data.sample(frac=1).reset_index(drop=True)
# The feature cells below read `original_data`, so it has to be rebound to the
# shuffled frame; otherwise features come from the unshuffled rows while the
# labels come from the shuffled ones and the two no longer line up.
original_data = data_shuffle
# Keep the 0/1 encoding (sarcasm -> 1, philosophy -> 0): the SVMperf writer
# computes 2*labels - 1, which fails on the raw class-name strings.
labels = data_shuffle['label'].map({'sarcasm': 1, 'philosophy': 0}).to_numpy()

In [9]:
# Load Pretrained Vectors
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [0]:
# List the current working directory (presumably to check the downloads above).
!ls

### Preprocessing

In [0]:
def process_data(text,labels,n_gram,tokenizer=None,min_document_frequency=2,minfreq=3):
    '''
        Tokenizes the dataset and returns the term-frequency matrix of the frequent n-grams

        Parameters
        -----------
        text : list
            List of strings containing the text to be processed.
        labels : array-like
            Class label of every document; only used for the chi2 scores.
        n_gram: int
            The size of the n-gram to be returned
        tokenizer:
            String tokenizer to be used
        min_document_frequency:
            The minimum document frequency to be used to prune n-grams
        minfreq:
            Frequency threshold used to prune n-grams: only n-grams whose total
            count is strictly greater than this value are kept

        Returns
        -------
        dataframe : Pandas DataFrame
            The text converted as a term-frequency matrix (columns are numbered
            0..k-1 and follow the order of ``n_grams``)
        n_grams : list
            The list of n-grams that were kept
        weights: tuple
            (chi2 statistics, p-values) for the kept n-grams, in the order
            returned by sklearn's ``chi2``
    '''
    vectorizer = CountVectorizer(tokenizer=tokenizer,lowercase=False,
                                 ngram_range=(n_gram,n_gram),
                                 min_df=min_document_frequency,
                                 stop_words='english',
                                 token_pattern='\\w+')

    # Keep the counts sparse until the rare n-grams are dropped, so the full
    # documents x vocabulary matrix is never materialised densely.
    term_counts = vectorizer.fit_transform(text)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    counts = np.asarray(term_counts.sum(axis=0)).ravel()
    indices_to_keep = np.flatnonzero(counts > minfreq)

    processed_data = term_counts[:, indices_to_keep].toarray()
    n_grams = [str(vocabulary[i]) for i in indices_to_keep]
    dataframe = pd.DataFrame(processed_data)

    weights = chi2(processed_data,labels)

    return dataframe,n_grams,weights

### Word Embedding based features (Joshi)

In [10]:

class Link(object):
    'Node of the doubly linked list that records insertion order'
    # prev/next neighbours, the stored key, and weak-reference support so that
    # neighbours can point at a link through a weakref proxy.
    __slots__ = 'prev', 'next', 'key', '__weakref__'

class OrderedSet(collections.abc.MutableSet):
    'Set that remembers the order elements were added'
    # Big-O running times for all methods are the same as for regular sets.
    # The internal self.__map dictionary maps keys to links in a doubly linked list.
    # The circular doubly linked list starts and ends with a sentinel element.
    # The sentinel element never gets deleted (this simplifies the algorithm).
    # The prev/next links are weakref proxies (to prevent circular references).
    # Individual links are kept alive by the hard reference in self.__map.
    # Those hard references disappear when a key is deleted from an OrderedSet.
    #
    # The base class is collections.abc.MutableSet: the collections.MutableSet
    # alias no longer exists on Python 3.10+.

    def __init__(self, iterable=None):
        self.__root = root = Link()         # sentinel node for doubly linked list
        root.prev = root.next = root
        self.__map = {}                     # key --> link
        if iterable is not None:
            self |= iterable

    def __len__(self):
        return len(self.__map)

    def __contains__(self, key):
        return key in self.__map

    def add(self, key):
        # Store new key in a new link at the end of the linked list
        if key not in self.__map:
            self.__map[key] = link = Link()
            root = self.__root
            last = root.prev
            link.prev, link.next, link.key = last, root, key
            last.next = root.prev = proxy(link)

    def discard(self, key):
        # Remove an existing item using self.__map to find the link which is
        # then removed by updating the links in the predecessor and successors.
        if key in self.__map:
            link = self.__map.pop(key)
            link.prev.next = link.next
            link.next.prev = link.prev

    def __iter__(self):
        # Traverse the linked list in order.
        root = self.__root
        curr = root.next
        while curr is not root:
            yield curr.key
            curr = curr.next

    def __reversed__(self):
        # Traverse the linked list in reverse order.
        root = self.__root
        curr = root.prev
        while curr is not root:
            yield curr.key
            curr = curr.prev

    def pop(self, last=True):
        # Remove and return the newest element (or the oldest when last=False).
        if not self:
            raise KeyError('set is empty')
        key = next(reversed(self)) if last else next(iter(self))
        self.discard(key)
        return key

    def __repr__(self):
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, list(self))

    def __eq__(self, other):
        # Two OrderedSets are equal when they hold the same keys in the same order.
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        # Against any other iterable the comparison is order-insensitive. The
        # previous `not self.isdisjoint(other)` reported equality as soon as a
        # single element was shared.
        try:
            return set(self) == set(other)
        except TypeError:
            # `other` is not iterable (or holds unhashable items): not comparable.
            return NotImplemented
      
# Normalization of input specific to extract joshi features
def joshi_normalize(df, lowercase=True ):
    """
    Return a copy of ``df`` whose second column holds the normalized text.

    Every string in the second column is passed through the regex substitution
    below, optionally lowercased, stripped, tokenized with WordPunctTokenizer,
    filtered against the NLTK English stop-word list, and joined back with
    single spaces.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose second column (position 1) contains the sentence strings.
    lowercase : bool
        Lowercase the text before tokenizing (default True).

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the second column replaced; ``df`` is not modified.
    """
    # Tokenize the sentences
    wpt = nltk.WordPunctTokenizer()
    # Stop words as a set: same membership test, constant-time lookup
    stop_words = set(nltk.corpus.stopwords.words('english'))
    df = df.copy()
    # Rows are addressed by position, so this also works when the index is not 0..n-1
    for pos in range(len(df)):
        # The flags must be passed by keyword: the fourth positional argument
        # of re.sub is `count`, so they were previously read as a replacement limit.
        # NOTE(review): as written the pattern matches a literal backslash
        # followed by one or more "w" characters; if the intent was to strip
        # (non-)word characters the pattern itself needs changing - confirm.
        new_str = re.sub(r'\\w+', '', df.iat[pos, 1], flags=re.I|re.A)
        # lowercase (doesnt matter as we are extracting word embedding features only)
        if lowercase:
            new_str = new_str.lower()
        new_str = new_str.strip()
        # tokenize string and drop stop words
        filtered_tokens = [token for token in wpt.tokenize(new_str) if token not in stop_words]
        # join the tokens back as single string and update the copy
        df.iat[pos, 1] = ' '.join(filtered_tokens)

    # Return dataframe with normalized text
    return df

# Calculate 8 Word Embedding Features both unweighted [0-3] and weighted[4-7]
# Param : 'df' should have text as 2nd column; 'model': gensim model for selected pretrained vectors
def joshi_features(df, model):
    """
    Compute 8 word-embedding similarity features for every row of ``df``.

    For each sentence a symmetric token-by-token similarity matrix is built
    from ``model.similarity``; a second matrix divides each similarity by the
    squared distance between the two token positions. Pairs for which the model
    has no vector, and pairs whose similarity equals 1 (repeated words), are
    left as NaN, as is the diagonal.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose second column (position 1) contains the sentence text.
    model :
        Object exposing ``similarity(word_a, word_b)`` (a gensim model for the
        selected pretrained vectors).

    Returns
    -------
    pd.DataFrame
        One row per sentence with the columns max_sim, min_sim, max_dsim,
        min_dsim (unweighted) and max_wsim, min_wsim, max_wdsim, min_wdsim
        (weighted). Values are NaN when a sentence has no usable word pair.
    """
    # Tokenizer to tokenize input string
    wpt = nltk.WordPunctTokenizer()
    # final 8 features for every sentence stored as row
    features = []

    for ind, row in df.iterrows():
        # .iloc makes the "second column" lookup explicit instead of relying on
        # the positional fallback of row[1] on a label-indexed Series.
        tokens = wpt.tokenize(row.iloc[1])
        length = len(tokens)

        # Unweighted and position-weighted similarity matrices, NaN by default
        sim_matrix = np.full((length, length), np.nan, dtype='float32')
        wsim_matrix = np.full((length, length), np.nan, dtype='float32')

        # Build both matrices from the upper triangle
        for i in range(length):
            for j in range(i+1, length):
                try:
                    score = np.float32(model.similarity(tokens[i], tokens[j]))
                except KeyError:
                    # Word missing from the model: leave the pair as NaN
                    continue
                if score == 1:
                    # To remove repeated words
                    continue
                sim_matrix[i, j] = sim_matrix[j, i] = score

                # Finding semantical distance {POINT TO IMPROVE}
                weighted = np.divide(score, np.square(i-j))
                wsim_matrix[i, j] = wsim_matrix[j, i] = weighted

        # Per-word most similar / most dissimilar scores (NaN entries are skipped)
        sim_df = pd.DataFrame(sim_matrix)
        sim  = sim_df.max(axis=1)
        dsim = sim_df.min(axis=1)

        wsim_df = pd.DataFrame(wsim_matrix)
        wsim  = wsim_df.max(axis=1)
        wdsim = wsim_df.min(axis=1)

        # Extract 8 features
        ff = [sim.max(axis=0), sim.min(axis=0), dsim.max(axis=0), dsim.min(axis=0),
              wsim.max(axis=0), wsim.min(axis=0), wdsim.max(axis=0), wdsim.min(axis=0)]
        features.append(ff)

        # Print Progress
        print(ind)

    # Converting list to dataframe with appropriate column names
    feat_df = pd.DataFrame(np.array(features),
                           columns=['max_sim', 'min_sim', 'max_dsim', 'min_dsim',
                                    'max_wsim', 'min_wsim', 'max_wdsim', 'min_wdsim' ])
    return feat_df

In [29]:
tokenizer = RegexpTokenizer(r'\w+')
def joshi_features_new(df, model):
    """
    Compute the 8 word-embedding similarity features for every row of ``df``.

    Each sentence in ``df['text']`` is tokenized, de-duplicated in order of
    first appearance and lowercased. Tokens known to ``model`` are compared
    pairwise with cosine similarity; the weighted variant divides each
    similarity by the squared distance between the token positions. A word is
    never compared with itself.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'text' column holding the sentences.
    model :
        Mapping from word to vector that raises KeyError for unknown words
        (e.g. gensim KeyedVectors).

    Returns
    -------
    pd.DataFrame
        One row per sentence with the columns max_sim, min_sim, max_dsim,
        min_dsim, max_wsim, min_wsim, max_wdsim, min_wdsim. Sentences with fewer
        than two known words have no word pair; their features are filled
        with 0 along with any other NaN.
    """
    feature_names = ['max_sim', 'min_sim', 'max_dsim', 'min_dsim',
                     'max_wsim', 'min_wsim', 'max_wdsim', 'min_wdsim']
    # final 8 features for every sentence stored as row
    sws_features = []

    for ind, row in df.iterrows():
        # tokenize, drop exact repeats (order preserved), then lowercase
        tokens = [tok.lower() for tok in OrderedSet(tokenizer.tokenize(row['text']))]

        # Collect one vector per distinct in-vocabulary token
        X = []
        accepted_tokens = []
        for tok in tokens:
            if tok in accepted_tokens:
                continue
            try:
                X.append(model[tok])
            except KeyError:
                # Word has no pretrained vector: ignore it
                continue
            accepted_tokens.append(tok)

        if len(accepted_tokens) < 2:
            # No word pair to compare; cosine_similarity would also reject an
            # empty input. The NaN row is turned into zeros further down.
            sws_features.append([np.nan] * len(feature_names))
            print(ind)
            continue

        sim_matrix = cosine_similarity(X)
        # Mask self-similarity; left in place it pins the per-word maximum
        # (and therefore max_sim and min_sim) at 1 for every sentence.
        np.fill_diagonal(sim_matrix, np.nan)

        pos = [[tokens.index(tok)] for tok in accepted_tokens]
        weights = euclidean_distances(pos, squared=True)
        # Distance of a token to itself is 0; mask it instead of dividing by it
        np.fill_diagonal(weights, np.nan)
        wsim_df = pd.DataFrame(sim_matrix/weights)

        sim_df = pd.DataFrame(sim_matrix)
        sim = sim_df.max(axis=1)
        dsim = sim_df.min(axis=1)
        # Find most similar score for every word
        wsim  = wsim_df.max(axis=1)
        # Find most dissimilar score for every word
        wdsim = wsim_df.min(axis=1)

        # Extract 8 features
        ff = [sim.max(axis=0), sim.min(axis=0), dsim.max(axis=0), dsim.min(axis=0),
              wsim.max(axis=0), wsim.min(axis=0), wdsim.max(axis=0), wdsim.min(axis=0)]
        sws_features.append(ff)
        # Print Progress
        print(ind)

    # Converting list to dataframe with appropriate column names
    feat_df = pd.DataFrame(sws_features, columns=feature_names)
    # Testing if nan values exist in final feature matrix
    if not feat_df.isnull().values.any():
        print("No Nans")
    else:
        feat_df = feat_df.fillna(0)

    return feat_df

### Liebrecht Features

In [0]:
### Add your stuff here

### Gonzalez Features

** liwc_features(): **

Extracts a 32-feature vector (both incidence and frequency) for each sentence using the LIWC dictionary.

*  *df needs to be cleaned text*
*  *Very basic implementation; it can be improved later, for now it's slow*
*   *Porter stemmer is used to extract stems of words to match with the LIWC dictionary*

Returns 2 dataframes (Frequency, Incidence)

In [0]:
def liwc_features(df, liwc):
    """
    Build LIWC frequency and incidence features for every row of ``df``.

    Each token is looked up in the index of ``liwc`` first as-is and, failing
    that, as its Porter stem followed by '*'. The rows of all matched entries
    are summed per category (frequency); incidence is 1 where the frequency is
    at least 1 and 0 otherwise. The token 'wrote' is skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned text. The 'text' column is used when present (that is the
        layout get_gonz_features passes in); otherwise the second column.
    liwc : pd.DataFrame
        LIWC dictionary indexed by word / stem pattern, one column per category.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Frequency and incidence frames, one row per input row and one column
        per LIWC category.
    """
    stemmer = PorterStemmer()
    wpt = nltk.WordPunctTokenizer()
    feature_names = list(liwc.columns.values)
    vocabulary = liwc.index
    # Reading position 1 unconditionally picked up the label column of a
    # ['text', 'label'] frame, so prefer the named column when it exists.
    text_col = 'text' if 'text' in df.columns else df.columns[1]

    liwc_f = []
    liwc_i = []
    for ind, row in df.iterrows():
        matches = []
        for tok in wpt.tokenize(row[text_col]):
            if tok == 'wrote':
                # Ignore this token
                continue
            # `in` on the index is a hash lookup; the previous element-wise
            # comparison scanned the whole dictionary for every token.
            if tok in vocabulary:
                matches.append(list(liwc.loc[tok]))
                continue
            tok_stem = stemmer.stem(tok) + '*'
            if tok_stem in vocabulary:
                matches.append(list(liwc.loc[tok_stem]))

        if matches:
            freq = pd.DataFrame(matches).sum(axis=0).to_list()
        else:
            # No dictionary hit: every category count is zero
            freq = [0] * len(feature_names)
        inci = [1 if count >= 1 else 0 for count in freq]

        liwc_f.append(freq)
        liwc_i.append(inci)
        print(ind)

    liwcf_df = pd.DataFrame(liwc_f, columns=feature_names)
    liwci_df = pd.DataFrame(liwc_i, columns=feature_names)

    return liwcf_df, liwci_df

In [0]:
  def get_gonz_features(df,labels, liwc, n_features=0):
    """
    Assemble the Gonzalez feature matrix: unigram counts plus LIWC features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'text' column; it is also handed to liwc_features.
    labels : array-like
        Class label of every row, used for the chi2 scores.
    liwc : pd.DataFrame
        LIWC dictionary (index: word / stem pattern, columns: categories).
    n_features : int
        When non-zero, keep only the ``n_features`` LIWC categories with the
        largest chi2 score, selected separately for frequency and incidence.
        0 keeps every category.

    Returns
    -------
    pd.DataFrame
        Columns are the unigrams, then the incidence features (names suffixed
        with '{Incidence}'), then the frequency features ('{Frequency}').

    Side effects
    ------------
    Writes the raw LIWC frames to 'liwc_f.pkl' and 'liwc_i.pkl' and prints
    progress / sanity-check messages.
    """
    # Load Unigrams
    bag_of_words, unigrams, _ = process_data(df['text'], labels, n_gram=1)

    # Get the frequency and incidence features from LIWC
    liwcf_raw, liwci_raw = liwc_features(df, liwc)

    # Check for NAN values
    if not liwcf_raw.isnull().values.any():
      print('No Problems')
    if not liwci_raw.isnull().values.any():
      print('No Problems')

    # Save the feature dataframes
    liwcf_raw.to_pickle('liwc_f.pkl')
    liwci_raw.to_pickle('liwc_i.pkl')

    ## chi2 test
    weightsf = chi2(liwcf_raw, labels)
    weightsi = chi2(liwci_raw, labels)
    # create dataframe for frequency
    chi2_fdf = pd.DataFrame(weightsf, columns=liwc.columns.values, index=['chi2', 'pval'])
    # create dataframe for incidence
    chi2_idf = pd.DataFrame(weightsi, columns=liwc.columns.values, index=['chi2', 'pval'])

    # Keep only the n_features highest-scoring categories when requested
    if n_features != 0:
      top_f = list(chi2_fdf.loc['chi2'].nlargest(n_features).index)
      top_i = list(chi2_idf.loc['chi2'].nlargest(n_features).index)
      gonz_liwc_f = liwcf_raw[top_f]
      gonz_liwc_i = liwci_raw[top_i]
    else:
      gonz_liwc_f = liwcf_raw
      gonz_liwc_i = liwci_raw

    # Change Names; rename returns a new frame, so the raw frames keep their names
    gonz_liwc_f = gonz_liwc_f.rename(columns=lambda old: old+'{Frequency}')
    gonz_liwc_i = gonz_liwc_i.rename(columns=lambda old: old+'{Incidence}')

    # Label the count columns with their unigrams. Passing the existing frame
    # to pd.DataFrame(..., columns=unigrams) would select columns by those
    # names instead, and since the frame's columns are 0..k-1 the result was
    # all NaN.
    bag_of_words = pd.DataFrame(bag_of_words.to_numpy(), columns=unigrams)
    gonz_features = pd.concat([bag_of_words, gonz_liwc_i, gonz_liwc_f], axis=1)

    return gonz_features

In [0]:
# Load LIWC dictionary (32 feature columns + 1 word name) (4487 words)
# NOTE(review): unpickling can execute arbitrary code - only load a
# 'liwc32.pkl' that comes from a trusted source.
liwc = pd.read_pickle('liwc32.pkl')

### Create Feature Sets

In [0]:
def getFsets(base_df, joshi_df):
  """
  Build the four feature-set variants from a base feature frame.

  Parameters
  ----------
  base_df : pd.DataFrame
      Baseline features.
  joshi_df : pd.DataFrame
      The 8 similarity features: columns 0-3 unweighted, columns 4-7 weighted.

  Returns
  -------
  tuple of pd.DataFrame
      (base, base + unweighted, base + weighted, base + all eight); the added
      columns are concatenated to the right of ``base_df``.
  """
  fset1 = base_df
  fset2 = pd.concat([base_df, joshi_df.iloc[:, 0:4]], axis=1)
  # `joshi_dd` was a typo for `joshi_df` and raised NameError on every call
  fset3 = pd.concat([base_df, joshi_df.iloc[:, 4:8]], axis=1)
  fset4 = pd.concat([base_df, joshi_df], axis=1)

  return fset1, fset2, fset3, fset4

In [0]:
# Will always need this
# (`orginal_data` was a typo for `original_data` and raised NameError.)
wembedding_similarity_features = joshi_features_new(original_data, word2vec_model)

#### Gonzalez Feature Sets

In [0]:
# Upload LIWC features pkl
# NOTE(review): 'liwc32.pkl' is read by an earlier cell, so on a fresh kernel
# this upload presumably has to run before that cell - confirm the cell order.
from google.colab import files
files.upload()

In [0]:
# Build the Gonzalez feature matrix for the training data, keeping the 10 LIWC
# categories with the largest chi2 score (chosen separately for frequency and
# incidence).
gonz_features = get_gonz_features(original_data, labels, liwc, n_features=10)

In [0]:
# Four variants of the Gonzalez features: alone, plus the first four similarity
# columns, plus the last four, and plus all eight.
gonz, gonz_s, gonz_w, gonz_sw = getFsets(gonz_features, wembedding_similarity_features)

#### Liebrecht Feature Sets

In [0]:
# Put your code here

### Training 

#### SVMperf

In [0]:
def shuffle(df):
    '''
        Returns a row-shuffled copy of a pandas DataFrame

        Parameters
        -----------
        df : pd.DataFrame
            Dataframe to be shuffled; it is left untouched

        Returns
        -------
        Pandas DataFrame
            Copy of df with its rows in random order and a fresh 0..n-1 index
    '''
    # frac=1 samples every row exactly once, i.e. a random permutation
    return df.copy().sample(frac=1).reset_index(drop=True)

# For SVMperf

def store(file_name,data,labels):
    '''
        Writes feature vectors to <file_name>.dat in the SVMperf input format

        Format of each line:
            <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
            <target> .=. {+1,-1}
            <feature> .=. <integer>
            <value> .=. <float>
            <info> .=. <string>

        Parameters
        -----------
        file_name : str
            Output path without the '.dat' extension
        data : pd.DataFrame
            Feature vectors with integer column labels; column c is written as
            feature c+1 and zero-valued entries are omitted
        labels : array-like
            0/1 label per row, in the same order as the rows of data

        Returns
        -------
        final_string : str
            The text that was written, plus a trailing newline
    '''
    # Accept lists / Series as well as arrays and map {0,1} to {-1,+1}
    targets = 2*np.asarray(labels) - 1
    lines = []
    # Labels are paired with rows by position, so the frame's index values
    # do not have to be 0..n-1.
    for position, (_, row) in enumerate(data.iterrows()):
        parts = ['+1' if targets[position] == 1 else '-1']
        for index in data.columns:
            value = row[index]
            if value == 0:
                continue
            parts.append(str((index+1))+':'+str(value))
        lines.append(' '.join(parts)+' \n')
    # Join once at the end instead of growing one string line by line
    final_string = ''.join(lines)
    with open(file_name+'.dat','w') as f :
        # The file is written without the final newline
        f.write(final_string[:-1])
        print('Finished Storing')
        return final_string

def customKfold(data, labels, n_folds=5, random_seed=99):
    '''
        Stores train and test .dat files for each fold

        The files are named train_<k>.dat and test_<k>.dat with k running from
        1 to n_folds. Rows are assigned to folds by a shuffled KFold split
        seeded with random_seed, so the same seed reproduces the same folds.

        Parameters
        -----------
        data : pd.DataFrame
            Feature vectors
        labels: list
            True Labels (0/1), one per row of data
        n_folds:
            Number of folds required
        random_seed:
            Random seed to generate the split
    '''
    # A flat array so the position arrays from KFold can index it directly
    labels = np.asarray(labels).ravel()
    # n_splits is the only argument KFold accepts positionally in current
    # scikit-learn; shuffle and random_state must be passed by keyword.
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for iteration, (train, test) in enumerate(kfold.split(data), start=1):
        print(iteration)
        # Rebuild the frames from raw values: columns and index are renumbered
        # from 0, which is what store() expects for the feature ids.
        X_train = pd.DataFrame(np.array(data.iloc[train]))
        X_test = pd.DataFrame(np.array(data.iloc[test]))
        store('train_'+str(iteration), X_train, labels[train])
        store('test_'+str(iteration), X_test, labels[test])

In [0]:
# Write the per-fold train/test .dat files.
# NOTE(review): `joshi_df_new` is not assigned anywhere in the visible
# notebook - presumably the similarity feature frame; confirm before running.
customKfold(joshi_df_new, labels)

In [0]:
# Set this for number of Folds to run
N_FOLDS = 5
output_list = []
# customKfold numbers its files from 1, so the folds run 1..N_FOLDS
for i in range(1, N_FOLDS + 1):
  # The space before "model_" keeps the training file and the model file as
  # two separate arguments (they were previously fused into one file name).
  bashCommand = "./svm_perf_learn -c 20.0 -l 1 -w 3 "+"train_"+str(i)+".dat"+" model_"+str(i)+".dat"
  # stderr is piped as well so that `error` actually holds the error output
  process = subprocess.Popen(shlex.split(bashCommand), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  output, error = process.communicate()
  output_list.append([output, error])

output_list

In [0]:
# Manual run for fold 1 with the same flags as the loop: train on train_1.dat
# and write the model to model_1.dat.
!./svm_perf_learn -c 20.0 -l 1 -w 3 train_1.dat model_1.dat

In [0]:
# Classify the fold-1 test file with the fold-1 model; the third argument
# (`predictions`) names the output file.
!./svm_perf_classify test_1.dat model_1.dat predictions

#### SVMrbf 

In [0]:
# Initialize model
model = svm.SVC(gamma='scale', class_weight='balanced', C=20.0, cache_size=1000)

def classify(data, labels):
    """
    Run 5-fold cross-validation of the global ``model`` and collect the metrics.

    Parameters
    ----------
    data : pd.DataFrame
        Feature vectors, one row per sample.
    labels : array-like
        Class label of every row of ``data``.

    Returns
    -------
    list
        One entry per fold: the (precision, recall, fscore, support) tuple
        returned by precision_recall_fscore_support for that fold's test split.

    Notes
    -----
    The module-level ``model`` is refitted in place on every fold, so after the
    call it holds the fit from the last fold only.
    """
    # A flat array so the position arrays from KFold can index it directly
    labels = np.asarray(labels).ravel()
    # 100 is the fixed seed of the shuffled split. shuffle/random_state must be
    # passed by keyword: current scikit-learn rejects them as positional arguments.
    kfold = KFold(n_splits=5, shuffle=True, random_state=100)
    scores = []
    for i, (train, test) in enumerate(kfold.split(data)):
        print(i)
        X_train, X_test = data.iloc[train], data.iloc[test]
        y_train, y_test = labels[train], labels[test]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores.append(precision_recall_fscore_support(y_test, y_pred))
        print("Done Interation: %d"%(i))
    return scores

In [0]:
# The results cells below read the per-fold metrics from `scores`, which was
# never bound; `score` is kept as an alias because a later cell prints it.
scores = classify(gonz_features, labels)
score = scores

In [0]:
from sklearn.metrics import confusion_matrix
x = 1027
# NOTE(review): `model` holds the fit from the last cross-validation fold and
# is applied here to every row, including rows it was trained on - confirm
# this is the intended evaluation.
y_pred = model.predict(gonz_features)
confusion = confusion_matrix(labels, y_pred)
# Use the `pprint` function imported at the top of the notebook. Re-importing
# the module here rebound the name and broke any later bare pprint(...) call.
pprint(confusion)

### Results

In [0]:
# Per-fold precision: element [0] of each fold's metric tuple, class at index 1
plt.title('precision')
plt.plot(list(range(len(scores))), [fold_metrics[0][1] for fold_metrics in scores])
plt.show()

In [0]:
# Dump the raw per-fold metrics collected by classify()
print(score)

In [0]:
score = scores
# Per-fold recall: element [1] of each fold's metric tuple, class at index 1
plt.title('recall')
plt.plot(list(range(len(scores))), [fold_metrics[1][1] for fold_metrics in scores])
plt.show()

In [0]:
score = scores
# Per-fold F-score: element [2] of each fold's metric tuple, class at index 1
plt.title('F-score')
plt.plot(list(range(len(scores))), [fold_metrics[2][1] for fold_metrics in scores])
plt.show()

In [0]:
score = scores
# F-score of the class at index 1 for every fold, then their mean
f_scores = [score[2][1] for score in scores]
print(np.array(f_scores).mean())
# confidence intervals
# The bounds are the 2.5th and 97.5th percentiles of the per-fold F-scores,
# clipped to [0, 1]; they are printed as percentages.
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f_scores, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f_scores, p))
print('%.1f confidence interval: %.1f and %.1f' % (alpha*100, lower*100, upper*100))