# Module 7: Linear Support Vector Machine - Practice

In this session, you will practice using a Linear SVM on the **red wine** dataset
with the typical train/validate workflow.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
################################################################
''' Preprocessing steps: 
1. lowercasing 
2. Digit -> DDD 
3. URLs -> removed 
4. @username -> removed 
5. Remove special characters, keep ; . ! ? 
6. normalize elongation 
7. tokenization using tweetNLP
output is ~/Dropbox (QCRI)/AIDR-DA-ALT-SC/data/labeled datasets/prccd_data/{filename}_AIDR_prccd.csv
'''
#################################################################

#=================
#==> Libraries <==
#=================
import re, os
import string 
import sys
import twokenize
import csv
from collections import defaultdict
from os.path import basename
import ntpath
import codecs
import unicodedata

def process(lst):
    """Normalize raw tweet strings.

    Each tweet goes through, in order: lowercasing, collapsing runs of
    spaces, digit masking, URL-like token removal, @mention removal,
    special-character stripping, elongation squeezing, tokenization with
    ``twokenize.simpleTokenize`` and newline removal.

    Args:
        lst: Iterable of tweet strings.

    Returns:
        list: One cleaned string (tokens joined by single spaces) per input
        tweet, in input order.
    """
    # Characters replaced by a space in step 5. The backslash is spelled as
    # an explicit "\\" so the literal contains no invalid escape sequence.
    # The table does not depend on the tweet, so it is built once.
    punc = '@$%^&*()_+-={}[]:"|\'\\~`<>/,'
    trans = str.maketrans(punc, ' ' * len(punc))

    prccd_item_list = []
    for tweet in lst:
        # 1. Lowercasing
        tweet = tweet.lower()

        # Replace multiple spaces with a single space
        tweet = re.sub(' +', ' ', tweet)

        # 2. Normalizing digits: every all-digit token is masked with one
        #    "D" per digit (the replacement is applied to the whole string).
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words if word.isdigit()]:
            tweet = tweet.replace(word, "D" * len(word))

        # 3. Removing URL-like tokens: anything containing "/", or a "."
        #    when the token is longer than 3 characters. The parentheses
        #    only make the existing operator precedence explicit.
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words
                     if '/' in word or ('.' in word and len(word) > 3)]:
            tweet = tweet.replace(word, "")

        # 4. Removing @mentions. startswith() is safe on the empty tokens
        #    that a leading space or a removed URL leaves behind; indexing
        #    word[0] raised IndexError on those, and the surrounding bare
        #    except then skipped mention removal for the whole tweet.
        tweet_words = tweet.strip('\r').split(' ')
        for word in [word for word in tweet_words
                     if word.startswith('@') and len(word) > 1]:
            tweet = tweet.replace(word, "")

        # 5. Removing special characters (each becomes a space)
        tweet = tweet.translate(trans)

        # 6. Normalizing elongation: runs of 3+ identical characters are cut
        #    to 2. This also shortens "DDD..." masks produced in step 2.
        tweet = re.sub(r"(.)\1\1+", r'\1\1', tweet)

        # 7. Tokenization using tweetNLP
        tweet = ' '.join(twokenize.simpleTokenize(tweet))

        # 8. Fix \n char
        tweet = tweet.replace('\n', ' ')

        prccd_item_list.append(tweet.strip())

    return prccd_item_list

In [3]:
######################
# Loading
######################

# Data folders are resolved relative to the parent of the working directory.
code_dir = os.getcwd()
parent_dir = os.path.dirname(code_dir)
print(parent_dir)
labled_data_folder = os.path.join(parent_dir, "Data/crisis_datasets_benchmarks/all_data_en")
initial_filtering_folder = os.path.join(parent_dir, "Data/crisis_datasets_benchmarks/initial_filtering")

# Paths of the three splits.
train_path = os.path.join(
    labled_data_folder,
    "crisis_consolidated_informativeness_filtered_lang_en_train.tsv")
test_path = os.path.join(
    labled_data_folder,
    "crisis_consolidated_informativeness_filtered_lang_en_test.tsv")
dev_path = os.path.join(
    labled_data_folder,
    "crisis_consolidated_informativeness_filtered_lang_en_dev.tsv")

# Only the test split is read with an explicit separator and quoting=3;
# train and dev use the read_table defaults.
# NOTE(review): confirm this difference between the splits is intentional.
train = pd.read_table(train_path)
test = pd.read_table(test_path, sep='\t', quoting=3)
dev = pd.read_table(dev_path)

combinedf = pd.concat([train, test, dev])
df_list_name = ['combinedf', 'train', 'test', 'dev']
df_list = [combinedf, train, test, dev]
proc_list = []

# Add a 'processed_txt' column to every frame, reporting progress per frame.
for i, df in enumerate(df_list):
    current_name = df_list_name[i]
    print("processing: " + current_name)
    df['processed_txt'] = process(df['text'])
    print("processing: " + current_name + " complete")

/Volumes/Elements/DataScience/dsa/capstone
processing: combinedf
processing: combinedf complete
processing: train
processing: train complete
processing: test
processing: test complete
processing: dev
processing: dev complete


## Load dataset

In the following cell, **print out the class distribution** before and after the labels are binarized.

In [10]:
# Dataset location
DATASET = '/dsa/data/all_datasets/wine-quality/winequality-red.csv'
assert os.path.exists(DATASET), "Dataset not found: " + DATASET

# Load and shuffle. The shuffle is seeded so that the row order, and every
# result derived from it, is the same on each Restart & Run All.
SHUFFLE_SEED = 42
dataset = (
    pd.read_csv(DATASET, sep=';')
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

## Create linear SVM model

Collect names for all selected feature columns.

Create feature columns placeholders for TensorFlow SVM.

In [20]:
# Show the container type of feature_columns.
# NOTE(review): feature_columns is not defined in any cell visible above;
# the cell that builds it appears to be missing - restore it before this one.
type(feature_columns)

list

Create a linear classifier.

In [14]:
# Complete code below this comment  (Question #P6003)
# ----------------------------------
# Linear SVM estimator; examples are identified by the 'example_id' feature.
classifier = tf.contrib.learn.SVM(
    example_id_column='example_id',
    feature_columns=feature_columns,
    l2_regularization=1.0,
)

## Training and preparation

Create input_fn() to supply training data for linear SVM.

In [20]:
# Complete code below this comment  (Question #P6004)
# ----------------------------------
def input_fn():
    """Build the training features and labels as TensorFlow constants.

    Reads the notebook-level ``X_train``, ``y_train`` and ``feature_names``.

    Returns:
        tuple: ``(features, labels)``. ``features`` maps each name in
        ``feature_names`` to a constant built from the matching column of
        ``X_train`` (with an axis added at position 1), plus an
        ``'example_id'`` entry holding the strings "1" .. "len(X_train)".
        ``labels`` is a constant built from ``y_train``.
    """
    features = {}
    for col_idx, name in enumerate(feature_names):
        column = np.expand_dims(X_train[:, col_idx], 1)
        features[name] = tf.constant(column)

    # Example ids are 1-based row numbers rendered as strings.
    example_ids = [str(row_number) for row_number in range(1, len(X_train) + 1)]
    features['example_id'] = tf.constant(example_ids)

    return features, tf.constant(y_train)


Train SVM.

In [21]:
# Add code below this comment  (Question #P6005)
# ----------------------------------

# Number of training steps handed to fit().
TRAIN_STEPS = 30
classifier.fit(input_fn=input_fn, steps=TRAIN_STEPS)







SVM(params={'feature_columns': [_RealValuedColumn(column_name='volatile_acidity', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='total_sulfur_dioxide', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='sulphates', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='alcohol', dimension=1, default_value=None, dtype=tf.float32, normalizer=None)], 'weight_column_name': None, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinarySvmHead object at 0x7fa8efec4e10>, 'optimizer': <tensorflow.contrib.linear_optimizer.python.sdca_optimizer.SDCAOptimizer object at 0x7fa8efec44a8>, 'update_weights_hook': <tensorflow.contrib.learn.python.learn.estimators.linear._SdcaUpdateWeightsHook object at 0x7fa8efec41d0>})

## Evaluation

Create a predict_fn() to supply data to make predictions.  
Then call classifier.predict() to create y_pred.

In [22]:
# Complete code below this comment  (Question #P6006)
# ----------------------------------
def predict_fn():
    """Build the test features as TensorFlow constants.

    Reads the notebook-level ``X_test`` and ``feature_names``.

    Returns:
        dict: Maps each name in ``feature_names`` to a constant built from
        the matching column of ``X_test`` (with an axis added at position 1),
        plus an ``'example_id'`` entry holding the strings
        "1" .. "len(X_test)".
    """
    features = {}
    for col_idx, name in enumerate(feature_names):
        column = np.expand_dims(X_test[:, col_idx], 1)
        features[name] = tf.constant(column)

    # Example ids are 1-based row numbers rendered as strings.
    example_ids = [str(row_number) for row_number in range(1, len(X_test) + 1)]
    features['example_id'] = tf.constant(example_ids)

    return features

Then call classifier.predict() to create **y_pred** as predictions.

**Hint**: See LinearSVM lab.

In [23]:
# Add code below this comment  (Question #P6007)
# ----------------------------------

# Raw predictions for the test features supplied by predict_fn.
y_pred = classifier.predict(input_fn=predict_fn)





In [24]:
# Keep only the 'classes' entry of each prediction.
y_pred = [prediction['classes'] for prediction in y_pred]

Feed predictions **y_pred** along with ground truth **y_test** to confusion_matrix() to create a confusion matrix.

In [25]:
# Print the documentation of confusion_matrix (argument order: y_true, y_pred).
help(confusion_matrix)

Help on function confusion_matrix in module sklearn.metrics.classification:

confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)
    Compute confusion matrix to evaluate the accuracy of a classification
    
    By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
    is equal to the number of observations known to be in group :math:`i` but
    predicted to be in group :math:`j`.
    
    Thus in binary classification, the count of true negatives is
    :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
    :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
    
    Read more in the :ref:`User Guide <confusion_matrix>`.
    
    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (correct) target values.
    
    y_pred : array, shape = [n_samples]
        Estimated targets as returned by a classifier.
    
    labels : array, shape = [n_classes], optional
        List of labels to index the m

In [26]:
# Add code below this comment  (Question #P6008)
# ----------------------------------

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)




array([[158,  46],
       [ 54, 142]])

# Save your notebook!