In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
import gensim

### Load Data

In [7]:
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
df = pd.read_csv('./data/sarcasm_dataset_clean.tsv', delimiter='\t')
df = df.sample(frac=1).reset_index(drop=True)

FileNotFoundError: [Errno 2] File b'./data/sarcasm_dataset_clean.tsv' does not exist: b'./data/sarcasm_dataset_clean.tsv'

### Word Embedding based features (Joshi)

In [33]:
# Normalization of input specific to extract joshi features
def joshi_normalize(df, lowercase=True ):
    # Tokenize the sentences
    wpt = nltk.WordPunctTokenizer()
    # Stop words list
    stop_words = nltk.corpus.stopwords.words('english')
    df = df.copy()
    # Iterate through pandas dataframe
    for i, row in df.iterrows():
        # row[1] contains the sentence string type = string
        new_str = re.sub(r'[^A-Za-z\s]', '', row[1], re.I|re.A)
        # lowercase (doesnt matter as we are extracting word embedding features only)
        if(lowercase):
            new_str = new_str.lower()
        new_str = new_str.strip()
        # tokenize string
        tokens = wpt.tokenize(new_str)
        # filter stop words
        filtered_tokens = [token for token in tokens if token not in stop_words]
        # join the tokens back as single string
        new_str = ' '.join(filtered_tokens)
        # update dataframe
        df.iat[i, 1] = new_str
    
    # Return dataframe with normalized text
    return df

# Calculate 8 Word Embedding Features both unweighted [0-3] and weighted[4-7]
# Param : 'df' should have text as 2nd column; 'model': gensim model for selected pretrained vectors
def joshi_features(df, model):
    # Tokenizer to tokenize input string
    wpt = nltk.WordPunctTokenizer()
    # final 8 features for every sentence stored as row
    features = []
    
    # As we modify df, we will make a copy of the object
    df = df.copy()
    for ind, row in df.iterrows():
        # extract string
        string = row[1]
        # tokenize
        tokens = wpt.tokenize(string)
        
        # Create sim_matrix 
        length = len(tokens)
        
        # Unweighted similarity matrix
        sim_matrix = np.zeros((length, length), dtype='float32')
        sim_matrix.fill(np.nan)
        
        # Weighted Sim Matrix
        wsim_matrix = np.zeros((length, length), dtype='float32')
        wsim_matrix.fill(np.nan)
        
        # Build Matrix
        for i in range(length):
            for j in range(i+1, length):
                try:
                    sim_matrix[i][j] = model.similarity(tokens[i], tokens[j])
                    if sim_matrix[i][j]==1 :
                        # To remove repeated words 
                        raise Exception
                    sim_matrix[j][i] = sim_matrix[i][j]
                    
                    wsim_matrix[i][j] = np.divide(sim_matrix[i][j], np.square(i-j))
                    wsim_matrix[j][i] = wsim_matrix[i][j]
                    
                except Exception as e:
                    # We fill nan if the word does not exist in model and ignore
                    sim_matrix[i][j] = np.nan
                    sim_matrix[j][i] = np.nan
                    wsim_matrix[i][j] = np.nan
                    wsim_matrix[j][i] = np.nan
        
        # Create dataframe for easier calulations
        sim_df = pd.DataFrame(sim_matrix)
        # Find most similar score for every word
        sim  = sim_df.max(axis=1)
        # Find most dissimilar score for every word
        dsim = sim_df.min(axis=1)
        
        # Create dataframe for easier calculations
        wsim_df = pd.DataFrame(wsim_matrix)
        # Find most similar score for every word
        wsim  = wsim_df.max(axis=1)
        # Find most dissimilar score for every word
        wdsim = wsim_df.min(axis=1)
        
        # Extract 8 features
        ff = [sim.max(axis=0), sim.min(axis=0), dsim.max(axis=0), dsim.min(axis=0), 
              wsim.max(axis=0), wsim.min(axis=0), wdsim.max(axis=0), wdsim.min(axis=0)]
        features.append(ff)
        
        # Print Progress
        print(ind)
    
    # Converting list to numpy array
    feat = np.array(features)
    # Converting numpy array to dataframe with appropriate column names
    feat_df = pd.DataFrame(feat, columns=['max_sim', 'min_sim', 'max_dsim', 'min_dsim', 
                                          'max_wsim', 'min_wsim', 'max_wdsim', 'min_wdsim' ])
    # Testing if nan values exist in final feature matrix
#      feat_df.isnull().any(1).nonzero()[0]
    return feat_df

### Gonzalez Features

In [4]:
# Load LIWC dictionary (32 feature columns + 1 word name) (4487 words)
liwc = pd.read_pickle('./data/liwc32.pkl')

NameError: name 'pd' is not defined

In [103]:
norm_df = joshi_normalize(df)

In [3]:
# extracts 64 feature vector(both incidence and frequence) for each sentence using liwc dictionary
# df needs to be cleaned text
# very basic implementation can be improved later, for now its slow
# returns 2 dataframe
# Portland stemmer to extract stems of words to match with LIWC dictionary
from nltk.stem.porter import *
def liwc_features(df, liwc):
    stemmer = PorterStemmer()
    wpt = nltk.WordPunctTokenizer()
    liwc_f = []
    liwc_i = []
    for ind, row in df.iterrows():
        string = row[1]
        tokens = wpt.tokenize(string)
        feat = []

        for tok in tokens:
            tok_stem = stemmer.stem(tok)
            tok_stem += '*'
            if(tok=='wrote'):
                continue
            if (liwc.index == tok).any():
                temp = liwc.loc[tok]
                temp = list(temp)
                feat.append(temp)
            elif (liwc.index == tok_stem).any():
                temp = liwc.loc[tok_stem]
                temp = list(temp)
                feat.append(temp)
        if(len(feat)==0):
            for i in range(0,32):
                feat.append(0)
            freq = feat
            inci = feat
            liwc_f.append(freq)
            liwc_i.append(inci)
            print(ind)
            continue
        temp_df = pd.DataFrame(feat)
        freq = temp_df.sum(axis=0).to_list()
        inci = []
        for i in freq:
            if i>=1:
                inci.append(1)
            else:
                inci.append(0)
        liwc_f.append(freq)
        liwc_i.append(inci)
        print(ind)
    feature_names = list(liwc.columns.values)
    liwcf_df = pd.DataFrame(liwc_f, columns=feature_names)
    liwci_df = pd.DataFrame(liwc_i, columns=feature_names)
    
    return liwcf_df, liwci_df

In [2]:
fdf, idf = liwc_features(df, liwc)

NameError: name 'liwc_features' is not defined

In [106]:
if(len(fdf.isnull().any(1).to_numpy().nonzero()[0])==0):
    print('No Problems')
if(len(idf.isnull().any(1).to_numpy().nonzero()[0])==0):
    print('No Problems')

No Problems
No Problems


In [107]:
# Save the feature dataframes
fdf.to_pickle('./data/liwc_f.pkl')
idf.to_pickle('./data/liwc_i.pkl')

In [110]:
# Chi2 test to find the most important features
from sklearn.feature_selection import chi2
# Get class labels
labels = df['label']
labels = list(labels)
# chi2 test
x_f, pval1 = chi2(fdf, df['label'])
x_i, pval2 = chi2(idf, df['label'])
# create dataframe for frequency
chi_val = []
chi_val.append(list(x_f))
chi_val.append(list(pval1))
chi2_fdf = pd.DataFrame(chi_val, columns=liwc.columns.values, index=['chi2', 'pval'])
#c create dataframe for incidence
chi_val = []
chi_val.append(list(x_i))
chi_val.append(list(pval2))
chi2_idf = pd.DataFrame(chi_val, columns=liwc.columns.values, index=['chi2', 'pval'])
# 10 Important features
gonz_liwc_f = list(chi2_fdf.loc['pval'].nlargest(10).index)
gonz_liwc_i = list(chi2_idf.loc['pval'].nlargest(10).index)

gonz_liwc_f = fdf[gonz_liwc_f]
gonz_liwc_i = idf[gonz_liwc_i]

# Change Names
old_names = list(gonz_liwc_f.columns)
new_names = []
for old in old_names:
    new_names.append(old+'{Frequency}')
gonz_liwc_f.columns = new_names

# Change Names
old_names = list(gonz_liwc_i.columns)
new_names = []
for old in old_names:
    new_names.append(old+'{Frequency}')
gonz_liwc_i.columns = new_names

['sexual', 'motion', 'negemo', 'bio', 'anx', 'friend', 'ingest', 'space', 'social', 'anger']


### Create four feature setups

In [111]:
# Get unigram features
modified_data = joshi_normalize(df, lowercase=False)

In [112]:
cv = CountVectorizer(min_df=2, max_df=1.)
cv_matrix = cv.fit_transform(modified_data['text'])
cv_matrix = cv_matrix.toarray()
vocab = cv.get_feature_names()
unigrams = pd.DataFrame(cv_matrix, columns=vocab)
gonz_features = pd.concat([unigrams, gonz_liwc_f, gonz_liwc_i], axis=1)
gonz_features['ID'] = range(0,3756)

In [78]:
gonz_features[0,1]

TypeError: Cannot convert bool to numpy.ndarray

In [119]:
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
model = svm.SVC(gamma='scale', class_weight='balanced', C=20.0, cache_size=1000)
# 10 is pseudo random number
kfold = KFold(5, True, 100)
scores = []
data = gonz_features
def classify():
    for train, test in kfold.split(data):
        X_train, X_test, y_train, y_test = data.iloc[train], data.iloc[test], df['label'][train], df['label'][test]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        metric = precision_recall_fscore_support(y_test, y_pred)
        print(y_pred)
        print(y_test)
        print(len(y_pred))
        print(len(y_test))
        print(metric)
        print("Done Partly")
        scores.append(metric)
        break
%timeit classify()
    

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [121]:
model.support_vectors

AttributeError: 'SVC' object has no attribute 'support_vectors'