In [5]:
import pandas as pd 
import numpy as np
from nltk import WordPunctTokenizer
from nltk import TreebankWordTokenizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from pprint import pprint

In [6]:
# Load the cleaned sarcasm dataset from disk.
data = pd.read_csv('sarcasm_dataset_clean.csv')

# Pull the two columns used downstream out as plain Python lists;
# the two lists come from the same frame, so they stay index-aligned.
text = data['text'].tolist()      # documents to be vectorised
labels = data['label'].tolist()   # class label of each document

In [7]:
# Optional: if the default token pattern gives poor results, build a tokenizer
# and hand its `tokenize` method to process_data via the `tokenizer` argument:
# tok = TreebankWordTokenizer()
# tokenizer=tok.tokenize

# Minimum total frequency for an n-gram (uni-, bi- or tri-gram);
# n-grams whose frequency is less than this are meant to be pruned.
minfreq = 3

In [8]:
# Show the first document as a quick sanity check of the loaded text.
print(text[0])

Do you remember me telling you we are practicing non-verbal spells, Potter?" "Yes," said Harry stiffly. "Yes, sir." "There's no need to call me "sir" Professor." The words had escaped him before he knew what he was saying.  '


In [34]:
def process_data(text,labels,n_gram,tokenizer=None,min_document_frequency=2,minfreq=3):
    '''
        Tokenizes the dataset and returns the term-frequency matrix of the
        frequent n-grams together with their chi-squared weights.

        Parameters
        -----------
        text : list
            List of strings containing the text to be processed.
        labels : list
            Class label of each entry in ``text`` (same length and order);
            used as the target of the chi-squared test.
        n_gram: int
            The size of the n-grams to extract (1 = unigrams, 2 = bigrams, ...).
        tokenizer: callable, optional
            String tokenizer handed to CountVectorizer. When None the
            vectorizer tokenizes with the ``\\w+`` token pattern.
        min_document_frequency:
            Passed to CountVectorizer as ``min_df``.
        minfreq:
            Minimum total frequency (summed over all documents). N-grams
            whose frequency is less than this are pruned; n-grams occurring
            exactly ``minfreq`` times are kept.

        Returns
        -------
        dataframe : Pandas DataFrame
            The text converted to a term-frequency matrix, one row per
            document and one column per retained n-gram. Columns carry the
            default integer labels; column ``i`` corresponds to ``n_grams[i]``.
        n_grams : list
            The retained n-grams, in the same order as the columns of
            ``dataframe``.
        weights: tuple
            ``(p_value, chi2 value)``, each an array with one entry per
            retained n-gram. Both arrays are empty when no n-gram survives
            the pruning.
    '''
    # Case is preserved (lowercase=False), so e.g. "The" and "the" are
    # counted as different terms.
    vectorizer = CountVectorizer(tokenizer=tokenizer,lowercase=False,
                                 ngram_range=(n_gram,n_gram),
                                 min_df=min_document_frequency,
                                 stop_words='english',
                                 token_pattern='\\w+')
    term_counts = vectorizer.fit_transform(text)         # sparse (documents x n-grams)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # Count on the sparse matrix and densify only the surviving columns,
    # instead of materialising the full vocabulary as a dense array.
    totals = np.asarray(term_counts.sum(axis=0)).ravel()
    indices_to_keep = np.flatnonzero(totals >= minfreq)

    processed_data = term_counts[:, indices_to_keep].toarray()
    # Prune the names with the same indices so they stay aligned with the columns.
    n_grams = [vocabulary[i] for i in indices_to_keep]

    dataframe = pd.DataFrame(processed_data)

    if len(indices_to_keep) == 0:
        # chi2 rejects a matrix without features; report "no weights" instead.
        no_weights = np.array([], dtype=float)
        return dataframe,n_grams,(no_weights,no_weights)

    # sklearn's chi2 returns (chi2 statistics, p-values); reorder to the
    # (p_value, chi2) layout documented above, which callers unpack.
    chi2_scores,p_values = chi2(processed_data,labels)
    weights = (p_values,chi2_scores)

    return dataframe,n_grams,weights

In [42]:
# Unigrams: build the term-frequency matrix using the notebook-level `minfreq`
# threshold (previously the function's own default was used implicitly).
unigrams_matrix,unigrams,(unigram_p,unigram_chi) = process_data(text,labels,1,minfreq=minfreq)
# Scale every column by the second element of the weights tuple, which
# process_data documents as the chi2 value (one weight per column).
unigram_features = unigrams_matrix*unigram_chi

In [41]:
# Bigrams: build the term-frequency matrix using the notebook-level `minfreq`
# threshold (previously the function's own default was used implicitly).
bigrams_matrix,bigrams,(bigram_p,bigram_chi) = process_data(text,labels,2,minfreq=minfreq)
# Scale every column by the second element of the weights tuple, which
# process_data documents as the chi2 value (one weight per column).
bigram_features = bigrams_matrix*bigram_chi

In [40]:
# Trigrams: build the term-frequency matrix using the notebook-level `minfreq`
# threshold (previously the function's own default was used implicitly).
trigrams_matrix,trigrams,(trigram_p,trigram_chi) = process_data(text,labels,3,minfreq=minfreq)
# Scale every column by the second element of the weights tuple, which
# process_data documents as the chi2 value (one weight per column).
trigram_features = trigrams_matrix*trigram_chi