###  Created by Luis Alejandro (alejand@umich.edu)
Applies the Mutual Information (MI) ranking criterion to the text emotions dataset

In [1]:
import pandas as pd
import re
import time
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

import sys
sys.path.append('../')
from utils.nlp.vectorizer import Vectorizer
from utils.feature_selection.mutual import MutualInfo
from utils.feature_selection.reports import report_feature_ranking

In [2]:
# Importing the dataset
dataset = pd.read_csv('../../datasets/classification/text_emotions/train_data.csv')
corpus = dataset.iloc[:,1].values
responses = dataset.iloc[:,0].values
print(np.unique(responses))
lc = LabelEncoder()
responses = lc.fit_transform(responses)
min_freq = 20
max_freq = 10000
# Conditioning text and generates dictionary of word
dictionary = dict()
for i in range(len(corpus)):
    corpus[i] = corpus[i].lower()
    corpus[i] = re.sub(r'[^a-z\'\s]',' ',corpus[i])
    corpus[i] = re.sub(r'[^a-z]+[\']|[\'][^a-z]+',' ',corpus[i])

['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']


In [3]:
def extract_features_custom (corpus):
    start = time.perf_counter()
    # ------------------------------------------------------
    cv = Vectorizer(corpus,min_freq=min_freq,max_freq=max_freq)
    X = cv.fit()
    # ------------------------------------------------------   
    end = time.perf_counter()
    print(end - start, ' seconds')

    # Converts matrix to nparray
    X = X.toarray()
    
    # Outputs validation
    cv.output_validation()
    
    return X,cv

In [4]:
# Extracts the features using built-in funciton in sklearn
def extract_features_sklearn (corpus):
    start = time.perf_counter()
    # ------------------------------------------------------
    cv = CountVectorizer(min_df = min_freq + 1, max_df = max_freq - 1)
    X = cv.fit_transform(corpus)
    # ------------------------------------------------------   
    end = time.perf_counter()
    print(end - start, ' seconds')

    # Converts matrix to nparray
    X = X.toarray()

    # Verifies data matrix
    print(sum(X[0,:]))
    print(corpus[0])
    
    return X,cv

In [5]:
# Split dataset into train and test sets
predictors,cv = extract_features_sklearn(corpus)
print(predictors.shape)
predictors,cv = extract_features_custom(corpus)
print(predictors.shape)
predictors_filtered = predictors
#predictors_filtered[predictors_filtered > 1] = 1
X, X_holdout, y, y_holdout = train_test_split(predictors_filtered,responses,test_size = 0.20,stratify=responses)

0.39993029999999985  seconds
11
 tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part   
(30000, 1579)
0.3602251000000001  seconds
143
item: achieving a new appreciation on how a xml build script can really be painful and cumbersome
column: [ 13 139  15 115 356 327  46 520   4]
data: [2 1 1 1 1 1 1 1 1]
a
new
on
how
can
really
be
painful
and
(30000, 1640)


In [6]:
# Computes MI with custom implementation
start = time.perf_counter()
mi = MutualInfo(X,y,n_jobs=4)
mi.compute()
end = time.perf_counter()
print(end - start, ' seconds')

Using parallel version
3.2090814  seconds


In [7]:
# Reports result
report_feature_ranking(mi.info, cv.words, 20)

Feature ranked 1 is (love) with value 0.029847
Feature ranked 2 is (sad) with value 0.019956
Feature ranked 3 is (hate) with value 0.018577
Feature ranked 4 is (happy) with value 0.011805
Feature ranked 5 is (thanks) with value 0.010093
Feature ranked 6 is (miss) with value 0.009972
Feature ranked 7 is (good) with value 0.008407
Feature ranked 8 is (sorry) with value 0.008406
Feature ranked 9 is (you) with value 0.008103
Feature ranked 10 is (my) with value 0.007829
.
.
.

Feature ranked 1631 is (drink) with value 0.000150
Feature ranked 1632 is (living) with value 0.000148
Feature ranked 1633 is (wanting) with value 0.000132
Feature ranked 1634 is (action) with value 0.000130
Feature ranked 1635 is (thursday) with value 0.000124
Feature ranked 1636 is (article) with value 0.000120
Feature ranked 1637 is (sent) with value 0.000117
Feature ranked 1638 is (camera) with value 0.000109
Feature ranked 1639 is (street) with value 0.000107
Feature ranked 1640 is (decided) with value 0.000101


In [8]:
# Computes MI with sklearn implementation
start = time.perf_counter()
mi = mutual_info_classif(X,y)
end = time.perf_counter()
print('Elpased time:', end-start)

Elpased time: 309.67578210000005


In [9]:
# Reports result
report_feature_ranking(mi, cv.words, 20)

Feature ranked 1 is (sad) with value 0.016675
Feature ranked 2 is (hate) with value 0.016635
Feature ranked 3 is (love) with value 0.015798
Feature ranked 4 is (bye) with value 0.013701
Feature ranked 5 is (thats) with value 0.012516
Feature ranked 6 is (pay) with value 0.012277
Feature ranked 7 is (always) with value 0.012038
Feature ranked 8 is (breakfast) with value 0.011655
Feature ranked 9 is (stupid) with value 0.011629
Feature ranked 10 is (out) with value 0.011206
.
.
.

Feature ranked 1631 is (chinese) with value 0.000000
Feature ranked 1632 is (parents) with value 0.000000
Feature ranked 1633 is (raining) with value 0.000000
Feature ranked 1634 is (videos) with value 0.000000
Feature ranked 1635 is (j) with value 0.000000
Feature ranked 1636 is (happens) with value 0.000000
Feature ranked 1637 is (lived) with value 0.000000
Feature ranked 1638 is (yours) with value 0.000000
Feature ranked 1639 is (cd) with value 0.000000
Feature ranked 1640 is (know) with value 0.000000
