###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

Applies the Mutual Information (MI) ranking criterion to the text emotions dataset

In [1]:
import pandas as pd
import re
import time
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from utils.vectorizer import Vectorizer
from utils.mutual import MutualInfo
from utils.reports import report_feature_ranking

In [2]:
# Importing the dataset
dataset = pd.read_csv('E:/datasets/classification/text_emotions/train_data.csv')
corpus = dataset.iloc[:,1].values
responses = dataset.iloc[:,0].values
print(np.unique(responses))
lc = LabelEncoder()
responses = lc.fit_transform(responses)
min_freq = 20
max_freq = 10000
# Conditioning text and generates dictionary of word
dictionary = dict()
for i in range(len(corpus)):
    corpus[i] = corpus[i].lower()
    corpus[i] = re.sub(r'[^a-z\'\s]',' ',corpus[i])
    corpus[i] = re.sub(r'[^a-z]+[\']|[\'][^a-z]+',' ',corpus[i])

['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']


In [3]:
def extract_features_custom (corpus):
    start = time.perf_counter()
    # ------------------------------------------------------
    cv = Vectorizer(corpus, min_freq=min_freq, max_freq=max_freq)
    X = cv.fit()
    # ------------------------------------------------------   
    end = time.perf_counter()
    print(end - start, ' seconds')
    # Converts matrix to nparray
    X = X.toarray()
    # Outputs validation
    cv.output_validation()
    return X,cv

In [4]:
# Extracts the features using built-in funciton in sklearn
def extract_features_sklearn (corpus):
    start = time.perf_counter()
    # ------------------------------------------------------
    cv = CountVectorizer(min_df=min_freq + 1, max_df=max_freq - 1)
    X = cv.fit_transform(corpus)
    # ------------------------------------------------------   
    end = time.perf_counter()
    print(end - start, ' seconds')
    # Converts matrix to nparray
    X = X.toarray()
    # Verifies data matrix
    print(sum(X[0,:]))
    print(corpus[0])
    return X,cv

In [5]:
# Split dataset into train and test sets
predictors,cv = extract_features_sklearn(corpus)
print(predictors.shape)
predictors,cv = extract_features_custom(corpus)
print(predictors.shape)
predictors_filtered = predictors
#predictors_filtered[predictors_filtered > 1] = 1
X, X_holdout, y, y_holdout = train_test_split(predictors_filtered,responses,test_size = 0.20, stratify=responses)

0.41588860000000016  seconds
11
 tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part   
(30000, 1579)
0.3711934000000001  seconds
1476
item: crash in qmbol
column: [1012   89]
data: [1 1]
crash
in
(30000, 1640)


In [6]:
# Computes MI with custom implementation
start = time.perf_counter()
mi = MutualInfo(X,y,n_jobs=4)
mi.compute()
end = time.perf_counter()
print(end - start, ' seconds')

Using parallel version
3.0801088000000005  seconds


In [7]:
# Reports result
report_feature_ranking(mi.info, cv.words, 20)

Feature ranked 1 is 'love' with value 2.85E-02
Feature ranked 2 is 'hate' with value 1.94E-02
Feature ranked 3 is 'sad' with value 1.92E-02
Feature ranked 4 is 'thanks' with value 1.11E-02
Feature ranked 5 is 'happy' with value 1.07E-02
Feature ranked 6 is 'miss' with value 9.71E-03
Feature ranked 7 is 'good' with value 8.71E-03
Feature ranked 8 is 'my' with value 8.63E-03
Feature ranked 9 is 'sorry' with value 7.60E-03
Feature ranked 10 is 'great' with value 7.52E-03
.
.
.

Feature ranked 1631 is 'sunday' with value 1.59E-04
Feature ranked 1632 is 'training' with value 1.58E-04
Feature ranked 1633 is 'street' with value 1.55E-04
Feature ranked 1634 is 'fight' with value 1.51E-04
Feature ranked 1635 is 'men' with value 1.47E-04
Feature ranked 1636 is 'six' with value 1.37E-04
Feature ranked 1637 is 'bc' with value 1.30E-04
Feature ranked 1638 is 'gd' with value 1.20E-04
Feature ranked 1639 is 'wanting' with value 1.10E-04
Feature ranked 1640 is 'sent' with value 8.10E-05


In [8]:
# Computes MI with sklearn implementation
start = time.perf_counter()
mi = mutual_info_classif(X,y)
end = time.perf_counter()
print('Elpased time:', end-start)

Elpased time: 335.9162398


In [9]:
# Reports result
report_feature_ranking(mi, cv.words, 20)

Feature ranked 1 is 'miss' with value 1.79E-02
Feature ranked 2 is 'sad' with value 1.75E-02
Feature ranked 3 is 'often' with value 1.31E-02
Feature ranked 4 is 'america' with value 1.20E-02
Feature ranked 5 is 'x' with value 1.20E-02
Feature ranked 6 is 'not' with value 1.16E-02
Feature ranked 7 is 'moment' with value 1.16E-02
Feature ranked 8 is 'thank' with value 1.15E-02
Feature ranked 9 is 'neither' with value 1.14E-02
Feature ranked 10 is 'facebook' with value 1.13E-02
.
.
.

Feature ranked 1631 is 'staying' with value 0.00E+00
Feature ranked 1632 is 'word' with value 0.00E+00
Feature ranked 1633 is 'somebody' with value 0.00E+00
Feature ranked 1634 is 'smh' with value 0.00E+00
Feature ranked 1635 is 'coming' with value 0.00E+00
Feature ranked 1636 is 'ddlovato' with value 0.00E+00
Feature ranked 1637 is 'library' with value 0.00E+00
Feature ranked 1638 is 'stopped' with value 0.00E+00
Feature ranked 1639 is 'huh' with value 0.00E+00
Feature ranked 1640 is 'know' with value 0.00E