# Import libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords #used in stopwords removal
stop_wrds=stopwords.words('english')
stop_wrds.remove('not')

from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter
from scipy import stats
import scipy.stats as ss

import warnings
warnings.filterwarnings("ignore")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

# Vectorization

In [2]:
class Vectorization():
    '''Creates TF-IDF vectors for a set of documents and prepares the X / y frames built from them.'''

    def create_TFIDF_vectors(self, documents, n_grams, maximum_df=1.0, minimum_df=1, maximum_features=2000):
        '''Fit a TfidfVectorizer on ``documents`` and return the scores together with the features.

        Parameters
        ----------
        documents : list of str
            The texts to vectorize.
        n_grams : tuple
            Passed to the vectorizer as ``ngram_range``.
        maximum_df, minimum_df : float or int
            Passed to the vectorizer as ``max_df`` / ``min_df``.
        maximum_features : int or None
            Passed to the vectorizer as ``max_features``; None means no limit.

        Returns
        -------
        tuple
            ``(tfidf_matrix, feature_index, features, feature_names, tfidf_df)`` where
            ``feature_index`` holds, per document, the column indices with a non-zero
            score, ``features`` holds the matching feature names, ``feature_names`` is
            the full vocabulary and ``tfidf_df`` is the dense score matrix as a DataFrame.
        '''
        # max_features=None already means "no limit", so one constructor call covers
        # both cases and max_df / min_df are honoured even without a feature cap.
        vectorizer = TfidfVectorizer(stop_words=stop_wrds, ngram_range=n_grams,
                                     max_df=maximum_df, min_df=minimum_df,
                                     max_features=maximum_features)
        tfidf_matrix = vectorizer.fit_transform(documents)

        # column indices of the non-zero scores of every document (row)
        feature_index = [tfidf_matrix[row, :].nonzero()[1] for row in range(tfidf_matrix.shape[0])]

        # get_feature_names() was removed from recent scikit-learn releases in favour
        # of get_feature_names_out(); support both so the notebook runs on either.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = list(vectorizer.get_feature_names_out())
        else:
            feature_names = vectorizer.get_feature_names()

        features = [[feature_names[column] for column in row] for row in feature_index]

        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

        return tfidf_matrix, feature_index, features, feature_names, tfidf_df


    def get_feature_data(self, data, n_grams, max_df, min_df, max_features):
        '''Return the TF-IDF scores of ``data['text']`` next to the review text and target class.

        ``data`` must provide the columns ``'text'`` and ``'sentiment_class'``.

        Returns
        -------
        tuple
            ``(feature_data_df, tfidf_df)``: ``feature_data_df`` has a ``'review'`` column,
            one column per TF-IDF feature and a final ``'class'`` column; ``tfidf_df``
            contains the TF-IDF columns only.
        '''
        classes = list(data['sentiment_class'])
        reviews = list(data['text'])

        tfidf_df = self.create_TFIDF_vectors(reviews, n_grams, max_df, min_df, max_features)[4]

        reviews_df = pd.DataFrame(reviews, columns=['review'])
        classes_df = pd.DataFrame(classes, columns=['class'])

        feature_data_df = pd.concat([reviews_df, tfidf_df, classes_df], axis=1)

        return feature_data_df, tfidf_df


    def split_X_and_y(self, tfidf_df, feature_data_df):
        '''Split the target class column from the columns of TF-IDF scores.

        Returns ``(X, y)`` where ``X`` is ``tfidf_df`` itself and ``y`` holds one
        indicator column per class value, named ``class_<value>``.
        '''
        y = pd.get_dummies(feature_data_df['class'], prefix='class')
        # defensive: never keep a bare 'class' column among the indicator columns
        if 'class' in y.columns.to_list():
            y = y.drop(columns=['class'])
        X = tfidf_df
        return X, y

# Odds Ratio

In [3]:
class OddsRatio():
    '''Computes, per feature, how much more frequent a class is among documents containing the feature.'''

    @staticmethod
    def _safe_ratio(numerator, denominator):
        '''Divide without raising: x/0 gives inf for x > 0, while 0/0 and NaN operands give NaN.'''
        if np.isnan(numerator) or np.isnan(denominator):
            return np.nan
        if denominator == 0:
            return np.nan if numerator == 0 else np.inf
        return numerator / denominator

    def find_odds_ratio(self, X, containing, target_class):
        '''For one target class, compute the ratio of every feature with respect to that class.

        Parameters
        ----------
        X : DataFrame
            One column per feature, one row per document.
        containing : dict
            Maps each feature name to the positional indices of the documents containing it.
        target_class : Series
            Indicator values of the class, 1 (or True) for documents of the class.

        Returns
        -------
        dict
            feature name -> ``(p/q) / (r/s)`` with
            p = documents of the class containing the feature,
            q = documents containing the feature,
            r = documents of the class not containing the feature,
            s = documents not containing the feature.
            A zero denominator yields ``inf`` (or ``nan`` for 0/0) instead of raising.
        '''
        target_values = target_class.tolist()
        total_documents = len(X)
        # loop-invariant: number of documents belonging to the target class
        total_in_class = target_values.count(1)

        odds_ratio_dict = {}
        for feature in list(X.columns):
            documents_with_feature = containing[feature]

            # `containing` stores positions, so index the plain list rather than the
            # Series (whose [] lookup is label based)
            p = sum(1 for i in documents_with_feature if target_values[i] == 1)
            q = len(documents_with_feature)
            r = total_in_class - p
            s = total_documents - q

            rate_with_feature = self._safe_ratio(p, q)
            rate_without_feature = self._safe_ratio(r, s)
            odds_ratio_dict[feature] = self._safe_ratio(rate_with_feature, rate_without_feature)

        return odds_ratio_dict

    def odds_ratio_for_all_classes(self, X, y, containing):
        '''Compute the ratios of all features for every class column of ``y``.

        Returns a DataFrame whose first column ``'feature_word'`` lists the feature
        names, followed by one column per column of ``y`` (same names, same order).
        '''
        columns = [X.columns.to_list()]
        for position in range(len(y.columns)):
            odds_ratio = self.find_odds_ratio(X, containing, y.iloc[:, position])
            columns.append(list(odds_ratio.values()))

        # build the frame once instead of growing it with concat inside the loop
        odds_ratio_df = pd.DataFrame(dict(enumerate(columns)))
        odds_ratio_df.columns = ['feature_word'] + y.columns.to_list()

        return odds_ratio_df

# Feature Selection methods

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

class Features():
    '''Different methods of feature selection.

    Each ``find*`` / ``*_selector`` method selects features for ONE target class column;
    each ``get*Features`` method runs the matching selector for every class column of
    ``y`` and returns a table with two columns per class: the selected feature names
    (``<class>``) and their corpus frequencies (``<class>_frequency``).
    '''

    def _features_with_frequencies(self, find_features, X, y, num_feats, frequency_dict):
        '''Run ``find_features`` for every class column of ``y`` and tabulate the results.

        ``find_features(X, y_column, num_feats)`` must return a tuple whose second
        item is the list of selected feature names. Columns of different length are
        padded with NaN.
        '''
        def as_column(values):
            # an explicit dtype keeps an empty selection as a (NaN padded) column
            return pd.Series(values, dtype=None if values else object)

        pieces = []
        columns_list = []
        for position, column in enumerate(y.columns.to_list()):
            selected = list(find_features(X, y.iloc[:, position], num_feats)[1])
            pieces.append(as_column(selected))
            pieces.append(as_column([frequency_dict[feature] for feature in selected]))
            columns_list.append(column)
            columns_list.append(column + '_frequency')

        if not pieces:
            return pd.DataFrame()

        features_df = pd.concat(pieces, axis=1)
        features_df.columns = columns_list
        return features_df

    # PEARSON CORRELATION
    def cor_selector(self, X, y,num_feats):
        '''Select the ``num_feats`` features with the highest Pearson correlation to ``y``.

        Returns
        -------
        tuple
            ``(cor_support, cor_feature, cor_list)``: a boolean per column of ``X``
            telling whether it was selected, the selected feature names ordered from
            highest to lowest correlation, and the correlation of every column
            (NaN, e.g. for constant columns, replaced by 0).
        '''
        feature_name = X.columns.tolist()

        cor_list = []
        # constant columns produce 0/0 inside corrcoef; the NaN is handled below
        with np.errstate(divide='ignore', invalid='ignore'):
            for column in feature_name:
                cor = np.corrcoef(X[column], y)[0, 1]
                cor_list.append(0 if np.isnan(cor) else cor)

        if num_feats > 0:
            top_positions = np.argsort(cor_list)[-num_feats:]
            cor_feature = X.iloc[:, top_positions].columns.tolist()[::-1]
        else:
            # [-0:] would slice the WHOLE list, i.e. select every feature
            cor_feature = []

        selected = set(cor_feature)
        cor_support = [name in selected for name in feature_name]

        return cor_support, cor_feature, cor_list

    def getPearsonCorrelationFeatures(self, X, y, num_feats,frequency_dict):
        '''Return the top ``num_feats`` Pearson features, with frequencies, for every class of ``y``.'''
        return self._features_with_frequencies(self.cor_selector, X, y, num_feats, frequency_dict)

    # CHI SQUARE FEATURES
    def getChiSquaredFeatures(self,X,y,no_features):
        '''Select the ``no_features`` best features for one target class by chi-square score.

        ``X`` is min-max scaled first. Returns ``(chi_support, chi_feature)``: the
        boolean selection mask and the selected feature names in column order.
        '''
        X_norm = MinMaxScaler().fit_transform(X)

        # k was hard-coded to 'all', which ignored no_features and selected everything
        chi_selector = SelectKBest(chi2, k=min(no_features, X.shape[1]))
        chi_selector.fit(X_norm, y)

        chi_support = chi_selector.get_support()
        chi_feature = X.loc[:,chi_support].columns.tolist()

        return chi_support, chi_feature

    def getChiSquaredCorrelationFeatures(self,X,y,num_feats,frequency_dict):
        '''Return the top ``num_feats`` chi-square features, with frequencies, for every class of ``y``.'''
        return self._features_with_frequencies(self.getChiSquaredFeatures, X, y, num_feats, frequency_dict)

    # RECURSIVE FEATURE ELIMINATION (RFE) FEATURES
    def findRFEFeatures(self,X,y,no_features):
        '''Select ``no_features`` features for one target class by Recursive Feature Elimination.

        Uses a LogisticRegression estimator on min-max scaled data and removes 100
        features per elimination step. Returns ``(rfe_support, rfe_feature)``.
        '''
        X_norm = MinMaxScaler().fit_transform(X)

        rfe_selector = RFE(estimator=LogisticRegression(),
                           n_features_to_select=min(no_features, X.shape[1]),
                           step=100, verbose=0)
        rfe_selector.fit(X_norm, y)
        rfe_support = rfe_selector.get_support()
        rfe_feature = X.loc[:,rfe_support].columns.tolist()

        return rfe_support, rfe_feature

    def getRFEFeatures(self,X,y,num_feats,frequency_dict):
        '''Return the RFE features, with frequencies, for every class of ``y``.'''
        return self._features_with_frequencies(self.findRFEFeatures, X, y, num_feats, frequency_dict)

    # LASSO REGRESSION (LR) FEATURES
    def findLRFeatures(self,X,y,no_features):
        '''Select at most ``no_features`` features for one target class from a logistic regression.

        Returns ``(embeded_lr_support, embeded_lr_feature)``.
        '''
        X_norm = MinMaxScaler().fit_transform(X)

        # NOTE(review): this method is labelled "lasso" throughout, but the model uses
        # an L2 penalty; lasso would be penalty="l1" with a solver that supports it.
        # Left unchanged because switching would alter the selected features - confirm intent.
        embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"),
                                              max_features=min(no_features, X.shape[1]))
        embeded_lr_selector.fit(X_norm, y)

        embeded_lr_support = embeded_lr_selector.get_support()
        embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()

        return embeded_lr_support, embeded_lr_feature

    def getLRFeatures(self,X,y,num_feats,frequency_dict):
        '''Return the logistic regression ("LR") features, with frequencies, for every class of ``y``.'''
        return self._features_with_frequencies(self.findLRFeatures, X, y, num_feats, frequency_dict)

    # RANDOM FOREST CLASSIFIER (RFC) FEATURES
    def findRFCFeatures(self,X,y,no_features):
        '''Select at most ``no_features`` features for one target class from a random forest.

        Returns ``(embeded_rf_support, embeded_rf_feature)``.
        '''
        # NOTE(review): no random_state is set, so the selection can differ between runs
        embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=10),
                                              max_features=min(no_features, X.shape[1]))
        embeded_rf_selector.fit(X, y)

        embeded_rf_support = embeded_rf_selector.get_support()
        embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()

        return embeded_rf_support, embeded_rf_feature

    def getRFCFeatures(self,X,y,num_feats,frequency_dict):
        '''Return the Random Forest Classifier (RFC) features, with frequencies, for every class of ``y``.'''
        return self._features_with_frequencies(self.findRFCFeatures, X, y, num_feats, frequency_dict)

# Feature Selection execution

In [5]:
class FeatureSelection():
    ''' this class finds features from data using different methods '''

    # (flag key in object_dict, attribute storing the result, Features method, output file)
    _SELECTION_METHODS = (
        ('pearson', 'pearson_features_df', 'getPearsonCorrelationFeatures', 'pearson_features.csv'),
        ('chi_square', 'chisquare_features_df', 'getChiSquaredCorrelationFeatures', 'chisquare_features.csv'),
        ('rfe', 'rfe_features_df', 'getRFEFeatures', 'rfe_features.csv'),
        ('lr', 'lr_features_df', 'getLRFeatures', 'lr_features.csv'),
        ('rfc', 'rfc_features_df', 'getRFCFeatures', 'rfc_features.csv'),
    )

    def __init__(self, object_dict):
        '''Vectorize the preprocessed data and run every enabled feature selection method.

        ``object_dict`` maps configuration names to STRING values. Keys read here:
        ``input_data_path``, ``output_data_path``, ``input_filename``, ``raw_data``,
        ``preprocessed_data`` (path of the preprocessed CSV), ``no_of_features``,
        ``n_gram_range``, ``max_df``, ``min_df``, ``max_features`` (Python literals as
        text) and the flags ``pearson``, ``chi_square``, ``rfe``, ``lr``, ``rfc``,
        ``odds_ratio`` (a method runs when its flag equals ``'True'``).

        Every result is stored on the instance and written as CSV into
        ``output_data_path``.

        Raises
        ------
        Exception
            If the preprocessed data file does not exist.
        '''
        # imported locally so the class does not rely on another cell importing them
        import ast
        import os

        self.input_data_path = object_dict['input_data_path']
        self.output_data_path = object_dict['output_data_path']

        self.input_filename = object_dict['input_filename']
        self.raw_data = object_dict['raw_data']

        # check BEFORE reading: read_csv would otherwise fail first with a less helpful error
        if not os.path.exists(object_dict['preprocessed_data']):
            raise Exception('------Perform preprocessing before performing Feature Selection!-----')

        self.preprocessed_data = pd.read_csv(object_dict['preprocessed_data'])

        self.number_of_features = int(object_dict['no_of_features'])
        self.n_gram_range = ast.literal_eval(object_dict['n_gram_range'])

        self.max_df = ast.literal_eval(object_dict['max_df'])
        self.min_df = ast.literal_eval(object_dict['min_df'])
        self.max_features = ast.literal_eval(object_dict['max_features'])

        vectorization = Vectorization()

        self.feature_data_df, self.tfidf_df = vectorization.get_feature_data(
            self.preprocessed_data, self.n_gram_range, self.max_df, self.min_df, self.max_features)

        # os.path.join gives the same file whether or not the path ends with a separator
        self.tfidf_df.to_csv(os.path.join(self.output_data_path, 'tfidf_scores.csv'), index=False)
        self.feature_data_df.to_csv(os.path.join(self.output_data_path, 'feature_data.csv'), index=False)

        self.X, self.y = vectorization.split_X_and_y(self.tfidf_df, self.feature_data_df)
        print(len(self.X.columns),self.max_features)

        self.indices = self.store_indices(self.X)
        self.frequency_dict = self.count_frequency(self.indices,object_dict)

        # max_features may legitimately be None (no limit), which int() cannot convert
        if self.max_features is not None and len(self.X.columns) < int(self.max_features):
            print('error: only %s features were extracted, fewer than max_features=%s'
                  % (len(self.X.columns), self.max_features))

        features = Features()
        for flag, attribute, method_name, filename in self._SELECTION_METHODS:
            if object_dict[flag] == 'True':
                selected_df = getattr(features, method_name)(
                    self.X, self.y, self.number_of_features, self.frequency_dict)
                setattr(self, attribute, selected_df)
                selected_df.to_csv(os.path.join(self.output_data_path, filename), index=False)

        if object_dict['odds_ratio']=='True':
            odds_ratio = OddsRatio()
            self.odds_ratio_df = odds_ratio.odds_ratio_for_all_classes(self.X, self.y, self.indices)
            self.odds_ratio_df.to_csv(os.path.join(self.output_data_path, 'odds_ratio.csv'), index=False)

    def store_indices(self,X):
        ''' this method finds the indices of documents containing each feature
            eg - indices['beautiful'] = [2,43,87,92]

            A document "contains" a feature when its value in that column is non-zero.
        '''
        indices = {}
        for position, feature in enumerate(list(X.columns)):
            values = list(X.iloc[:, position])
            indices[feature] = [row for row, value in enumerate(values) if value != 0]

        return indices

    def count_frequency(self,containing,obj_dict):
        ''' this method counts the occurences of each feature in the corpus

        Parameters
        ----------
        containing : dict
            feature -> positional indices of the documents containing it
            (the output of ``store_indices``).
        obj_dict : dict
            Configuration; ``output_data_path`` and ``raw_data`` name the output file,
            ``preprocessed_data`` is read when the instance has no data loaded yet.

        Returns
        -------
        dict
            feature -> number of occurrences, counted with ``str.count`` (i.e.
            non-overlapping SUBSTRING matches) over the documents listed for it.

        The counts are also written to ``<raw_data stem>_word_frequency.csv``.
        '''
        import os

        # reuse the frame loaded by __init__ instead of reading the CSV a second time
        preprocessed = getattr(self, 'preprocessed_data', None)
        if preprocessed is None:
            preprocessed = pd.read_csv(obj_dict['preprocessed_data'])
        reviews = list(preprocessed['text'])

        # the keys of `containing` are the feature names, so tfidf_scores.csv need not be re-read
        counts_dict = {}
        for feature, document_indices in containing.items():
            counts_dict[feature] = sum(reviews[j].count(feature) for j in document_indices)

        frequency_df = pd.DataFrame({'word': list(counts_dict.keys()),
                                     'frequency': list(counts_dict.values())})
        output_name = obj_dict['raw_data'].split('.')[0] + '_word_frequency.csv'
        frequency_df.to_csv(os.path.join(obj_dict['output_data_path'], output_name), index=False)

        return counts_dict

# Create Configuration files

In [6]:
%run ../conf/FeatureSelectionConfiguration.py 
# Running the configuration script creates the .ini file for feature selection.

# Main function

In [7]:
if __name__ == '__main__':
    # Reads the two configuration files, collects the settings in `object_dict`
    # and runs the feature selection; results are written to the output directory.
    import os
    from configparser import ConfigParser

    feature_selection_ini = '../conf/feature_selection.ini'
    collocations_ini = '../conf/Collocations.ini'

    config = ConfigParser()   # feature selection configuration
    config2 = ConfigParser()  # collocations configuration (provides the raw data file name)

    # ConfigParser.read() silently skips files it cannot open and returns the list of
    # files it did read; fail here rather than with an opaque KeyError further down.
    if not config.read(feature_selection_ini):
        raise FileNotFoundError('Configuration file not found: ' + feature_selection_ini)
    if not config2.read(collocations_ini):
        raise FileNotFoundError('Configuration file not found: ' + collocations_ini)

    settings = config['FeatureSelection']

    #read the values from configuration and store in a dictionary for further usage
    object_dict = dict()

    object_dict['input_data_path'] = settings['input_data_path']
    object_dict['output_data_path'] = settings['output_data_path']

    object_dict['input_filename'] = settings['input_filename']
    object_dict['raw_data'] = config2['Collocations']['input_filename']

    # the preprocessed file is named after the raw data file: <stem>_PreProcessed.csv
    object_dict['preprocessed_data'] = object_dict['input_data_path'] + object_dict['raw_data'].split('.')[0] + '_PreProcessed.csv'

    object_dict['no_of_features'] = settings['number_of_features']

    # (key in object_dict, option name in the [FeatureSelection] section)
    option_names = (
        ('n_gram_range', 'n_gram_range'),
        ('idf_weighing', 'idf_weighing'),
        ('max_df', 'max_df'),
        ('min_df', 'min_df'),
        ('max_features', 'max_features'),
        ('pearson', 'pearson_correlation'),
        ('chi_square', 'chi_square_correlation'),
        ('rfe', 'recursive_feature_elimination'),
        ('lr', 'lasso_regression'),
        ('rfc', 'random_forest_classifier'),
        ('odds_ratio', 'odds_ratio'),
    )
    for key, option in option_names:
        object_dict[key] = settings[option]

    print(object_dict)

    # makedirs also creates missing parent directories, tolerates an existing
    # directory and does not depend on the path ending with a separator
    os.makedirs(object_dict['output_data_path'], exist_ok=True)

    result = FeatureSelection(object_dict) #perform feature selection

{'input_data_path': '../results/PreProcessing_Results/', 'output_data_path': '../results/FeatureSelection_Results/', 'input_filename': '_PreProcessed.csv', 'raw_data': 'IMDBtrain.csv', 'preprocessed_data': '../results/PreProcessing_Results/IMDBtrain_PreProcessed.csv', 'no_of_features': '200', 'n_gram_range': '(1, 2)', 'idf_weighing': 'False', 'max_df': '200', 'min_df': '1', 'max_features': '200', 'pearson': 'True', 'chi_square': 'True', 'rfe': 'True', 'lr': 'True', 'rfc': 'True', 'odds_ratio': 'True'}
200 200
