## Feature selection with text data

#### Import packages

In [5]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
import shutil

#### Read data

In [6]:
df = pd.read_csv("C:\\Users\\erinb\\OneDrive\\Documents\\Georgetown\\DSAN 5000\\dsan-5000-project-ebevec\\data\\01-modified-data\\data_clean2.csv")
txt = pd.read_csv('C:\\Users\\erinb\\OneDrive\\Documents\\Georgetown\\DSAN 5000\\dsan-5000-project-ebevec\\data\\01-modified-data\\data_syria_switzerland_clean.csv')

#### Reformat data 

In [7]:
# CONVERT Y TO NUMPY ARRAY
txt2=np.array(txt)

#### Vectorize text and reformat

In [8]:
# PARAMETERS TO CONTROL SIZE OF FEATURE SPACE WITH COUNT-VECTORIZER
# minDF = 0.01 means "ignore terms that appear in less than 1% of the documents". 
# minDF = 5 means "ignore terms that appear in less than 5 documents".
# max_features=int, default=None
#   If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

from sklearn.feature_extraction.text import CountVectorizer

def vectorize(corpus,MAX_FEATURES):
    vectorizer=CountVectorizer(max_features=MAX_FEATURES,stop_words="english")   
    # RUN COUNT VECTORIZER ON OUR COURPUS 
    Xs  =  vectorizer.fit_transform(corpus)   
    X=np.array(Xs.todense())
    #CONVERT TO ONE-HOT VECTORS (can also be done with binary=true in CountVectorizer)
    maxs=np.max(X,axis=0)
    return (np.ceil(X/maxs),vectorizer.vocabulary_)

(x,vocab0)=vectorize(txt,MAX_FEATURES=10000)

In [9]:
#swap keys and values (value --> ley)
vocab1 = dict([(value, key) for key, value in vocab0.items()])

In [11]:
#RE-ORDER COLUMN SO IT IS SORTED FROM HIGH FREQ TERMS TO LOW 
# https://stackoverflow.com/questions/60758625/sort-pandas-dataframe-by-sum-of-columns
txt3=pd.DataFrame(x)
s = txt3.sum(axis=0)
txt3=txt3[s.sort_values(ascending=False).index[:]]
print(txt3.head())

     0
0  1.0
1  0.0
2  0.0


#### Split data

Use index based methods (because x matrix will be re-defined repeatedly but want same rows to be used each time)

In [12]:
import random
N=x.shape[0]
l = [*range(N)]     # indices
cut = int(0.8 * N) #80% of the list
random.shuffle(l)   # randomize
train_index = l[:cut] # first 80% of shuffled list
test_index = l[cut:] # last 20% of shuffled list

print(train_index[0:10])
print(test_index[0:10])

[0, 2]
[1]


#### Training function

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import time

def train_MNB_model(X,Y,i_print=False):

    if(i_print):
        print(X.shape,Y.shape)

    #SPLIT
    x_train=X[train_index]
    y_train=Y[train_index].flatten()

    x_test=X[test_index]
    y_test=Y[test_index].flatten()

    # INITIALIZE MODEL 
    model = MultinomialNB()

    # TRAIN MODEL 
    start = time.process_time()
    model.fit(x_train,y_train)
    time_train=time.process_time() - start

    # LABEL PREDICTIONS FOR TRAINING AND TEST SET 
    start = time.process_time()
    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)
    time_eval=time.process_time() - start

    acc_train= accuracy_score(y_train, yp_train)*100
    acc_test= accuracy_score(y_test, yp_test)*100

    if(i_print):
        print(acc_train,acc_test,time_train,time_eval)

    return (acc_train,acc_test,time_train,time_eval)


#TEST
print(type(x),type(txt2))
print(x.shape,txt2.shape)
(acc_train,acc_test,time_train,time_eval)=train_MNB_model(x,txt2,i_print=True)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(3, 1) (100, 3)
(3, 1) (100, 3)


ValueError: Found input variables with inconsistent numbers of samples: [2, 6]

#### Search-1: Remove features from high to low

In [216]:
##UTILITY FUNCTION TO INITIALIZE RELEVANT ARRAYS
def initialize_arrays():
    global num_features,train_accuracies
    global test_accuracies,train_time,eval_time
    num_features=[]
    train_accuracies=[]
    test_accuracies=[]
    train_time=[]
    eval_time=[]

In [217]:
# INITIALIZE ARRAYS
initialize_arrays()

# DEFINE SEARCH FUNCTION
def partial_grid_search(num_runs, min_index, max_index):
    for i in range(1, num_runs+1):
        # SUBSET FEATURES 
        upper_index=min_index+i*int((max_index-min_index)/num_runs)
        xtmp=x[:,0:upper_index]

        #TRAIN 
        (acc_train,acc_test,time_train,time_eval)=train_MNB_model(xtmp,y,i_print=False)

        if(i%5==0):
            print(i,upper_index,xtmp.shape[1],acc_train,acc_test)
            
        #RECORD 
        num_features.append(xtmp.shape[1])
        train_accuracies.append(acc_train)
        test_accuracies.append(acc_test)
        train_time.append(time_train)
        eval_time.append(time_eval)

# DENSE SEARCH (SMALL NUMBER OF FEATURES (FAST))
partial_grid_search(num_runs=100, min_index=0, max_index=1000)

# SPARSE SEARCH (LARGE NUMBER OF FEATURES (SLOWER))
partial_grid_search(num_runs=20, min_index=1000, max_index=10000)

5 50 50 69.085 68.52000000000001
10 100 100 72.7825 73.31
15 150 150 75.1475 74.78
20 200 200 77.4 77.03
25 250 250 79.5 79.09
30 300 300 80.28 80.05
35 350 350 80.74 80.36
40 400 400 81.1675 80.78999999999999
45 450 450 81.645 81.21000000000001
50 500 500 81.855 81.17
55 550 550 82.17750000000001 81.66
60 600 600 82.3925 81.82000000000001
65 650 650 82.8775 82.05
70 700 700 83.0475 82.26
75 750 750 83.2975 82.39
80 800 800 83.3475 82.37
85 850 850 83.34 82.61
90 900 900 83.6275 82.53
95 950 950 83.855 82.76
100 1000 1000 84.0225 82.84
5 3250 3250 85.8075 84.44
10 5500 5500 86.175 84.35000000000001
15 7750 7750 86.3725 84.65
20 10000 10000 86.7675 84.50999999999999
