### Summary:

In this notebook, we generate features by tokenizing and transforming the review text. All other features are created in the "Additional Features" Notebook

In [59]:
import os

# Packages for Data Manipulation
import pandas as pd
import numpy as np
from scipy import sparse

# Packages for Text Processing
from sklearn.feature_extraction import text as tx
from nltk import word_tokenize
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
import enchant
import spacy

# Plotting
import plotly.express as px
import matplotlib.pyplot as plt


In [65]:
plt.style.use('seaborn-darkgrid')
%matplotlib inline

# Use Enchant English Dictionary
d = enchant.Dict("en_US")

# Load Spacy English Language Model (Note, you need to install Spacy models in addition to the package)
# sp = spacy.load('en_core_web_sm')

In [5]:
# Raw Data Import
df_trn = pd.read_csv(os.path.join('..', 'data', 'train_raw.csv'), parse_dates=['date'], index_col=['ex_id'])
df_val = pd.read_csv(os.path.join('..', 'data', 'dev_raw.csv'), parse_dates=['date'], index_col=['ex_id'])

### CountVectorizer

In [6]:
# Train Count Vectorizer
start = pd.Timestamp.now()
count_trans = tx.CountVectorizer(strip_accents='unicode').fit(df_trn['review'])
end = pd.Timestamp.now() 
print((end - start)/pd.Timedelta('1s'))

101.004956


In [7]:
start = pd.Timestamp.now()
X_count_trn = count_trans.transform(df_trn['review'])
X_count_val = count_trans.transform(df_val['review'])
end = pd.Timestamp.now() 
print((end - start)/pd.Timedelta('1s'))

115.271968


### CountVectorizer + Lemmatizer

In [8]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    

In [9]:
start = pd.Timestamp.now()
count_lem_trans = tx.CountVectorizer(tokenizer=LemmaTokenizer(), strip_accents='unicode').fit(df_trn['review'])
end = pd.Timestamp.now() 
print((end - start)/pd.Timedelta('1m'))


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



18.341669766666666


In [10]:
start = pd.Timestamp.now()
X_count_lem_trn = count_lem_trans.transform(df_trn['review'])
X_count_lem_val = count_lem_trans.transform(df_val['review'])
end = pd.Timestamp.now() 
print((end - start)/pd.Timedelta('1m'))

20.6067006


In [13]:
X_count_lem_trn.shape

(250874, 182746)

In [None]:
sparse.save_npz('wordcount_train', X_count_lem_trn)
sparse.save_npz('wordcount_valid', X_count_lem_val)


#### Convert to TFIDF and export

In [16]:
tfidf_trans = tx.TfidfTransformer().fit(X_count_lem_trn)

In [None]:
X_tfidf_trn = tfidf_trans.transform(X_count_lem_trn)
X_tfidf_val = tfidf_trans.transform(X_count_lem_val)

In [None]:
sparse.save_npz('tfidfnorm_train', X_tfidf_trn)
sparse.save_npz('tfidfnorm_valid', X_tfidf_val)


In [None]:
tfidf_trans = tx.TfidfTransformer(norm=None).fit(X_count_lem_trn)

In [None]:
X_tfidf_trn = tfidf_trans.transform(X_count_lem_trn)
X_tfidf_val = tfidf_trans.transform(X_count_lem_val)

In [None]:
sparse.save_npz('tfidfraw_train', X_count_lem_trn)
sparse.save_npz('tfidfraw_valid', X_count_lem_val)

### TF-IDF + Lemmatizer + Remove words w/ 2 or fewer occurences + Remove English Stop Words

#### First remove infrequent words from the CountVectorizer that we trained earlier

In [None]:
word_count = X_count_lem_trn.sum(axis=0)

In [24]:
pd.DataFrame(word_count.T).iloc[:,0].value_counts()

1        110049
2         20169
3          9176
4          5599
5          3796
          ...  
4168          1
30917         1
12492         1
4552          1
10245         1
Name: 0, Length: 2628, dtype: int64

In [43]:
# Remove words with 1 or 2
s_keep = pd.DataFrame(word_count.T, index=count_lem_trans.get_feature_names())

In [44]:
len(s_keep)

182746

In [52]:
s_keep = s_keep[s_keep>=2].dropna()

In [72]:
len(s_keep)

72697

#### Now Remove English Stop Words

In [70]:
s_keep_idx = [x for x in s_keep.index if x not in set(stopwords.words('english'))]

In [71]:
len(s_keep_idx)/len(s_keep)

0.998046686933436

In [74]:
s_keep = s_keep.loc[s_keep_idx]

In [75]:
len(s_keep)

72555

In [76]:
keep_idx = [count_lem_trans.vocabulary_[x] for x in s_keep.index]

In [78]:
X_trn = X_count_lem_trn[:, keep_idx]
X_val = X_count_lem_val[:, keep_idx]

#### Convert Word Count to TFIDF

In [79]:
tfidf_trans = tx.TfidfTransformer().fit(X_trn)

In [80]:
X_tfidf_trn = tfidf_trans.transform(X_trn)
X_tfidf_val = tfidf_trans.transform(X_val)

In [81]:
sparse.save_npz('tfidfnorm_trim_train', X_tfidf_trn)
sparse.save_npz('tfidfnorm_trim_valid', X_tfidf_val)


In [82]:
tfidf_trans = tx.TfidfTransformer(norm=None).fit(X_trn)
X_tfidf_trn = tfidf_trans.transform(X_trn)
X_tfidf_val = tfidf_trans.transform(X_val)
sparse.save_npz('tfidfraw_trim_train', X_tfidf_trn)
sparse.save_npz('tfidfraw_trim_valid', X_tfidf_val)