# Text Pre Process Pipeline

## A quick example that can be easily applied to DataFrames in machine learning problems, using Python 3

In [39]:
# Import necessary packages

import numpy as np
import pandas as pd
from scipy import sparse as sp

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [40]:
# Import the Natural Language Toolkit (NLTK) example data and its 'stopwords' set

from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

### Get the _Movie Reviews_ Data into a Pandas DataFrame

In [41]:
# Build one (raw_review_text, category) pair per file in the corpus,
# walking the categories in the order the corpus reader reports them.
docs = [
    (str(movie_reviews.raw(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [42]:
# Each entry of `docs` is a (text, category) pair: 'X' holds the review text,
# 'y' the category label.
reviews = pd.DataFrame(docs, columns=['X', 'y'])

# The category of a movie review is initially 'neg' or 'pos'; it is replaced
# here by the integer codes 0 and 1, respectively.
bin_encoder = LabelEncoder()
reviews['y'] = bin_encoder.fit_transform(reviews['y'])

In [43]:
# Preview the first five reviews (head() shows five rows by default)
reviews.head()

Unnamed: 0,X,y
0,"plot : two teen couples go to a church party ,...",0
1,the happy bastard's quick movie review \ndamn ...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' firs...",0
4,synopsis : a mentally unstable man undergoing ...,0


### Edit the _stopwords_ to include more words
#### (This is useful for filtering out words you don't want included but that are not necessarily in the default 'stopwords' list)

In [44]:
# Extra tokens to filter out in addition to NLTK's stopword lists.
# The sixth entry is the empty string (the original spelled it as an empty
# triple-quoted literal).
custom_stopwords = ('the','an','a','my','0','','!','nt','?','??','?!','%','&','UTC','(UTC)')

# Previously `custom_stopwords` was defined but never used, so the vectorizer
# only ever saw NLTK's list. Merge the custom entries in so they take effect.
# They are lower-cased because the CountVectorizer in the pipeline is built
# with lowercase=True, so an upper-case entry such as 'UTC' could never match.
# NOTE(review): stopwords.words() is called without a language, which appears
# to pull the stopword lists of every language NLTK ships -- confirm whether
# stopwords.words('english') was intended for this English-only corpus.
mystopwords = stopwords.words() + [word.lower() for word in custom_stopwords]

## Text Pre Process Pipeline

+ **'count vectorizer'**: Converts the text to lower case, removes stopwords, and vectorizes the remaining words into counts
+ **'chi2score'**: Selects the top k features most related to the target, based on chi-squared test statistics
+ **'tf_transformer'**: Transforms the vector of top features into its tf-idf representation

In [45]:
# Number of features kept by the chi-squared selection step
top_k_features = 1000

# The steps run in the order listed: word counts -> top-k chi2 selection -> tf-idf.
# The step names are the keys used later to look steps up via `named_steps`.
pipeline_steps = [
    ('count vectorizer', CountVectorizer(lowercase=True, stop_words=mystopwords)),
    ('chi2score', SelectKBest(score_func=chi2, k=top_k_features)),
    ('tf_transformer', TfidfTransformer(use_idf=True)),
]

text_processor = Pipeline(steps=pipeline_steps)

### fit_transform versus fit versus transform

In [46]:
# fit_transform() fits every step of the pipeline on the reviews and returns
# the transformed (tf-idf) matrix in a single pass.
proc_text = text_processor.fit_transform(reviews.X, reviews.y)

# fit() returns the pipeline object itself, so calling it again here would
# only refit every step on the same data a second time. The pipeline is
# already fitted by the call above, so reuse it directly.
proc_fit = text_processor

In [47]:
# The tf-idf values for the words of the first review that are among the
# top 1000 features, printed in sparse matrix format
first_review_tfidf = proc_text[0]
print(first_review_tfidf)

  (0, 809)	0.164433165993
  (0, 991)	0.0743296334105
  (0, 432)	0.106734979944
  (0, 648)	0.114679040775
  (0, 279)	0.0985524595764
  (0, 232)	0.100782880888
  (0, 656)	0.122673219235
  (0, 426)	0.142920192024
  (0, 625)	0.0664163951289
  (0, 80)	0.137863552511
  (0, 31)	0.0939876878729
  (0, 24)	0.0791696955562
  (0, 263)	0.139340845051
  (0, 616)	0.0971066083855
  (0, 796)	0.0946902435652
  (0, 46)	0.230092626547
  (0, 242)	0.0938884983857
  (0, 890)	0.205563664868
  (0, 923)	0.156071440997
  (0, 26)	0.085197028909
  (0, 395)	0.226496827186
  (0, 844)	0.138632543895
  (0, 785)	0.17581975964
  (0, 98)	0.0867664708053
  (0, 960)	0.170893610946
  (0, 281)	0.206862839258
  (0, 591)	0.164520445882
  (0, 321)	0.0988950410533
  (0, 94)	0.0719116061705
  (0, 382)	0.0924343944618
  (0, 318)	0.230689039315
  (0, 654)	0.208724455723
  (0, 985)	0.149218486031
  (0, 634)	0.137504053122
  (0, 708)	0.19060843773
  (0, 966)	0.0548912225715
  (0, 886)	0.155459309567
  (0, 584)	0.122673219235
  (0, 57

The original words that ended up among the final 1000 features for a particular review can still be recovered with the following two steps:
+ Find the original vocabulary indices of the top 1000 features returned by the 'chi2score' transformation
+ Find the 'feature names', i.e. the words from the original text, at those indices

In [48]:
# proc_fit.named_steps['chi2score'].get_support(indices=True)

In [49]:
# Indices (into the count vectorizer's vocabulary) of the features kept by
# the chi2 selection step; position 616 is one of the columns that appears
# in the first review's tf-idf row.
selected_indices = proc_fit.named_steps['chi2score'].get_support(indices=True)
selected_indices[616]

23078

In [50]:
# Look up the word behind vocabulary index 23078 (the index obtained from the
# chi2 selection step for column 616).
vectorizer = proc_fit.named_steps['count vectorizer']

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_names[23078]

'music'

In [51]:
# Show the raw text of the first review (row 0 of the first column, 'X')
first_review_text = reviews.iloc[0, 0]
print(first_review_text)

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is that it's simply too jumbled . 
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no id

#### Old Pipeline 1

In [None]:
# NOTE(review): legacy cell, never executed in this notebook. X_res, y_res,
# X, y and X_test are not defined in any visible cell, so it presumably relied
# on state from another session -- confirm before running.

# Call chi2 directly on (X_res, y_res); the return value is not stored.
chi2(X_res,y_res)

# Number of top-scoring features to keep
k=1000

# Selector that keeps the k features with the best chi2 score
ch2_score = SelectKBest(chi2, k=k)

# Fit the selector on (X, y) and keep what fit() returns
toxic_feature_tran = ch2_score.fit(X,y)

# Fit the same selector on (X, y) again and get the reduced training matrix
X_train_k = ch2_score.fit_transform(X, y)

# Apply the fitted selection to X_test
X_test_k = ch2_score.transform(X_test)

#### Old Pipeline 2

In [22]:
# Step 1: turn the raw texts in X into a matrix of word counts
# (lower-cased, stopwords removed).
count_vect = CountVectorizer(lowercase=True, stop_words=mystopwords)
X_train_counts = count_vect.fit_transform(X)

# Step 2: fit the tf-idf transformer on those counts, then use it to
# reweight the very same counts.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)