In [1]:
##### manipulating text data

# Text Preprocessing

In [75]:
import numpy as np
import pandas as pd

In [1]:
# text
text = """Pirates of The Caribbean is quite simply Hollywood's best pirate 
film in ages; a funny, rollicking swashbuckler that pays homage to the great 
films of the 1930's and 1940's featuring the likes of Errol Flynn, Charles 
Laughton, among others."""
text

"Pirates of The Caribbean is quite simply Hollywood's best pirate \nfilm in ages; a funny, rollicking swashbuckler that pays homage to the great \nfilms of the 1930's and 1940's featuring the likes of Errol Flynn, Charles \nLaughton, among others."

In [77]:
# lower case
text= text.lower()
# remove digits  % punctuation
import re
import string
pattern1=string.digits
pattern2=string.punctuation
regex=re.compile(r"[%s%s]" % (pattern1,pattern2))
text=regex.sub(' ',text)
# replace one or more white-space characters with a space

regex=re.compile(r"\s+")
text=regex.sub(' ', text)
# remove stop words
import nltk
from nltk.corpus import stopwords
sw=stopwords.words('english')
text=' '.join([w for w in text.split() if w not in sw])

text

'pirates caribbean quite simply hollywood best pirate film ages funny rollicking swashbuckler pays homage great films featuring likes errol flynn charles laughton among others'

In [78]:
# remove short words
text=' '.join([w for w in text.split() if len(w)>4])
text

'pirates caribbean quite simply hollywood pirate funny rollicking swashbuckler homage great films featuring likes errol flynn charles laughton among others'

In [79]:
# retain only unique words
text=' '.join(set(text.split()))
text

'rollicking pirate flynn homage caribbean errol quite hollywood pirates others films funny swashbuckler among likes charles laughton great featuring simply'

In [80]:
# stemming （word root）
# https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.porter
from nltk.stem.porter import PorterStemmer
' '.join([(PorterStemmer()).stem(w) for w in text.split()]) 

#ps = PorterStemmer()
#ps.stem("wolves")

'rollick pirat flynn homag caribbean errol quit hollywood pirat other film funni swashbuckl among like charl laughton great featur simpli'

In [50]:
# lemmatization attempts to get the word root through vocabulary and morphological analysis
# https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.wordnet
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
' '.join([(WordNetLemmatizer()).lemmatize(w) for w in text.split()]) 
# lem = WordNetLemmatizer()
# lem.lemmatize("wolves")

'pirate caribbean quite simply hollywood best pirate film age funny rollicking swashbuckler pay homage great film featuring like errol flynn charles laughton among others'

# Test Vectorization

In [83]:
corpus = ["This is a brown house. This house is big. The street number is 1.",
          "This is a small house. This house has 1 bedroom. The street number is 12.",
          "This dog is brown. This dog likes to play.",
          "The dog is in the bedroom."]


In [89]:
# Word Occurrence 
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(lowercase=True, stop_words='english',
                   min_df=1, max_df=1.0,max_features=None,ngram_range=(1,1))

# max_df (default 1.0): When building the vocabulary ignore terms that have a document frequency 
# strictly higher than the given threshold (corpus-specific stop words). 
# (min: lower than the given threshold.)

df=pd.DataFrame(cv.fit_transform(corpus).toarray(),columns=cv.get_feature_names())
print(cv.vocabulary_)
print(cv.stop_words_)
df

{'brown': 3, 'house': 5, 'big': 2, 'street': 10, 'number': 7, 'small': 9, 'bedroom': 1, '12': 0, 'dog': 4, 'likes': 6, 'play': 8}
set()


Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0,0,1,1,0,2,0,1,0,0,1
1,1,1,0,0,0,2,0,1,0,1,1
2,0,0,0,1,2,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,0,0,0


# Normalized Word Occurrence (TF-IDF) 

In [94]:

from sklearn.feature_extraction.text import TfidfVectorizer

tv=TfidfVectorizer(use_idf=False, norm='l2',lowercase=True,stop_words='english',
                 min_df=1, max_df=1.0, max_features=None,ngram_range=(1,1))

df=pd.DataFrame(tv.fit_transform(corpus).toarray(),columns=tv.get_feature_names())
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.353553,0.353553,0.0,0.707107,0.0,0.353553,0.0,0.0,0.353553
1,0.333333,0.333333,0.0,0.0,0.0,0.666667,0.0,0.333333,0.0,0.333333,0.333333
2,0.0,0.0,0.0,0.377964,0.755929,0.0,0.377964,0.0,0.377964,0.0,0.0
3,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
# Tf-Idf with smooth (1+N(t))

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    use_idf=True, smooth_idf=True, norm='l2',
    lowercase=True, 
    stop_words='english', 
    min_df=1, max_df=1.0, max_features=None, 
    ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names())
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.432291,0.340823,0.0,0.681647,0.0,0.340823,0.0,0.0,0.340823
1,0.396802,0.312843,0.0,0.0,0.0,0.625687,0.0,0.312843,0.0,0.396802,0.312843
2,0.0,0.0,0.0,0.348842,0.697684,0.0,0.442462,0.0,0.442462,0.0,0.0
3,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# doc corpus 

document1 = """In Greek mythology, Python (Greek: Πύθων, gen.: Πύθωνος) was the earth-dragon of 
Delphi, always represented in Greek sculpture and vase-paintings as a serpent. He presided at the 
Delphic oracle, which existed in the cult center for his mother, Gaia, "Earth," Pytho being the 
place name that was substituted for the earlier Krisa.[1] Hellenes considered the site to be the 
center of the earth, represented by a stone, the omphalos or navel, which Python guarded."""

document2 = """Monty Python (sometimes known as The Pythons)[2][3] were a British surreal comedy 
group who created the sketch comedy show Monty Python's Flying Circus, that first aired on the BBC on 
October 5, 1969. Forty-five episodes were made over four series. The Python phenomenon developed from 
the television series into something larger in scope and impact, spawning touring stage shows, films, 
numerous albums, several books, and a stage musical. The group's influence on comedy has been compared 
to The Beatles' influence on music."""

document3 = """Python is a widely used general-purpose, high-level programming language.[19][20] 
Its design philosophy emphasizes code readability, and its syntax allows programmers to express 
concepts in fewer lines of code than would be possible in languages such as C++ or Java.[21][22] 
The language provides constructs intended to enable clear programs on both a small and large scale."""

corpus = [document1, document2, document3]


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    use_idf=True, smooth_idf=True, norm='l2',
    lowercase=True, 
    stop_words='english', 
    min_df=1, max_df=1.0, max_features=None, 
    ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names())
df

Unnamed: 0,19,1969,20,21,22,aired,albums,allows,bbc,beatles,...,substituted,surreal,syntax,television,touring,used,vase,widely,πύθων,πύθωνος
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.133161,0.0,0.0,0.0,0.0,0.0,0.133161,0.0,0.133161,0.133161
1,0.0,0.126858,0.0,0.0,0.0,0.126858,0.126858,0.0,0.126858,0.126858,...,0.0,0.126858,0.0,0.126858,0.126858,0.0,0.0,0.0,0.0,0.0
2,0.153667,0.0,0.153667,0.153667,0.153667,0.0,0.0,0.153667,0.0,0.0,...,0.0,0.0,0.153667,0.0,0.0,0.153667,0.0,0.153667,0.0,0.0


In [98]:

print ("\ndocument 0:")
print (df.loc[0].sort_values(ascending=False)[:5])

print ("\ndocument 1:")
print (df.loc[1].sort_values(ascending=False)[:5])

print ("\ndocument 2:")
print (df.loc[2].sort_values(ascending=False)[:5])


document 0:
greek          0.399484
earth          0.399484
represented    0.266323
center         0.266323
python         0.157295
Name: 0, dtype: float64

document 1:
comedy       0.380573
group        0.253715
stage        0.253715
series       0.253715
influence    0.253715
Name: 1, dtype: float64

document 2:
language       0.307333
code           0.307333
general        0.153667
programming    0.153667
programmers    0.153667
Name: 2, dtype: float64


The highest 5 terms should be unique identifier of each documents with high TF-IDF scores

# Text Classification with Naive Bayes

### Load Data

In [2]:
# read the data into a pandas dataframe
import pandas as pd
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('Data/MoviePosNeg/neg/', 0) # NEG
dfpos = data2df('Data/MoviePosNeg/pos/', 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
593,cv712_24217.txt,preposterous religious action film ( produced ...,0
379,cv860_15520.txt,i guess that if a very wild bachelor party had...,0
936,cv621_15984.txt,"synopsis : sullen julie james , still haunted ...",0
402,cv723_8648.txt,director jan de bont certainly knows how to ma...,1
2,cv155_7845.txt,""" gordy "" is not a movie , it is a 90-minute-...",0
380,cv914_2856.txt,woof ! too bad that leap of faith was the titl...,0
638,cv057_7962.txt,supposedly based on a true story in which the ...,0
501,cv021_15838.txt,one can not observe a star trek movie and expe...,1
581,cv038_9749.txt,sometimes a movie comes along that falls somew...,1
686,cv644_18551.txt,plot : two sister witches have to live with a ...,0


### Step 2:  Set up data and split data


In [3]:
# setup the data
X, y = df['text'], df['class']
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

<h1><center>Setup Preprocessing and Tfidf Vectorization</center></h1>

In [5]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def preprocess(text):
    # replace one or more white-space characters with a space
    regex = re.compile(r"\s+")                               
    text = regex.sub(' ', text)    
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub(' ', text)           
    # remove stop words
    sw = stopwords.words('english')
    text = text.split()                                              
    text = ' '.join([w for w in text if w not in sw]) 
    # remove short words
    ' '.join([w for w in text.split() if len(w) >= 2])
    # lemmatize
    text = ' '.join([(WordNetLemmatizer()).lemmatize(w) for w in text.split()]) 
    return text

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tv = TfidfVectorizer(
#     preprocessor=preprocess,
#     #lowercase=True, stop_words='english', 
#     use_idf=True, smooth_idf=True, norm='l2',
#     min_df=1, max_df=1.0, max_features=None, 

#     ngram_range=(1, 1))
# XTtrain = pd.DataFrame(tv.fit_transform(Xtrain).toarray(), columns=tv.get_feature_names())
# XTtrain.head()

### Step 4: Pipeline with preprocess and TF-IDF

In [6]:
# setup the preprocessing->model pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
clf = Pipeline(steps=[
    ('pp', TfidfVectorizer(
        #preprocessor=preprocess, (now is using default preprocessor)
        lowercase=True, stop_words='english', 
        use_idf=True, smooth_idf=True, norm='l2',
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1))),
    ('mdl',     MultinomialNB())  #Naive Bayes Classifier
    #('mdl',     RandomForestClassifier())
    ])

### Step 5: Fit best parameter with Grid Search

In [7]:
# setup grid search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'mdl__alpha':[0.01, 0.1, 0.2, 0.5, 1]
    #'mdl__n_estimators':[500, 700, 1000]
}
gscv = GridSearchCV(clf, param_grid, iid=False, cv=4, return_train_score=False)

In [8]:
# search for best parameters/estimator
gscv.fit(Xtrain, ytrain)

#print(gscv.best_estimator_, "\n")
#print(gscv.best_score_, "\n")
print(gscv.best_params_, "\n")
#print(gscv.cv_results_, "\n")

{'mdl__alpha': 1} 



### Step 6: Prediction and Evaluation

In [9]:
# evaluate best_estimator_ on test data
ypred = gscv.best_estimator_.predict(Xtest)
from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.835
[[173  32]
 [ 34 161]]
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       205
           1       0.83      0.83      0.83       195

   micro avg       0.83      0.83      0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

