In [1]:
%matplotlib inline

In [3]:
import nltk

In [23]:
import nltk.stem
from sklearn.feature_extraction.text import CountVectorizer
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [24]:
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [25]:
content = ["How to format my hard disk", "Hard disk format problems"]
X = vectorizer.fit_transform(content)

In [26]:
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'problem']

In [10]:
print(X.toarray())

[[1 1 1 0]
 [1 1 1 1]]


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [30]:
vectorizer = StemmedTfidfVectorizer(min_df = 1, stop_words='english', decode_error='ignore')

In [35]:
vectorizer.fit_transform(content)
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'problem']

## The bag of words model

One of the most important sub-tasks in pattern classification are **feature extraction and selection**. Some important criteria are mentioned below:
* Salient
* Invariant
* Discriminatory

### Stemming and Lemmatization

**Stemming** is the process of transforming a word into its root form. But, this technique can create non-real words. In contras to stemming, **lemmatization** aims to obtain canonical (grammatically correct) forms of the words, the so-called **lemmas**. Lemmatization is computationaly more difficult and expensive than stemming, and in practice, both stemming and lemmatization have little impact on the performance of text classification.

### N-grams

In the n-gram model, a token can be defined as a sequence of n items. Choose the **optimal** number of n depends on the language as well as the particular application. Examples
* Unigram: "El", "perro", "come"
* Bigram: "El perro", "perro come"
* Trigram: "El perro come"

### Bag words model drawbacks

* It doesn't cover word relations
* It doesn't capture negation correctly
* It totally fails with misspelled words

### Some experiments

In [2]:
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors = 2)
print knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=2, p=2, weights='uniform')


In [3]:
knn.fit([[1], [2], [3], [4], [5], [6]], [0, 0, 0, 1, 1, 1])
knn.predict(1.5)

array([0])

In [4]:
knn.predict(37)

array([1])

In [5]:
knn.predict(3.5)

array([0])

In [6]:
knn.predict_proba(3.5)

array([[ 0.5,  0.5]])

In [7]:
knn.predict_proba(1)

array([[ 1.,  0.]])

In [8]:
import re
code_match = re.compile('<pre>(.*?)</pre>',
                       re.MULTILINE | re.DOTALL)
link_match = re.compile('<a href="http://.*?".*?>(.*?)</a>',
                       re.MULTILINE | re.DOTALL)
tag_match = re.compile('<[^>]*>',
                      re.MULTILINE | re.DOTALL)

def extract_features_from_body(s):
    link_count_in_code = 0
    # count links in code
    for match_str in code_match.findall(s):
        link_count_in_code += len(link_match.findall(match_str))
        
    return len(link_match.findall(s)) - link_count_in_code

In [9]:
import numpy as np

In [None]:
X = np.asarray([extract_features_from_body(text) for post_id, text in fetch_posts() \
               if post_id in all_answers])
knn = neighbors.KNeighborsClassifier()
knn.fit(X,y)

In [10]:
X = np.asarray([1,2,3,4,5])

In [None]:
from sklearn.cross_validation import KFold
scores = []

cv = KFold(n=len(X), n_folds=10, indices=True)

for train, test in cv:
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X,Y)
    scores.append(clf.score(X_test, y_test))
    
print "Mean(scores)=%.5f\tStddev(scores)=%.5f"\
        % (np.mean(scores), np.std(scores))

### Designing more features

In [14]:
def extract_features_from_body(s):
    num_code_lines = 0
    link_count_in_code = 0
    code_free_s = s
    
    for match_str in code_match.findall(s):
        num_code_lines += match_str.count('\n')
        code_free_s = code_match.sub("", code_free_s)
        
        link_count_in_code += len(link_match.findall(match_str))
    
    links = link_match.findall(s)
    link_count = len(links)
    link_count -= link_count_in_code
    html_free_s = re.sub(" +", " ", 
                        tag_match.sub("", code_free_s)).replace("\n","")
    link_free_s = html_free_s
    
    for link in links:
        if link.lower().startswith("http://"):
            link_free_s = link_free_s.replace(link, "")
   
    num_text_tokens = html_free_s.count(" ")
    
    return num_text_tokens, num_code_lines, link_count

## More examples

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range = (1,3),
                                  analyzer="word", binary=False)
    clf = MultinomialNB()
    return Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

In [11]:
from sklearn.metrics import precision_recall_curve, auc
from sklearn.cross_validation import ShuffleSplit

def train_model(clf_factory, X, Y):
    cv = ShuffleSplit(n=len(X), n_iter = 10, test_size=0.3, random_state=0)
    
    scores = []
    pr_scores = []
    
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], y[test]
        
        clf = clf_factory()
        clf.fit(X_train, y_train)
        
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        
        precision, recall, pr_thresholds = \
        precision_recall_curve(y_test, proba[:,1])
        
        pr_scores.append(auc(recall, precision))
        summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f" % summary)

### Grid Search

In [28]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score

def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n = len(X), n_iter = 10, test_size = 0.3, random_state = 0
    )
    
    param_grid = dict(vect__ngram_range=[(1,1),(1,2),(1,3)],
                      vect__min_df=[1,2],
                      vect__stop_words=[None, "english"],
                      #vect__smooth_idf = [False, True],
                      #vect__use_idf = [False, True],
                      #vect__sublinear_tf = [False, True],
                      #vect__binary = [False, True]
                     )
    
    grid_search = GridSearchCV(clf_factory(),
                              param_grid,
                              cv = cv,
                              score_func = f1_score,
                              verbose=10)
    grid_search.fit(X,Y)
    
    return grid_search.best_estimator_

In [29]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',\
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train', \
                                  categories=categories, shuffle=True, random_state=42)
X = twenty_train.data
y = twenty_train.target

clf = grid_search_model(create_ngram_model, X, y)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.914454 -   0.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.896755 -   1.0s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    2.0s



[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.935103 -   1.1s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.904130 -   0.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.914454 -   1.1s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.892330 -   1.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.911504 -   1.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.882006 -   1.3s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    8.4s



[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.910029 -   1.6s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=None, score=0.907080 -   1.7s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0.952802 -   1.5s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0.949853 -   1.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0.974926 -   1.1s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:   15.5s
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:   21.0s



[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0.955752 -   1.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__stop_words=english, score=0.958702 -   1.4s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, score=0.911504 -   5.2s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, score=0.896755 -   4.3s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, score=0.924779 -   4.2s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, score=0.907080 - 

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:   47.1s
[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:  1.3min



[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english, score=0.979351 -   3.6s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english, score=0.969027 -   3.7s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english, score=0.954277 -   3.8s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english, score=0.955752 -   3.8s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english, score=0.966077 -   3.2s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=englis

[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:  3.2min



[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english, score=0.954277 -   5.3s
[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english, score=0.964602 -   5.5s
[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english, score=0.979351 -   7.2s
[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english, score=0.973451 -   5.9s
[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english, score=0.961652 -   5.7s
[CV] vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 3), vect__min_df=1, vect__stop_words=englis

[Parallel(n_jobs=1)]: Done  61 jobs       | elapsed:  4.3min
[Parallel(n_jobs=1)]: Done  72 jobs       | elapsed:  4.5min



[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english, score=0.973451 -   1.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english, score=0.971976 -   1.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english, score=0.954277 -   1.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english, score=0.958702 -   0.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english, score=0.961652 -   1.1s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__stop_words=englis

[Parallel(n_jobs=1)]: Done  85 jobs       | elapsed:  4.9min
[Parallel(n_jobs=1)]: Done  98 jobs       | elapsed:  5.7min



[CV] vect__ngram_range=(1, 2), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=2, vect__stop_words=english, score=0.955752 -   2.9s
[CV] vect__ngram_range=(1, 2), vect__min_df=2, vect__stop_words=english 
[CV]  vect__ngram_range=(1, 2), vect__min_df=2, vect__stop_words=english, score=0.967552 -   2.9s
[CV] vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None, score=0.929204 -   7.8s
[CV] vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None, score=0.915929 -   8.1s
[CV] vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None, score=0.943953 -   7.9s
[CV] vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None .
[CV]  vect__ngram_range=(1, 3), vect__min_df=2, vect__stop_words=None, score=0.932153 - 

[Parallel(n_jobs=1)]: Done 113 jobs       | elapsed:  7.4min
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  8.1min finished





In [26]:
twenty_train.data[0]
twenty_train.data[1]
twenty_train.target[0]

1

In [30]:
clf

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [32]:
twenty_test = fetch_20newsgroups(subset='test', \
                                  categories=categories, shuffle=True, random_state=42)
X_test = twenty_test.data
y_test = twenty_test.target

In [33]:
pred = clf.predict(X_test)

In [39]:
from sklearn import metrics

print("accuracy: %.3f") % metrics.accuracy_score(y_test, pred)
print metrics.classification_report(y_test, pred, target_names = categories)

accuracy: 0.905
                        precision    recall  f1-score   support

           alt.atheism       0.96      0.80      0.87       319
soc.religion.christian       0.92      0.94      0.93       389
         comp.graphics       0.94      0.88      0.91       396
               sci.med       0.83      0.97      0.90       398

           avg / total       0.91      0.90      0.90      1502



### Sentiment Analysis Test