In [1]:
%matplotlib inline

In [3]:
import nltk

In [23]:
import nltk.stem
from sklearn.feature_extraction.text import CountVectorizer
# Shared English Snowball stemmer; the stemming vectorizer classes in this notebook call its .stem().
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token with ``english_stemmer``."""

    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stemmed tokens.

        The analyzer built by the parent class is wrapped so that each token it
        yields is passed through ``english_stemmer.stem``.
        """
        # super() must name *this* class so the MRO lookup starts at
        # CountVectorizer; naming CountVectorizer here would skip it.
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [24]:
# Stemming count vectorizer configured with min_df=1 and the 'english' stop-word setting.
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [25]:
# Two toy documents that share most of their vocabulary.
content = [
    "How to format my hard disk",
    "Hard disk format problems",
]
# Fit the vocabulary and vectorize the documents in a single step.
X = vectorizer.fit_transform(content)

In [26]:
# Show the fitted vocabulary; the terms are stemmed (e.g. 'problems' appears as 'problem').
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'problem']

In [10]:
# Convert the fit_transform result to a dense array for display: one row per document, one column per feature.
print(X.toarray())

[[1 1 1 0]
 [1 1 1 1]]


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems every token with ``english_stemmer``."""

    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stemmed tokens.

        The analyzer built by the parent class is wrapped so that each token it
        yields is passed through ``english_stemmer.stem``.
        """
        # super() must name *this* class so the MRO lookup starts at
        # TfidfVectorizer; naming TfidfVectorizer here would skip it.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [30]:
# Rebind `vectorizer` to the tf-idf flavour of the stemming vectorizer.
vectorizer = StemmedTfidfVectorizer(
    min_df=1,
    stop_words='english',
    decode_error='ignore',
)

In [35]:
# Fit on the toy documents (the transformed matrix is discarded), then list the learned vocabulary.
vectorizer.fit_transform(content)
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'problem']

## The bag of words model

One of the most important sub-tasks in pattern classification is **feature extraction and selection**. Some important criteria for good features are listed below:
* Salient
* Invariant
* Discriminatory

### Stemming and Lemmatization

**Stemming** is the process of transforming a word into its root form. However, this technique can create non-real words. In contrast to stemming, **lemmatization** aims to obtain canonical (grammatically correct) forms of the words, the so-called **lemmas**. Lemmatization is computationally more difficult and expensive than stemming, and in practice, both stemming and lemmatization have little impact on the performance of text classification.

### N-grams

In the n-gram model, a token can be defined as a sequence of n items. Choosing the **optimal** value of n depends on the language as well as the particular application. Examples:
* Unigram: "El", "perro", "come"
* Bigram: "El perro", "perro come"
* Trigram: "El perro come"

### Bag-of-words model drawbacks

* It doesn't cover word relations
* It doesn't capture negation correctly
* It totally fails with misspelled words

### Some experiments

In [2]:
from sklearn import neighbors

# Two-neighbour classifier used for the toy examples in this section.
knn = neighbors.KNeighborsClassifier(n_neighbors=2)
# Function-call form of print: same output on Python 2, and valid on Python 3.
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=2, p=2, weights='uniform')


In [3]:
# Six one-feature samples: values 1-3 carry label 0, values 4-6 carry label 1.
knn.fit([[1], [2], [3], [4], [5], [6]], [0, 0, 0, 1, 1, 1])
# Queries use the same (n_samples, n_features) layout as the training data,
# so a single value is passed as a 1x1 nested list rather than a bare scalar.
knn.predict([[1.5]])

array([0])

In [4]:
# A value far outside the training range; passed as one sample with one feature.
knn.predict([[37]])

array([1])

In [5]:
# Midpoint between the two classes; passed as one sample with one feature.
knn.predict([[3.5]])

array([0])

In [6]:
# Class probabilities at the midpoint; passed as one sample with one feature.
knn.predict_proba([[3.5]])

array([[ 0.5,  0.5]])

In [7]:
# Class probabilities at a training point of class 0; passed as one sample with one feature.
knn.predict_proba([[1]])

array([[ 1.,  0.]])

In [8]:
import re
# All three patterns are compiled with the same flags. re.S (DOTALL) lets '.'
# span newlines, which multi-line <pre> blocks need; re.M (MULTILINE) only
# affects '^'/'$', which none of these patterns use.

# Contents of a <pre>...</pre> block (non-greedy, so each block matches separately).
code_match = re.compile('<pre>(.*?)</pre>', flags=re.M | re.S)
# Anchor text of an <a> tag whose href starts with http://.
link_match = re.compile('<a href="http://.*?".*?>(.*?)</a>', flags=re.M | re.S)
# Any single HTML tag.
tag_match = re.compile('<[^>]*>', flags=re.M | re.S)

def extract_features_from_body(s):
    """Return the number of links in ``s`` that lie outside ``<pre>`` blocks.

    Links are whatever ``link_match`` finds; code blocks are whatever
    ``code_match`` finds. The result is the link count over the whole body
    minus the link count found inside the code blocks.
    """
    total_links = len(link_match.findall(s))
    links_in_code = sum(len(link_match.findall(snippet))
                        for snippet in code_match.findall(s))
    return total_links - links_in_code

In [9]:
import numpy as np

In [None]:
# One scalar feature (link count) per answered post.
# NOTE(review): fetch_posts, all_answers and y are not defined anywhere in the
# visible notebook -- they must be provided before this cell can run.
# The estimator expects X shaped (n_samples, n_features), so the flat vector of
# per-post counts is reshaped into a single column to line up with y.
X = np.asarray([extract_features_from_body(text)
                for post_id, text in fetch_posts()
                if post_id in all_answers]).reshape(-1, 1)
knn = neighbors.KNeighborsClassifier()
knn.fit(X, y)

In [10]:
# Small stand-in array holding the integers 1 through 5.
X = np.asarray(list(range(1, 6)))

In [None]:
from sklearn.cross_validation import KFold
# Accuracy of a fresh k-NN classifier on each held-out fold.
# NOTE(review): y is not defined in the visible notebook; it must be an array
# of labels aligned with X before this cell can run.
scores = []

# The deprecated `indices` flag is omitted; the folds are still used below as
# index arrays into X and y.
cv = KFold(n=len(X), n_folds=10)

for train, test in cv:
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    clf = neighbors.KNeighborsClassifier()
    # Fit on the training fold only: fitting on the full data set would leak
    # the test samples into the model and inflate the scores.
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

# Function-call form of print: same output on Python 2, and valid on Python 3.
print("Mean(scores)=%.5f\tStddev(scores)=%.5f"
      % (np.mean(scores), np.std(scores)))

### Designing more features

In [14]:
def extract_features_from_body(s):
    """Extract three simple features from an HTML post body.

    Parameters
    ----------
    s : str
        Post body as HTML.

    Returns
    -------
    tuple
        ``(num_text_tokens, num_code_lines, link_count)`` where

        * ``num_text_tokens`` is the number of single spaces left in the text
          after removing ``<pre>`` blocks, HTML tags, newlines and any link
          text that is itself a bare ``http://`` URL;
        * ``num_code_lines`` is the number of newline characters inside
          ``<pre>`` blocks;
        * ``link_count`` is the number of links outside ``<pre>`` blocks.
    """
    code_blocks = code_match.findall(s)
    num_code_lines = sum(block.count('\n') for block in code_blocks)
    # Links inside code samples are not counted as links of the post.
    link_count_in_code = sum(len(link_match.findall(block))
                             for block in code_blocks)
    # A single substitution strips every <pre> block, so it does not need to
    # be repeated once per match.
    code_free_s = code_match.sub("", s)

    links = link_match.findall(s)
    link_count = len(links) - link_count_in_code

    # Drop the remaining tags, squeeze runs of spaces, and remove newlines.
    html_free_s = re.sub(" +", " ",
                         tag_match.sub("", code_free_s)).replace("\n", "")

    # Remove link texts that are bare URLs so they are not counted as words.
    link_free_s = html_free_s
    for link in links:
        if link.lower().startswith("http://"):
            link_free_s = link_free_s.replace(link, "")
    # Removing a URL can leave two spaces side by side; squeeze them again so
    # the space count reflects the remaining words.
    link_free_s = re.sub(" +", " ", link_free_s)

    # Count on the link-free text (previously the link removal was computed
    # but the count was taken from html_free_s, making it dead code).
    num_text_tokens = link_free_s.count(" ")

    return num_text_tokens, num_code_lines, link_count