In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, LeaveOneOut, ShuffleSplit
from sklearn.datasets import load_iris, load_digits, load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
%matplotlib inline

In [2]:
skf = StratifiedKFold(n_splits=3)
logreg = LogisticRegression()

cross_val_score(logreg, load_iris().data, load_iris().target, cv = ShuffleSplit())

array([1.        , 1.        , 0.93333333, 1.        , 0.93333333,
       0.93333333, 0.93333333, 0.93333333, 0.93333333, 1.        ])

In [3]:
digits = load_digits()
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state = 0)

In [4]:
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
pred_most_frequent = dummy_majority.predict(X_test)
np.unique(pred_most_frequent), dummy_majority.score(X_test, y_test)

(array([False]), 0.8955555555555555)

In [5]:
forest = RandomForestClassifier(n_estimators=100, max_depth=20).fit(X_train, y_train)
pred_forest = forest.predict(X_test)
forest.score(X_test, y_test)

0.9777777777777777

In [6]:
logreg = LogisticRegression().fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
logreg.score(X_test, y_test)

0.9755555555555555

In [7]:
confusion_matrix(y_test, pred_logreg)

array([[399,   4],
       [  7,  40]], dtype=int64)

In [8]:
confusion_matrix(y_test, pred_forest)

array([[403,   0],
       [ 10,  37]], dtype=int64)

### pipeline

In [9]:
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state = 0)

pipe = Pipeline([("scaler", MinMaxScaler()),("svm",SVC())])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.951048951048951

In [10]:
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],'svm__gamma':[0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid = param_grid, cv = 5)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.score(X_test, y_test))
print(grid.best_params_)

0.9812206572769953
0.972027972027972
{'svm__C': 1, 'svm__gamma': 1}


### bag-of-words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = ["The fool doth think he is wise,", "but the wise man knows himself to be a fool"]
vect = CountVectorizer().fit(bards_words)
vect.transform(bards_words).toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]], dtype=int64)

In [13]:
vect = CountVectorizer(ngram_range=(1,1)).fit(bards_words)
print(vect.get_feature_names())

['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']


In [14]:
vect = CountVectorizer(ngram_range=(2,2)).fit(bards_words)
print(vect.get_feature_names())

['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']


In [3]:
vect = CountVectorizer(ngram_range=(1,3)).fit(bards_words)
print(vect.get_feature_names())

['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']


In [4]:
from sklearn.datasets import load_files

reviews_train = load_files("data/train/")
text_train, y_train = reviews_train.data, reviews_train.target
print(text_train[0])
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

df = pd.DataFrame({'X_train': text_train, 'y_train': y_train})
X_train = df[df['y_train']!=2]['X_train']
y_train = df[df['y_train']!=2]['y_train']

review_text = load_files("data/test/")
text_test, y_test = review_text.data, review_text.target
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

b'Full of (then) unknown actors TSF is a great big cuddly romp of a film.<br /><br />The idea of a bunch of bored teenagers ripping off the local sink factory is odd enough, but add in the black humour that Forsyth & Co are so good at and your in for a real treat.<br /><br />The comatose van driver by itself worth seeing, and the canal side chase is just too real to be anything but funny.<br /><br />And for anyone who lived in Glasgow it\'s a great "Oh I know where that is" film.'


In [17]:
vect.fit(X_train)
X_train = vect.transform(X_train)
X_test = vect.transform(text_test)

from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(), X_train, y_train, cv = 5).mean()

0.8282399999999999

In [18]:
param_grid = {'C':[ 0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
grid.fit(X_train, y_train)
grid.best_score_, grid.best_params_, grid.score(X_test, y_test)

(0.82964, {'C': 10}, 0.84212)

In [19]:
X_train = df[df['y_train']!=2]['X_train']

vect = CountVectorizer(min_df=5).fit(X_train)

X_train = vect.transform(X_train)
X_test = vect.transform(text_test)
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5).fit(X_train, y_train)
grid.best_score_, grid.best_params_, grid.score(X_test, y_test)

(0.8874, {'C': 0.1}, 0.87784)

In [20]:
X_train = df[df['y_train']!=2]['X_train']

vect = CountVectorizer(min_df = 5, stop_words = 'english').fit(X_train)

X_train = vect.transform(X_train)
X_test = vect.transform(text_test)
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5).fit(X_train, y_train)
grid.best_score_, grid.best_params_, grid.score(X_test, y_test)

(0.88368, {'C': 0.1}, 0.87252)

### tf-idf

In [21]:
# TfidfTransformer: take in the sparse matrix output produced by CountVectorizer and transform it.
# TfidfVectorizer: take in the text data and do both the bag-of-words feature extraction and the tf-idf transmation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
pipe = make_pipeline(TfidfVectorizer(min_df = 5, norm = None), LogisticRegression())

grid = GridSearchCV(pipe, param_grid, cv = 5)
X_train = df[df['y_train']!=2]['X_train']
grid.fit(X_train, y_train)
grid.best_score_

0.89416

In [22]:
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = df[df['y_train']!=2]['X_train']
X_train = vectorizer.transform(X_train)
max_value = X_train.max(axis= 0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

feature_names = np.array(vectorizer.get_feature_names())

feature_names[sorted_by_tfidf[:10]], feature_names[sorted_by_tfidf[-10:]]

(array(['poignant', 'disagree', 'instantly', 'importantly', 'lacked',
        'occurred', 'currently', 'altogether', 'nearby', 'undoubtedly'],
       dtype='<U20'),
 array(['dominick', 'the', 'victor', 'bridget', 'victoria', 'khouri',
        'zizek', 'rob', 'timon', 'titanic'], dtype='<U20'))

In [23]:
# low inverse document frequency: those appear frequently and deemed less important
sorted_by_idf = np.argsort(vectorizer.idf_)
feature_names[sorted_by_idf[:50]]

array(['the', 'and', 'of', 'to', 'this', 'is', 'it', 'in', 'that', 'but',
       'for', 'with', 'was', 'as', 'on', 'movie', 'not', 'have', 'one',
       'be', 'film', 'are', 'you', 'all', 'at', 'an', 'by', 'so', 'from',
       'like', 'who', 'they', 'there', 'if', 'his', 'out', 'just',
       'about', 'he', 'or', 'has', 'what', 'some', 'good', 'can', 'more',
       'when', 'time', 'up', 'very'], dtype='<U20')

In [24]:
df2 = pd.DataFrame(grid.best_estimator_.named_steps['logisticregression'].coef_.reshape(-1,1), index = feature_names)
df2.sort_values(by = 0,inplace= True)
pd.concat([df2[:20], df2[-20:]]).T

Unnamed: 0,worst,waste,awful,bad,boring,poor,poorly,worse,terrible,disappointment,...,fun,today,favorite,amazing,loved,wonderful,best,perfect,excellent,great
0,-0.241149,-0.189228,-0.168824,-0.167629,-0.153832,-0.128468,-0.120763,-0.120657,-0.117846,-0.116706,...,0.093707,0.093835,0.099491,0.100521,0.102367,0.110598,0.11524,0.1201,0.154079,0.162401


In [25]:
pipe = make_pipeline(TfidfVectorizer(min_df = 5),LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10],
             'tfidfvectorizer__ngram_range':[(1,1), (1,2), (1,3)]}

X_train = df[df['y_train']!=2]['X_train']
grid = GridSearchCV(pipe, param_grid, cv = 5).fit(X_train, y_train)
grid.best_score_, grid.best_params_

(0.90576,
 {'logisticregression__C': 10, 'tfidfvectorizer__ngram_range': (1, 3)})

In [26]:
vect = grid.best_estimator_.named_steps['tfidfvectorizer']
feature_names = np.array(vect.get_feature_names())
coef = grid.best_estimator_.named_steps['logisticregression'].coef_
df3 = pd.DataFrame(coef.reshape(-1,1), index = feature_names)
df3.sort_values(by = 0,inplace= True)
pd.concat([df3[:20], df3[-20:]]).T

Unnamed: 0,bad,worst,awful,boring,the worst,poor,waste,terrible,worse,no,...,the best,loved,best,today,fun,amazing,wonderful,perfect,excellent,great
0,-13.628529,-13.555011,-11.964566,-11.358971,-10.358862,-10.282947,-9.535128,-8.478191,-8.356218,-8.071425,...,6.17336,6.187728,6.426293,6.984822,7.226938,7.339721,8.698145,9.022522,10.519394,13.034443


In [42]:
mask = np.array([len(feature.split()) for feature in df3.index.values])==3
df3[mask]

Unnamed: 0,0
of the worst,-5.487569
waste of time,-3.093326
supposed to be,-2.710675
none of the,-2.536097
the worst movie,-2.397477
to sit through,-2.380264
first of all,-2.297215
not one of,-2.289769
some kind of,-2.263779
of the movie,-2.254693


#### Advanced tokenization, stemming and lemmatization

In [9]:
# stemming
# lemmatization
import spacy
import nltk

en_nlp = spacy.load('en')
stemmer = nltk.stem.PorterStemmer()

def compare_normalization(doc):
    doc_spacy = en_nlp(doc)
    print('Lmmatization:')
    print([token.lemma_ for token in doc_spacy])
    print('Stemming:')
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

In [10]:
compare_normalization(u"Our meeting today was worse than yesterday," "I'm scared of meeting the clients tomorrow.")

Lmmatization:
['-PRON-', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', "i'm", 'scar', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', "i'm", 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
