In [1]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 300
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

In [2]:
!tree -dL 2 ../data/aclImdb

../data/aclImdb
├── test
│   ├── neg
│   └── pos
└── train
    ├── neg
    └── pos

6 directories


In [3]:
from sklearn.datasets import load_files

# load_files returns a Bunch: .data holds the training texts and
# .target the matching training labels
reviews_train = load_files("../data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

# Sanity check: container type, number of documents, one raw sample
print(
    "type of text_train: {}".format(type(text_train)),
    "length of text_train: {}".format(len(text_train)),
    "text_train[1]:\n{}".format(text_train[1]),
    sep="\n",
)

type of text_train: <class 'list'>
length of text_train: 25000
text_train[1]:
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decisio

In [20]:
# The documents are stored as bytes; decode one so it prints as readable text
print(text_train[11451].decode("utf-8"))

... than this ;-) What would happen if Terry Gilliam and Douglas Adams would have worked together on one movie? This movie starts with a touch of Brazil... when, at a certain point, the story moves straight into the twilight zone... bringing up nothing new, but just nothing... and nothing is great fun! When Dave and Andrew starts to explore their new environment the movie gets really enjoyable... bouncing heads? well... yes ;-) <br /><br />anyway... this movie was, imho, the biggest surprise at this year's FantasyFilmFest...<br /><br />Just like in Cube and Cypher Natali gave this one a minimalistic, weird but very special design, which makes it hard to locate the place of the story or its time... timeless somehow...


In [13]:
# Another decoded sample; note the literal <br /> markup left in the raw text
print(text_train[16019].decode("utf-8"))

**May Contain Spoilers**<br /><br />A dude in a dopey-looking Kong suit (the same one used in KING KONG VS. GODZILLA in 1962) provides much of the laffs in this much-mocked monster flick. Kong is resurrected on Mondo Island and helps out the lunkhead hero and other good guys this time around. The vampire-like villain is named Dr. Who-funny, he doesn't look like Peter Cushing! Kong finally dukes it out with Who's pride and joy, a giant robot ape that looks like a bad metal sculpture of Magilla Gorilla. Like many of Honda's flicks this may have had some merit before American audiences diddled around with it and added new footage. The Rankin/Bass animation company had a hand in this mess. They should have stuck to superior children's programs like The Little Drummer Boy.


In [22]:
import xml.etree.ElementTree as ET

In [23]:
import requests

In [27]:
# Download the European Parliament members listing as XML.
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of handing an
# error page to the XML parser in a later cell.
response = requests.get(
    "http://www.europarl.europa.eu/meps/en/xml.html?query=full&filter=all",
    timeout=30,
)
response.raise_for_status()

In [37]:
# Parse the raw bytes rather than response.text: .text is decoded with the
# charset requests guesses from the HTTP headers, which can garble
# non-ASCII names, whereas the XML parser reads the encoding from the
# document itself when given bytes.
data_xml = ET.fromstring(response.content)

In [40]:
# Element.getchildren() was removed in Python 3.9; iterating the element
# yields the same direct children on every Python 3 version.
members_xml = list(data_xml)

In [65]:
# One record per member element: each child's tag becomes a column name and
# its text the cell value
members_dict = [dict((field.tag, field.text) for field in member)
                for member in members_xml]
members = pd.DataFrame(data=members_dict)

In [66]:
# Preview the first five parsed members
members.head(n=5)

Unnamed: 0,country,fullName,id,nationalPoliticalGroup,politicalGroup
0,Sweden,Lars ADAKTUSSON,124990,Kristdemokraterna,Group of the European People's Party (Christia...
1,Italy,Isabella ADINOLFI,124831,Movimento 5 Stelle,Europe of Freedom and Direct Democracy Group
2,Italy,Marco AFFRONTE,124797,Movimento 5 Stelle,Group of the Greens/European Free Alliance
3,Italy,Laura AGEA,124811,Movimento 5 Stelle,Europe of Freedom and Direct Democracy Group
4,United Kingdom,John Stuart AGNEW,96897,United Kingdom Independence Party,Europe of Freedom and Direct Democracy Group


In [68]:
# Two-sentence toy corpus used to illustrate the vectorizer options below
mallory = [
    "Do you want ants?",
    "Because that’s how you get ants.",
]

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the toy corpus with default settings
vect = CountVectorizer()
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out(), which returns an array). Ordering the fitted
# vocabulary_ (term -> column index) by index yields the same plain list
# on every scikit-learn version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'because', 'do', 'get', 'how', 'that', 'want', 'you']


In [74]:
# Encode both sentences against the fitted vocabulary; the result is a
# sparse matrix with one row per sentence and one column per term
X = vect.transform(mallory)
X

<2x8 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [76]:
# Densify only to inspect the counts; fine here because the matrix is tiny
X.toarray()

array([[1, 0, 1, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [81]:
# Show the original sentences followed by, for each one, the vocabulary
# terms that inverse_transform recovers from its row of X
print(
    mallory,
    vect.inverse_transform(X)[0],
    vect.inverse_transform(X)[1],
    sep="\n",
)

['Do you want ants?', 'Because that’s how you get ants.']
['ants' 'do' 'want' 'you']
['ants' 'because' 'get' 'how' 'that' 'you']


# Classification example

In [105]:
from sklearn.datasets import load_files

# Load the labelled training reviews for the classification example
reviews_train = load_files("../data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

# Summarise what was loaded: container type, size, per-class counts, one sample
print(
    "type of text_train: {}".format(type(text_train)),
    "length of text_train: {}".format(len(text_train)),
    "class balance: {}".format(np.bincount(y_train)),
    "text_train[1]:\n{}".format(text_train[1]),
    sep="\n",
)

type of text_train: <class 'list'>
length of text_train: 25000
class balance: [12500 12500]
text_train[1]:
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were 

In [107]:
# Bag-of-words encoder with default settings
vect = CountVectorizer()

# Carve a validation set out of the training reviews, stratified on the
# label so both parts keep the same class balance
(text_train_sub, text_val,
 y_train_sub, y_val) = train_test_split(text_train, y_train,
                                        random_state=0, stratify=y_train)

# Learn the vocabulary on the training part only, then reuse it unchanged
# for the validation part
X_train = vect.fit_transform(text_train_sub)
X_val = vect.transform(text_val)

In [108]:
# Display the sparse training matrix summary (documents x vocabulary terms)
X_train

<18750x66651 sparse matrix of type '<class 'numpy.int64'>'
	with 2580448 stored elements in Compressed Sparse Row format>

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Ordering the fitted
# vocabulary_ (term -> column index) by index gives the same list of terms,
# in column order, on every scikit-learn version.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
# First ten terms, then every 2000th term as a rough tour of the vocabulary
print(feature_names[:10])
print(feature_names[::2000])

In [111]:
from sklearn.linear_model import LogisticRegressionCV
# Cross-validated logistic regression on the bag-of-words counts, "sag" solver.
# NOTE(review): the output recorded under this cell is %timeit-style timing,
# so it was presumably last executed with a %timeit prefix — confirm.
lr = LogisticRegressionCV(solver="sag").fit(X_train, y_train_sub)



1 loop, best of 3: 1min 9s per loop


In [112]:
# Time the same fit with the solver left at its default, for comparison.
# %timeit runs the statement in its own scope, so `lr` is not rebound here.
%timeit lr = LogisticRegressionCV().fit(X_train, y_train_sub)

1 loop, best of 3: 43.2 s per loop


In [None]:
# Time the same fit with the liblinear solver, for comparison
%timeit lr = LogisticRegressionCV(solver='liblinear').fit(X_train, y_train_sub)

# Vectorization options

In [84]:
# Show the regular expression the vectorizer uses to pull tokens out of text
print(vect.token_pattern)

(?u)\b\w\w+\b


In [91]:
# Allow single-character tokens as well (note the stray "s" this produces)
vect = CountVectorizer(token_pattern=r"\b\w+\b")
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'because', 'do', 'get', 'how', 's', 'that', 'want', 'you']


In [93]:
vect = CountVectorizer(token_pattern=r"\b\w[\w’]+\b")
# The character in the class above is not an ASCII apostrophe but the
# Unicode right single quotation mark, because the quote in `mallory`
# was copied and pasted with it.
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'because', 'do', 'get', 'how', 'that’s', 'want', 'you']


In [95]:
# Drop scikit-learn's built-in English stop words before counting
vect = CountVectorizer(stop_words='english')
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'want']


In [99]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Dump every entry of scikit-learn's built-in English stop-word collection
print([word for word in ENGLISH_STOP_WORDS])

['there', 'else', 'two', 'perhaps', 'get', 'inc', 'find', 'interest', 'between', 'give', 'amongst', 'however', 'former', 'nine', 'please', 'us', 'about', 'almost', 'but', 'thereupon', 'call', 'ie', 'third', 'whereby', 'whole', 'whose', 'one', 'afterwards', 'only', 'somehow', 'is', 'eight', 'nothing', 'an', 'with', 'describe', 'than', 'itself', 'do', 'thin', 'cry', 'hundred', 'its', 'latterly', 'formerly', 'name', 'no', 'via', 'hereupon', 'well', 'system', 'so', 'un', 'mill', 'neither', 'she', 'seems', 'or', 'though', 'against', 'wherever', 'very', 'within', 'con', 'during', 'whom', 'per', 'front', 'much', 'sometimes', 'ten', 'next', 'those', 'anyhow', 'fill', 'became', 'along', 'never', 'this', 'that', 'our', 'all', 'be', 'may', 'made', 'should', 'for', 'keep', 'onto', 'below', 'here', 'been', 'of', 'once', 'themselves', 'whereas', 'three', 'hereby', 'several', 'how', 'even', 'whither', 'her', 'herself', 'other', 'will', 'around', 'a', 'seem', 'because', 'it', 'across', 'take', 'enough

In [100]:
# Keep only terms that appear in at least two documents
vect = CountVectorizer(min_df=2)
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'you']


In [102]:
# Cap the vocabulary at four terms
vect = CountVectorizer(max_features=4)
vect.fit(mallory)
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

['ants', 'because', 'do', 'you']


In [None]:
# Re-vectorize the reviews keeping only terms found in at least two training
# documents, then compare vocabulary sizes with the unrestricted encoding.
vect = CountVectorizer(min_df=2)
X_train_df2 = vect.fit_transform(text_train_sub)
# Fixed: this line used to assign to `X_val_df2.shape`, which fails with a
# NameError because X_val_df2 does not exist yet.
X_val_df2 = vect.transform(text_val)
print(X_train.shape)
print(X_train_df2.shape)

# n-grams

In [None]:
# Unigrams only (the default n-gram range, spelled out for comparison)
cv = CountVectorizer(ngram_range=(1, 1)).fit(mallory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print("Vocabulary:\n{}".format(sorted(cv.vocabulary_, key=cv.vocabulary_.get)))

In [None]:
# Bigrams only: every feature is a pair of adjacent tokens
cv = CountVectorizer(ngram_range=(2, 2)).fit(mallory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print("Vocabulary:\n{}".format(sorted(cv.vocabulary_, key=cv.vocabulary_.get)))

In [None]:
# Unigrams and bigrams together
cv = CountVectorizer(ngram_range=(1, 2)).fit(mallory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print("Vocabulary:\n{}".format(sorted(cv.vocabulary_, key=cv.vocabulary_.get)))

# Character n-grams

In [None]:
# Character 2- and 3-grams built only from text inside word boundaries.
# Fixed: the analyzer is spelled "char_wb"; "charwb" is not an accepted
# value and is rejected by CountVectorizer.
cv = CountVectorizer(ngram_range=(2, 3), analyzer="char_wb").fit(mallory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_
# (term -> column index) by index gives the same list on every version.
print("Vocabulary:\n{}".format(sorted(cv.vocabulary_, key=cv.vocabulary_.get)))