In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
# Toy categorical column: one weather label per observation.
categorical_feature = pd.Series(data=['sunny', 'cloudy', 'snowy', 'foggy'])

In [3]:
# One-hot encode the labels with pandas: one indicator column per distinct value.
mapping = pd.get_dummies(data=categorical_feature)

In [4]:
# Show the indicator table: one column per label, one row per observation.
mapping

Unnamed: 0,cloudy,foggy,snowy,sunny
0,0,0,0,1
1,1,0,0,0
2,0,0,1,0
3,0,1,0,0


In [5]:
# scikit-learn route: LabelEncoder turns strings into integer codes,
# OneHotEncoder turns those codes into indicator vectors.
le, ohe = LabelEncoder(), OneHotEncoder()

In [6]:
# Every weather level the encoders should know about.
levels = [
    'sunny',
    'cloudy',
    'snowy',
    'rainy',
    'foggy',
]

In [7]:
# Learn the label -> integer mapping, then encode the same labels with it.
fit_levels = le.fit(levels).transform(levels)

In [8]:
# The encoder expects a 2-D input (samples x features): present the integer
# codes as a single-feature column.
ohe.fit(fit_levels.reshape(-1, 1))

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [9]:
# Encode one label: string -> integer code -> dense indicator row.
ohe.transform(le.transform(['sunny']).reshape(1, -1)).toarray()

array([[0., 0., 0., 0., 1.]])

In [10]:
# Encode one label: string -> integer code -> dense indicator row.
ohe.transform(le.transform(['cloudy']).reshape(1, -1)).toarray()

array([[1., 0., 0., 0., 0.]])

#### Working with text

In [11]:
from sklearn.datasets import fetch_20newsgroups

In [12]:
# Restrict the corpus to two newsgroups.
categories = [
    'sci.med',
    'sci.space',
]

In [13]:
# Fetch (or load from the local cache) the training split for the chosen groups.
twenty_sci_news = fetch_20newsgroups(subset='train', categories=categories)

In [14]:
# Path of the source file for each loaded post.
twenty_sci_news.filenames

array(['/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61116',
       '/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58122',
       '/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58903',
       ...,
       '/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60774',
       '/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60954',
       '/home/alex/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58911'],
      dtype='<U91')

In [15]:
# Integer class label of each post.
twenty_sci_news.target

array([1, 0, 0, ..., 1, 1, 0])

In [16]:
# Newsgroup names of the loaded categories.
twenty_sci_news.target_names

['sci.med', 'sci.space']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Bag-of-words: learn the vocabulary from the posts and count token
# occurrences per document in a single pass.
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)

In [19]:
# (number of documents, vocabulary size) of the count matrix.
word_count.shape

(1187, 25638)

In [20]:
# print() lists the stored (row, column) -> count entries of the first
# document's sparse row.
print(word_count[0])

  (0, 10778)	1
  (0, 23849)	1
  (0, 9796)	1
  (0, 12716)	1
  (0, 18586)	1
  (0, 13384)	1
  (0, 5134)	1
  (0, 10785)	1
  (0, 15246)	1
  (0, 11330)	1
  (0, 5148)	1
  (0, 13318)	1
  (0, 18744)	1
  (0, 20110)	1
  (0, 18642)	1
  (0, 3808)	2
  (0, 10188)	1
  (0, 6017)	3
  (0, 24930)	1
  (0, 18474)	1
  (0, 23241)	1
  (0, 23129)	1
  (0, 3191)	1
  (0, 12362)	1
  (0, 15968)	1
  :	:
  (0, 7646)	1
  (0, 24547)	1
  (0, 24415)	1
  (0, 13359)	1
  (0, 20909)	1
  (0, 17235)	1
  (0, 24151)	1
  (0, 13158)	1
  (0, 24626)	1
  (0, 17217)	1
  (0, 8438)	1
  (0, 21686)	2
  (0, 5650)	3
  (0, 10713)	1
  (0, 3233)	1
  (0, 21382)	1
  (0, 23137)	7
  (0, 24461)	1
  (0, 22345)	1
  (0, 23381)	2
  (0, 4762)	2
  (0, 10341)	1
  (0, 17170)	1
  (0, 10501)	2
  (0, 10827)	2


In [21]:
# Map column indices back to tokens and report the counts for the first post.
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# the installed version provides it, otherwise fall back to the old method.
# .tolist() keeps word_list a plain list of str in both cases.
word_list = (count_vect.get_feature_names_out().tolist()
             if hasattr(count_vect, 'get_feature_names_out')
             else count_vect.get_feature_names())
for n in word_count[0].indices:
    print('word "%s" appears %i times' % (word_list[n], word_count[0, n]))

word "fred" appears 1 times
word "twilight" appears 1 times
word "evening" appears 1 times
word "in" appears 1 times
word "presence" appears 1 times
word "its" appears 1 times
word "blare" appears 1 times
word "freely" appears 1 times
word "may" appears 1 times
word "god" appears 1 times
word "blessed" appears 1 times
word "is" appears 1 times
word "profiting" appears 1 times
word "right" appears 1 times
word "priesthood" appears 1 times
word "and" appears 2 times
word "farming" appears 1 times
word "caste" appears 3 times
word "warrior" appears 1 times
word "practiced" appears 1 times
word "those" appears 1 times
word "than" appears 1 times
word "activities" appears 1 times
word "human" appears 1 times
word "more" appears 1 times
word "are" appears 1 times
word "there" appears 1 times
word "that" appears 1 times
word "remember" appears 1 times
word "to" appears 1 times
word "try" appears 1 times
word "please" appears 1 times
word "age" appears 1 times
word "bronze" appears 1 times
wor

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Term frequencies only: idf weighting is disabled and each row is L1-normalised,
# so a document's values sum to 1.
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# available, otherwise fall back. .tolist() keeps word_list a plain list of str.
word_list = (tf_vect.get_feature_names_out().tolist()
             if hasattr(tf_vect, 'get_feature_names_out')
             else tf_vect.get_feature_names())
for n in word_freq[0].indices:
    print('word "%s" has frequency %0.3f' % (word_list[n], word_freq[0, n]))

word "fred" has frequency 0.011
word "twilight" has frequency 0.011
word "evening" has frequency 0.011
word "in" has frequency 0.011
word "presence" has frequency 0.011
word "its" has frequency 0.011
word "blare" has frequency 0.011
word "freely" has frequency 0.011
word "may" has frequency 0.011
word "god" has frequency 0.011
word "blessed" has frequency 0.011
word "is" has frequency 0.011
word "profiting" has frequency 0.011
word "right" has frequency 0.011
word "priesthood" has frequency 0.011
word "and" has frequency 0.022
word "farming" has frequency 0.011
word "caste" has frequency 0.033
word "warrior" has frequency 0.011
word "practiced" has frequency 0.011
word "those" has frequency 0.011
word "than" has frequency 0.011
word "activities" has frequency 0.011
word "human" has frequency 0.011
word "more" has frequency 0.011
word "are" has frequency 0.011
word "there" has frequency 0.011
word "that" has frequency 0.011
word "remember" has frequency 0.011
word "to" has frequency 0.0

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Full tf-idf weighting with the vectorizer's default settings.
tfidf_vect = TfidfVectorizer()
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# available, otherwise fall back. .tolist() keeps word_list a plain list of str.
word_list = (tfidf_vect.get_feature_names_out().tolist()
             if hasattr(tfidf_vect, 'get_feature_names_out')
             else tfidf_vect.get_feature_names())
# Iterate over the non-zero entries of the tf-idf row itself; the original
# borrowed the index list of `word_freq`, a matrix built in a different cell,
# which only worked while that variable was still alive in the kernel.
for n in word_tfidf[0].indices:
    print('word "%s" has tf-idf %0.3f' % (word_list[n], word_tfidf[0, n]))

word "fred" has tf-idf 0.089
word "twilight" has tf-idf 0.139
word "evening" has tf-idf 0.113
word "in" has tf-idf 0.024
word "presence" has tf-idf 0.119
word "its" has tf-idf 0.061
word "blare" has tf-idf 0.150
word "freely" has tf-idf 0.119
word "may" has tf-idf 0.054
word "god" has tf-idf 0.119
word "blessed" has tf-idf 0.150
word "is" has tf-idf 0.026
word "profiting" has tf-idf 0.150
word "right" has tf-idf 0.068
word "priesthood" has tf-idf 0.144
word "and" has tf-idf 0.049
word "farming" has tf-idf 0.144
word "caste" has tf-idf 0.433
word "warrior" has tf-idf 0.144
word "practiced" has tf-idf 0.132
word "those" has tf-idf 0.060
word "than" has tf-idf 0.052
word "activities" has tf-idf 0.091
word "human" has tf-idf 0.084
word "more" has tf-idf 0.046
word "are" has tf-idf 0.035
word "there" has tf-idf 0.039
word "that" has tf-idf 0.027
word "remember" has tf-idf 0.077
word "to" has tf-idf 0.023
word "try" has tf-idf 0.073
word "please" has tf-idf 0.071
word "age" has tf-idf 0.092


#### N-grams

In [29]:
# Two tiny documents that share the phrase "data science".
text_1, text_2 = "we love data science", "data science is hard"
documents = [text_1, text_2]

In [30]:
# Show the toy corpus.
documents

['we love data science', 'data science is hard']

In [34]:
# Unigrams only: every vocabulary entry is a single token.
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1), stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# available, otherwise fall back. .tolist() keeps word_list a plain list of str
# so the printed vocabulary looks the same on every version.
word_list = (count_vect_1_grams.get_feature_names_out().tolist()
             if hasattr(count_vect_1_grams, 'get_feature_names_out')
             else count_vect_1_grams.get_feature_names())
print('word list =', word_list)
print('text_1 is described with', [word_list[n] + '(' + str(word_count[0, n]) + ')' for n in word_count[0].indices])

word list = ['data', 'hard', 'is', 'love', 'science', 'we']
text_1 is described with ['science(1)', 'data(1)', 'love(1)', 'we(1)']


In [35]:
# Bigrams only: every vocabulary entry is a pair of adjacent tokens.
count_vect_2_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_2_grams.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# available, otherwise fall back. .tolist() keeps word_list a plain list of str
# so the printed vocabulary looks the same on every version.
word_list = (count_vect_2_grams.get_feature_names_out().tolist()
             if hasattr(count_vect_2_grams, 'get_feature_names_out')
             else count_vect_2_grams.get_feature_names())
print('word list =', word_list)
print('text_1 is described with', [word_list[n] + '(' + str(word_count[0, n]) + ')' for n in word_count[0].indices])

word list = ['data science', 'is hard', 'love data', 'science is', 'we love']
text_1 is described with ['data science(1)', 'love data(1)', 'we love(1)']


In [36]:
# Unigrams and bigrams together in one vocabulary.
count_vect_12_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_12_grams.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# available, otherwise fall back. .tolist() keeps word_list a plain list of str
# so the printed vocabulary looks the same on every version.
word_list = (count_vect_12_grams.get_feature_names_out().tolist()
             if hasattr(count_vect_12_grams, 'get_feature_names_out')
             else count_vect_12_grams.get_feature_names())
print('word list =', word_list)
print('text_1 is described with', [word_list[n] + '(' + str(word_count[0, n]) + ')' for n in word_count[0].indices])

word list = ['data', 'data science', 'hard', 'is', 'is hard', 'love', 'love data', 'science', 'science is', 'we', 'we love']
text_1 is described with ['data science(1)', 'love data(1)', 'we love(1)', 'science(1)', 'data(1)', 'love(1)', 'we(1)']


In [37]:
from sklearn.feature_extraction.text import HashingVectorizer
# Hashing trick: tokens are hashed straight into a fixed number of columns,
# so no vocabulary has to be learned and a plain transform() is enough.
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.transform(twenty_sci_news.data)
word_hashed.shape

(1187, 1000)

#### Scraping the web

In [48]:
import urllib.request

In [49]:
# Page to scrape: the English Wikipedia article on William Shakespeare.
url = 'https://en.wikipedia.org/wiki/William_Shakespeare'

In [50]:
# Build the request object for the target page (nothing is sent yet).
request = urllib.request.Request(url=url)

In [51]:
# Download the page inside a context manager so the HTTP connection is always
# released, even if reading fails. `response` ends up holding the raw HTML
# bytes rather than a one-shot stream, so any cell that parses it can be
# re-run without fetching the page again.
with urllib.request.urlopen(request) as http_response:
    response = http_response.read()

In [52]:
from bs4 import BeautifulSoup

In [53]:
# Parse the downloaded page using Python's built-in HTML parser backend.
soup = BeautifulSoup(markup=response, features='html.parser')

In [54]:
# Sanity check: the <title> element of the parsed page.
soup.title

<title>William Shakespeare - Wikipedia</title>

In [55]:
# List the article's categories: take the first element with the category-links
# id, skip its first anchor, and print title -> href for each remaining anchor.
section = soup.find_all(id='mw-normal-catlinks')[0]
anchors = section.find_all('a')
for catlink in anchors[1:]:
    title, href = catlink.get('title'), catlink.get('href')
    print(title, '->', href)

Category:Sonnets by William Shakespeare -> /wiki/Category:Sonnets_by_William_Shakespeare
Category:William Shakespeare -> /wiki/Category:William_Shakespeare
Category:1564 births -> /wiki/Category:1564_births
Category:1616 deaths -> /wiki/Category:1616_deaths
Category:16th-century English male actors -> /wiki/Category:16th-century_English_male_actors
Category:English male stage actors -> /wiki/Category:English_male_stage_actors
Category:16th-century English writers -> /wiki/Category:16th-century_English_writers
Category:17th-century English writers -> /wiki/Category:17th-century_English_writers
Category:16th-century dramatists and playwrights -> /wiki/Category:16th-century_dramatists_and_playwrights
Category:17th-century English dramatists and playwrights -> /wiki/Category:17th-century_English_dramatists_and_playwrights
Category:16th-century English poets -> /wiki/Category:16th-century_English_poets
Category:Burials in Warwickshire -> /wiki/Category:Burials_in_Warwickshire
Category:Peopl