# Load a dataset from sklearn

In [0]:
from sklearn.datasets import load_iris
# Load the Iris data bundled with scikit-learn.
iris_dataset = load_iris()

# Show the description text that ships with the dataset.
dataset_description = iris_dataset['DESCR']
print(dataset_description)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

# Explore dataset

In [0]:
# Pull the feature array and the label array out of the dataset object.
features = iris_dataset.data
labels = iris_dataset.target

# Names of the feature columns and of the classes.
print('feature names', iris_dataset.feature_names)
print('label names', iris_dataset.target_names)

# Dimensions of each array.
print('feature shape', features.shape)
print('label shape', labels.shape)

# Peek at the first few rows and their labels.
preview_count = 5

print('the features of the first 5 instances:')
print(features[:preview_count])

print('the labels of the first 5 instances:')
print(labels[:preview_count])

feature names ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
label names ['setosa' 'versicolor' 'virginica']
feature shape (150, 4)
label shape (150,)
the features of the first 5 instances:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
the labels of the first 5 instances:
[0 0 0 0 0]


# Train-valid-test split

In [0]:
from sklearn.model_selection import train_test_split

# Intended split: 70% train / 10% validation / 20% test.
# The validation and test sizes are rounded down; the training set takes
# whatever is left, so the three sizes always add up to the full dataset.
total_size = len(features)
valid_set_size = int(total_size * 0.1)
test_set_size = int(total_size * 0.2)
train_set_size = total_size - valid_set_size - test_set_size

# Carve off the test set first, then split the remainder into train/validation.
# An integer test_size is an absolute number of samples; random_state fixes the shuffle.
X_trainvalid, X_test, y_trainvalid, y_test = train_test_split(features, labels, test_size=test_set_size, random_state=11)
X_train, X_valid, y_train, y_valid = train_test_split(X_trainvalid, y_trainvalid, test_size=valid_set_size, random_state=11)

# Sanity check: the actual split sizes must match the intended ones.
assert (len(X_train), len(X_valid), len(X_test)) == (train_set_size, valid_set_size, test_set_size), \
    'unexpected train/valid/test sizes'

print('training set size', len(X_train))
print('valid set size', len(X_valid))
print('test set size', len(X_test))



training set size 105
valid set size 15
test set size 30


# Training and testing

Recall that we use the training set for training the model.

We use the validation set for tuning the hyperparameters.

When the model is finalized, we apply it to the test set and evaluate its performance.

In [0]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

# RidgeClassifier is just one possible model; other models are covered in the following weeks.

# Build the classifier with a fixed seed and show its configuration.
model = RidgeClassifier(random_state=11)
print(model)

# Fit on the training split only.
model.fit(X_train, y_train)

# Accuracy on the validation split.
y_predict_valid = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_predict_valid)
print('accuracy in the validation set', valid_accuracy)

# Accuracy on the held-out test split.
y_predict_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_predict_test)
print('accuracy in the test set', test_accuracy)


RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=11, solver='auto',
                tol=0.001)
accuracy in the validation set 0.8666666666666667
accuracy in the test set 0.6666666666666666


The accuracy on the test set is significantly lower than the accuracy on the validation set. What should we do then?

Common approaches are (1) tuning hyperparameters, (2) adding regularization, (3) analyzing errors, and (4) switching to a different model.
We will go through these in much more depth in the following weeks.

# Bag-of-words models

In [0]:
# A tiny toy corpus: one short "document" per string.
docs = [
    'this is a general news',
    'this is a sports news',
    'this is another news on sports',
    'these news examples are so boring',
]

# Print every document on its own line.
print(*docs, sep='\n')

this is a general news
this is a sports news
this is another news on sports
these news examples are so boring


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

#you can do preprocessing first and then pass the processed texts to CountVectorizer

# Count how often each non-stopword term occurs in each document.
freq_vectorizer = CountVectorizer(stop_words='english')
freq_vectors = freq_vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method on older releases.
if hasattr(freq_vectorizer, 'get_feature_names_out'):
    # tolist() turns the returned array into a plain list of str
    freq_feature_names = freq_vectorizer.get_feature_names_out().tolist()
else:
    freq_feature_names = freq_vectorizer.get_feature_names()

# Convert the sparse count matrix to a dense array once and reuse it below.
freq_counts = freq_vectors.toarray()

print(freq_feature_names)
print(freq_counts)

# Same counts as a labelled table: one row per document, one column per term.
freqs = pd.DataFrame(data=freq_counts, columns=freq_feature_names)

print(freqs)

['boring', 'examples', 'general', 'news', 'sports']
[[0 0 1 1 0]
 [0 0 0 1 1]
 [0 0 0 1 1]
 [1 1 0 1 0]]
   boring  examples  general  news  sports
0       0         0        1     1       0
1       0         0        0     1       1
2       0         0        0     1       1
3       1         1        0     1       0


The word 'news' is not a stopword, but it occurs in every document in the collection. Is it informative at all?

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#note that we often use normalization in this step. Here I set norm to None for demonstration purposes
#there are many variations of tf-idf, all of which are reasonable in practice.
#please see the spec in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
tfidf_vectorizer = TfidfVectorizer(stop_words='english', norm=None)
tfidf_vectors = tfidf_vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # tolist() turns the returned array into a plain list of str
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# One row per document, one column per term, holding the tf-idf weights.
tfidfs = pd.DataFrame(data=tfidf_vectors.toarray(), columns=tfidf_feature_names)

print(tfidfs)

     boring  examples   general  news    sports
0  0.000000  0.000000  1.916291   1.0  0.000000
1  0.000000  0.000000  0.000000   1.0  1.510826
2  0.000000  0.000000  0.000000   1.0  1.510826
3  1.916291  1.916291  0.000000   1.0  0.000000
