In [2]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np

In [3]:
#load dataset
music_df=pd.read_csv('music_genre.csv')
music_df=music_df[:10000]
music_df.head()
music_df.__len__

<bound method DataFrame.__len__ of       instance_id           artist_name            track_name  popularity  \
0         32894.0              Röyksopp  Röyksopp's Night Out        27.0   
1         46652.0  Thievery Corporation      The Shining Path        31.0   
2         30097.0        Dillon Francis             Hurricane        28.0   
3         62177.0              Dubloadz                 Nitro        34.0   
4         24907.0           What So Not      Divide & Conquer        32.0   
...           ...                   ...                   ...         ...   
9995      52940.0     POLKADOT STINGRAY                   有頂天        39.0   
9996      34233.0              flumpool      Hoshi Ni Negaiwo        33.0   
9997      78473.0          Kevin Penkin                Mirage        21.0   
9998      87518.0       BUMP OF CHICKEN                 ハルジオン        44.0   
9999      41509.0            Hachioji P  Yeah! Yeah!! Yeah!!!        14.0   

      acousticness  danceability  durati


Got It!
1. Preprocessing data
Welcome to the final chapter of the course!

2. scikit-learn requirements
Recall that scikit-learn requires numeric data, with no missing values. All the data that we have used so far has been in this format. However, with real-world data, this will rarely be the case, and instead we need to preprocess our data before we can build models.

3. Dealing with categorical features
Say we have a dataset containing categorical features, such as color. As these are not numeric, scikit-learn will not accept them and we need to convert them into numeric features. We achieve this by splitting the feature into multiple binary features called dummy variables, one for each category. Zero means the observation was not that category, while one means it was.

4. Dummy variables
Say we are working with a music dataset that has a genre feature with ten values such as Electronic, Hip-Hop, and Rock.

5. Dummy variables
We create binary features for each genre. As each song has one genre, each row will have a 1 in one of the ten columns and zeros in the rest. If a song is not any of the first nine genres, then implicitly, it is a rock song. That means we only need nine features, so we can

6. Dummy variables
delete the Rock column. If we do not do this, we are duplicating information, which might be an issue for some models.

7. Dealing with categorical features in Python
To create dummy variables we can use scikit-learn's OneHotEncoder, or pandas' get_dummies. We will use get_dummies.

8. Music dataset
We will be working with a music dataset in this chapter, for both classification and regression problems. Initially, we will build a regression model using all features in the dataset to predict song popularity. There is one categorical feature, genre, with ten possible values.

9. EDA w/ categorical feature
This box plot shows how popularity varies by genre. Let's encode this feature using dummy variables.

10. Encoding dummy variables
We import pandas, read in the DataFrame, and call pd-dot-get_dummies, passing the categorical column. As we only need to keep nine out of our ten binary features, we can set the drop_first argument to True. Printing the first five rows, we see pandas creates nine new binary features. The first song is Jazz, and the second is Rap, indicated by a 1 in the respective columns. To bring these binary features back into our original DataFrame we can use pd-dot-concat, passing a list containing the music DataFrame and our dummies DataFrame, and setting axis equal to one. Lastly, we can remove the original genre column using df-dot-drop, passing the column, and setting axis equal to one.

11. Encoding dummy variables
If the DataFrame only has one categorical feature, we can pass the entire DataFrame, thus skipping the step of combining variables. If we don't specify a column, the new DataFrame's binary columns will have the original feature name prefixed, so they will start with genre-underscore - as shown here. Notice the original genre column is automatically dropped. Once we have dummy variables, we can fit models as before.

12. Linear regression with dummy variables
Using the music_dummies DataFrame, the process for creating training and test sets remains unchanged. To perform cross-validation we then create a KFold object, instantiate a linear regression model, and call cross_val_score. We set scoring equal to neg_mean_squared_error, which returns the negative MSE. This is because scikit-learn's cross-validation metrics presume a higher score is better, so MSE is changed to negative to counteract this. We can calculate the training RMSE by taking the square root and converting to positive, achieved by calling numpy-dot-square-root and passing our scores with a minus sign in front.

13. Let's practice!
Now let's practice working with categorical features.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import KFold
# Create music_dummies
music_dummies = pd.get_dummies(music_df,drop_first=True)
# Print the new DataFrame's shape
print("Shape of music_dummies: {}".format(music_dummies.shape))
# Create X and y
X = music_dummies.drop('popularity',axis=1).values
y = music_dummies['popularity'].values
kf=KFold(n_splits=6,shuffle=True)
# Instantiate a ridge model
ridge = Ridge(alpha=.2)

# Perform cross-validation
scores = cross_val_score(ridge, X, y, cv=kf, scoring="neg_mean_squared_error")

# Calculate RMSE
rmse = np.sqrt(-scores)
print("Average RMSE: {}".format(np.mean(rmse)))
print("Standard Deviation of the target array: {}".format(np.std(y)))

In [6]:
music_df.isna().sum()

instance_id         0
artist_name         0
track_name          0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
obtained_date       0
valence             0
music_genre         0
dtype: int64


Got It!
1. Handling missing data
Now let's look at how to handle missing data.

2. Missing data
When there is no value for a feature in a particular row, we call it missing data. This can happen because there was no observation or the data might be corrupt. Whatever the reason, we need to deal with it.

3. Music dataset
Previously we worked with a modified music dataset. Now let's inspect the original version, which contains one thousand rows. We do this by chaining pandas' dot-isna with dot-sum and dot-sort_values. Each feature is missing between 8 and 200 values!

4. Dropping missing data
A common approach is to remove missing observations accounting for less than 5% of all data. To do this, we use pandas' dot-dropna method, passing a list of columns with less than 5% missing values to the subset argument. If there are missing values in our subset column, the entire row is removed. Rechecking the DataFrame, we see fewer missing values.

5. Imputing values
Another option is to impute missing data. This means making an educated guess as to what the missing values could be. We can impute the mean of all non-missing entries for a given feature. We can also use other values like the median. For categorical values we commonly impute the most frequent value. Note we must split our data before imputing to avoid leaking test set information to our model, a concept known as data leakage.

6. Imputation with scikit-learn
Here is a workflow for imputation to predict song popularity. We import SimpleImputer from sklearn-dot-impute. As we will use different imputation methods for categorical and numeric features, we first split them, storing as X_cat and X_num respectively, along with our target array as y. We create categorical training and test sets. We repeat this for the numeric features. By using the same value for the random_state argument, the target arrays' values remain unchanged. To impute missing categorical values we instantiate a SimpleImputer, setting strategy as most frequent. By default, SimpleImputer expects NumPy-dot-NaN to represent missing values. Now we call dot-fit_transform to impute the training categorical features' missing values! For the test categorical features, we call dot-transform.

7. Imputation with scikit-learn
For our numeric data, we instantiate another imputer. By default, it fills values with the mean. We fit and transform the training features, and transform the test features. We then combine our training data using numpy-dot-append, passing our two arrays, and set axis equal to 1. We repeat this for our test data. Due to their ability to transform our data, imputers are known as transformers.

8. Imputing within a pipeline
We can also impute using a pipeline, which is an object used to run a series of transformations and build a model in a single workflow. To do this, we import Pipeline from sklearn-dot-pipeline. Here we perform binary classification to predict whether a song is rock or another genre. We drop missing values accounting for less than five percent of our data. We convert values in the genre column, which will be the target, to a 1 if Rock, else 0, using numpy-dot-where. We then create X and y.

9. Imputing within a pipeline
To build a pipeline we construct a list of steps containing tuples with the step names specified as strings, and instantiate the transformer or model. We pass this list when instantiating a Pipeline. We then split our data, and fit the pipeline to the training data, as with any other model. Finally, we compute accuracy. Note that, in a pipeline, each step but the last must be a transformer.

10. Let's practice!
Now let's create a pipeline to handle missing data and build a model!

In [None]:
# Print missing values for each column
print(music_df.isna().sum().sort_values())

# Remove values where less than 5% are missing
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])

# Convert genre to a binary feature
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

print(music_df.isna().sum().sort_values())
print("Shape of the `music_df`: {}".format(music_df.shape))

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
X_cat = music_df["genre"].values.reshape(-1, 1)
X_num = music_df.drop(["genre", "popularity"], axis=1).values
y = music_df["popularity"].values
X_train_cat, X_test_cat, y_train, y_test = train_test_split(X_cat, y, test_size=0.2,random_state=12)    
X_train_num, X_test_num, y_train, y_test = train_test_split(X_num, y, test_size=0.2,random_state=12)
imp_cat = SimpleImputer(strategy="most_frequent")
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

imp_num = SimpleImputer()
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)
X_train = np.append(X_train_num, X_train_cat, axis=1)
X_test = np.append(X_test_num, X_test_cat, axis=1)
#Imputersareknownastransformers
#IMPUTING IN PYTHON

In [8]:
# with a pipeline 
# Import modules
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
# Instantiate an imputer
imputer = SimpleImputer()

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer), 
         ("knn", knn)]

In [None]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=.3,random_state=21)
imp_mean=SimpleImputer(strategy='mean')
from sklearn.metrics import confusion_matrix
steps = [("imputer", imp_mean),
        ("knn", knn)]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training data
pipeline.fit(X_train,y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix
print(confusion_matrix(y_test,y_pred))


Got It!
1. Centering and scaling
Data imputation is one of several important preprocessing steps for machine learning. Now let's cover another: centering and scaling our data.

2. Why scale our data?
Let's use df-dot-describe to check out the ranges of some of our feature variables in the music dataset. We see that the ranges vary widely: duration_ms ranges from zero to one-point-six-two million, speechiness contains only decimal places, and loudness only has negative values!

3. Why scale our data?
Many machine learning models use some form of distance to inform them, so if we have features on far larger scales, they can disproportionately influence our model. For example, KNN uses distance explicitly when making predictions. For this reason, we actually want features to be on a similar scale. To achieve this, we can normalize or standardize our data, often referred to as scaling and centering.

4. How to scale our data
There are several ways to scale our data: given any column, we can subtract the mean and divide by the variance so that all features are centered around zero and have a variance of one. This is called standardization. We can also subtract the minimum and divide by the range of the data so the normalized dataset has minimum zero and maximum one. Or, we can center our data so that it ranges from -1 to 1 instead. In this video, we will perform standardization, but scikit-learn has functions available for other types of scaling.

5. Scaling in scikit-learn
To scale our features, we import StandardScaler from sklearn-dot-preprocessing. We create our feature and target arrays. Before scaling, we split our data to avoid data leakage. We then instantiate a StandardScaler object, and call its fit_transform method, passing our training features. Next, we use scaler-dot-transform on the test features. Looking at the mean and standard deviation of the columns of both the original and scaled data verifies the change has taken place.

6. Scaling in a pipeline
We can also put a scaler in a pipeline! Here we build a pipeline object to scale our data and use a KNN model with six neighbors. We then split our data, fit the pipeline to our training set, and predict on our test set. Computing the accuracy yields a result of zero-point-eight-one. Let's compare this to using unscaled data.

7. Comparing performance using unscaled data
Here we fit a KNN model to our unscaled training data and print the accuracy. It is only zero-point-five-three, so just by scaling our data we improved accuracy by over 50 percent!

8. CV and scaling in a pipeline
Let's also look at how we can use cross-validation with a pipeline. We first build our pipeline. We then specify our hyperparameter space by creating a dictionary: the keys are the pipeline step name followed by a double underscore, followed by the hyperparameter name. The corresponding value is a list or an array of the values to try for that particular hyperparameter. In this case, we are tuning n_neighbors in the KNN model. Next we split our data into training and test sets. We then perform a grid search over our parameters by instantiating the GridSearchCV object, passing our pipeline and setting the param_grid argument equal to parameters. We then fit it to our training data. Lastly, we make predictions using our test set.

9. Checking model parameters
Printing GridSearchCV's best_score_ attribute, we see the score is very slightly better than our previous model's performance. Printing the best parameters, the optimal model has 12 neighbors.

10. Let's practice!
Now let's incorporate scaling into our supervised learning workflow!

In [1]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler  

# Create pipeline steps
steps = [("scaler", StandardScaler()),
         ("lasso", Lasso(alpha=.5))]

# Instantiate the pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Calculate and print R-squared
print(pipeline.score(X_test, y_test))

NameError: name 'Lasso' is not defined

In [None]:
# Build the steps

steps = [("scaler", StandardScaler()),
         ("logreg", LogisticRegression())]
pipeline = Pipeline(steps)

# Create the parameter space
parameters = {"logreg__C": np.linspace(0.001, 1.0, 20)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=21)

# Instantiate the grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit to the training data
cv.fit(X_train, y_train)
print(cv.best_score_, "\n", cv.best_params_)

#estimator.get_params().keys()


Got It!
1. Evaluating multiple models
We've covered all parts of the supervised learning workflow. But how do we decide which model to use in the first place?

2. Different models for different problems
This is a complex question, and the answer depends on our situation. However, there are some principles that can guide us when making this decision. The size of our dataset plays a role. Fewer features means a simpler model, and can reduce training time. Also, some models, such as Artificial Neural Networks, require a lot of data to perform well. We may need an interpretable model, so we can explain to stakeholders how predictions were made. An example is linear regression, where we can calculate and interpret the model coefficients. Alternatively, flexibility might be important to get the most accurate predictions. Generally, flexible models make fewer assumptions about the data; for example, a KNN model does not assume a linear relationship between the features and the target.

3. It's all in the metrics
Notice that scikit-learn allows the same methods to be used for most models. This makes it easy to compare them! Regression models can be evaluated using the root mean squared error, or the R-squared value. Likewise, classification models can all be analyzed using accuracy, a confusion matrix and its associated metrics, or the ROC AUC. Therefore, one approach is to select several models and a metric, then evaluate their performance without any form of hyperparameter tuning.

4. A note on scaling
Recall that the performance of some models, such as KNN, linear regression, and logistic regression, are affected by scaling our data. Therefore, it is generally best to scale our data before evaluating models out of the box.

5. Evaluating classification models
We will evaluate three models for binary classification of song genre: KNN, logistic regression, and a new model called a decision tree classifier. We import our required modules, including DecisionTreeClassifier from sklearn-dot-tree. The workings of decision trees are outside the scope of this course, but the steps for building this model are the same as for other models in scikit-learn. As usual, we create our feature and target arrays, then split our data. We then scale our features using the scaler's dot-fit_transform method on the training set, and the dot-transform method on the test set.

6. Evaluating classification models
We create a dictionary with our model names as strings for the keys, and instantiate models as the dictionary's values. We also create an empty list to store the results. Now we loop through the models in our models dictionary, using its dot-values method. Inside the loop, we instantiate a KFold object. Next we perform cross-validation, using the model being iterated, along with our scaled training features, and target training array. We set cv equal to our kfold variable. By default, the scoring here will be accuracy. We then append the cross-validation results to our results list. Lastly, outside of the loop, we create a boxplot of our results, and set the labels argument equal to a call of models-dot-keys to retrieve each model's name.

7. Visualizing results
The output shows us the range of cross-validation accuracy scores. We can also see each model's median cross-validation score, represented by the orange line in each box. We can see logistic regression has the best median score.

8. Test set performance
To evaluate on the test set we loop through the names and values of the dictionary using the dot-items method. Inside the loop we fit the model, calculate accuracy, and print it. Logistic regression performs best for this problem if we are using accuracy as the metric.

9. Let's practice!
Now let's choose which models to optimize for our supervised learning problems.



In [None]:
models = {"Linear Regression": LinearRegression(), "Ridge": Ridge(alpha=0.1), "Lasso": Lasso(alpha=0.1)}
results = []

# Loop through the models' values
for model in models.values():
  kf = KFold(n_splits=6, random_state=42, shuffle=True)
  
  # Perform cross-validation
  cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
  
  # Append the results
  results.append(cv_scores)

# Create a box plot of the results
plt.boxplot(results, labels=models.keys())
plt.show()

In [None]:
# Import mean_squared_error
from sklearn.metrics import mean_squared_error

for name, model in models.items():
  
  # Fit the model to the training data
  model.fit(X_train_scaled,y_train)
  
  # Make predictions on the test set
  y_pred = model.predict(X_test_scaled)
  
  # Calculate the test_rmse
  test_rmse = mean_squared_error(y_test, y_pred, squared=False)
  print("{} Test Set RMSE: {}".format(name, test_rmse))

In [None]:
# Create models dictionary
models = {"Logistic Regression": LogisticRegression(), "KNN": KNeighborsClassifier(), "Decision Tree Classifier": DecisionTreeClassifier()}
results = []

# Loop through the models' values
for model in models.values():
  
  # Instantiate a KFold object
  kf = KFold(n_splits=6, random_state=12, shuffle=True)
  
  # Perform cross-validation
  cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
  results.append(cv_results)
plt.boxplot(results, labels=models.keys())
plt.show()