In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

# Read the telecom churn CSV (path is relative to the notebook's working
# directory) into a DataFrame.
churn_df = pd.read_csv(filepath_or_buffer='../resources/telecom_churn_clean.csv')

## scikit-learn requirements
- Numeric data
- No missing values
- With real-world data:
    - this is rarely the case
    - we often need to preprocess the data first

### Dealing with categorical features
- scikit-learn will not accept categorical features by default
- Need to convert categorical features into numeric values
- Convert to binary features called dummy variables
    - 0: Observation was not that category
    - 1: Observation was that category

#### Creating dummy variables
Being able to include categorical features in the model building process can enhance performance as they may add information that contributes to prediction accuracy.

In [26]:
# Read the music CSV (path is relative to the notebook's working directory).
music = pd.read_csv('../resources/music_clean.csv')

# NOTE: plain assignment, not a copy -- `music_df` and `music` refer to the same
# DataFrame object, so the `genre` column written below is visible via both names.
music_df = music

# Genre labels to draw from when synthesising the categorical column.
categories = ['Jazz', 'Rap', 'Electronic', 'Rock', 'Classical', 'Blues', 'Anime', 'Country', 'Alternative', 'Hip-Hop']

# Set `genre` to labels drawn at random from `categories`, so the column holds
# several distinct values to turn into dummy variables.
# (Per the original note, the file's own genre column reportedly holds a single
# category -- TODO confirm against the CSV.)
# A dedicated, seeded generator is used instead of the unseeded global
# `np.random` state so the assigned genres are identical on every
# Restart & Run All.
genre_rng = np.random.default_rng(42)
music_df['genre'] = genre_rng.choice(categories, size=len(music_df))
print(music_df.head())


   Unnamed: 0  popularity  acousticness  danceability  duration_ms  energy  \
0       36506        60.0      0.896000         0.726     214547.0   0.177   
1       37591        63.0      0.003840         0.635     190448.0   0.908   
2       37658        59.0      0.000075         0.352     456320.0   0.956   
3       36060        54.0      0.945000         0.488     352280.0   0.326   
4       35710        55.0      0.245000         0.667     273693.0   0.647   

   instrumentalness  liveness  loudness  speechiness    tempo  valence  \
0          0.000002    0.1160   -14.824       0.0353   92.934    0.618   
1          0.083400    0.2390    -4.795       0.0563  110.012    0.637   
2          0.020300    0.1250    -3.634       0.1490  122.897    0.228   
3          0.015700    0.1190   -12.020       0.0328  106.063    0.323   
4          0.000297    0.0633    -7.787       0.0487  143.995    0.300   

     genre  
0    Anime  
1     Jazz  
2    Blues  
3     Rock  
4  Hip-Hop  


In [27]:
# One-hot encode music_df; drop_first=True omits the first dummy column of each
# encoded feature.
music_dummies = pd.get_dummies(data=music_df, drop_first=True)

# Report the encoded frame's dimensions, then preview its first five rows.
print(f"Shape of music_dummies: {music_dummies.shape}")
print(music_dummies.head(5))

Shape of music_dummies: (1000, 21)
   Unnamed: 0  popularity  acousticness  danceability  duration_ms  energy  \
0       36506        60.0      0.896000         0.726     214547.0   0.177   
1       37591        63.0      0.003840         0.635     190448.0   0.908   
2       37658        59.0      0.000075         0.352     456320.0   0.956   
3       36060        54.0      0.945000         0.488     352280.0   0.326   
4       35710        55.0      0.245000         0.667     273693.0   0.647   

   instrumentalness  liveness  loudness  speechiness  ...  valence  \
0          0.000002    0.1160   -14.824       0.0353  ...    0.618   
1          0.083400    0.2390    -4.795       0.0563  ...    0.637   
2          0.020300    0.1250    -3.634       0.1490  ...    0.228   
3          0.015700    0.1190   -12.020       0.0328  ...    0.323   
4          0.000297    0.0633    -7.787       0.0487  ...    0.300   

   genre_Anime  genre_Blues  genre_Classical  genre_Country  genre_Electron

In [24]:
# Bare expression: in a notebook this renders music_df as the cell's output.
# NOTE(review): this cell's execution count (24) is lower than the cells above
# it (25-27), and the genre values in its saved output differ from the head()
# printed earlier -- the output presumably comes from an earlier run of the
# random genre assignment. Re-run with Restart & Run All to refresh it.
music_df

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,36506,60.0,0.896000,0.726,214547.0,0.1770,0.000002,0.1160,-14.824,0.0353,92.934,0.6180,Hip-Hop
1,37591,63.0,0.003840,0.635,190448.0,0.9080,0.083400,0.2390,-4.795,0.0563,110.012,0.6370,Electronic
2,37658,59.0,0.000075,0.352,456320.0,0.9560,0.020300,0.1250,-3.634,0.1490,122.897,0.2280,Country
3,36060,54.0,0.945000,0.488,352280.0,0.3260,0.015700,0.1190,-12.020,0.0328,106.063,0.3230,Blues
4,35710,55.0,0.245000,0.667,273693.0,0.6470,0.000297,0.0633,-7.787,0.0487,143.995,0.3000,Country
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44501,57.0,0.972000,0.193,208040.0,0.0329,0.929000,0.0978,-28.228,0.0460,82.165,0.0366,Classical
996,25114,56.0,0.005790,0.939,144453.0,0.3730,0.000000,0.2740,-7.779,0.2270,119.953,0.0602,Rock
997,46896,54.0,0.016100,0.739,238339.0,0.5390,0.000000,0.2350,-9.735,0.3370,85.082,0.8350,Country
998,45135,62.0,0.326000,0.515,286707.0,0.5050,0.000000,0.1020,-5.606,0.0294,150.063,0.5380,Hip-Hop
