In [0]:
# https://www.datacamp.com/courses/preprocessing-for-machine-learning-in-python

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

**Course Description**

This course covers the basics of how and when to perform data preprocessing. This essential step in any machine learning project is when you get your data ready for modeling. Between importing and cleaning your data and fitting your machine learning model is when preprocessing comes into play. You'll learn how to standardize your data so that it's in the right form for your model, create new features to best leverage the information in your dataset, and select the best features to improve your model fit. Finally, you'll have some practice preprocessing by getting a dataset on UFO sightings ready for modeling.



## 1. Introduction to Data Preprocessing

In this chapter you'll learn exactly what it means to preprocess data. You'll take the first steps in any preprocessing journey, including exploring data types and dealing with missing data.

### What is data preprocessing?

In [63]:
volunteer = pd.read_csv("volunteer_opportunities.csv")
print("volunteer dimension: ", volunteer.shape) # (665, 35)
volunteer.tail()

volunteer dimension:  (665, 35)


Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
660,5640,50193,3,0,Volunteer for NYLAG's Food Stamps Project,197,"Volunteers needed to file for fair hearings, d...",,2.0,Helping Neighbors in Need,,,New York Legal Assistance Group,104,1,"7 Hanover Square\nNew York, NY 10004\n(40.7043...",NY,10004.0,,/opportunities/5640,ongoing,0,August 16 2011,August 17 2011,August 16 2011,November 15 2012,approved,,,,,,,,
661,5218,38711,10,0,Iridescent Science Studio Open House Volunteers,113,Come out to the South Bronx to help us hold ou...,,1.0,Strengthening Communities,,,Iridescent,38544,1,"890 Garrison Ave\nBronx, NY 10474\n(40.8171141...",NY,10474.0,,/opportunities/5218,onetime,0,March 21 2011,March 21 2011,April 13 2011,April 13 2011,approved,,,,,,,,
662,5541,47820,1,0,French Translator,145,Volunteer needed to translate written material...,,2.0,Helping Neighbors in Need,,,"Services for the UnderServed, Inc.",38951,1,"305 Seventh Avenue\nNew York, NY 10001\n(40.74...",NY,10001.0,,/opportunities/5541,ongoing,0,July 20 2011,August 23 2011,July 20 2011,September 01 2011,approved,,,,,,,,
663,5398,40722,2,0,Marketing & Advertising Volunteer,330,World Cares Center is looking for individuals ...,,1.0,Strengthening Communities,,,World Cares Center,36979,1,"520 8th Ave\nNY, NY 10018\n(40.75376054978079,...",NY,10018.0,,/opportunities/5398,ongoing,0,June 01 2011,August 09 2011,June 01 2011,May 31 2012,approved,,,,,,,,
664,5507,44303,5,0,Volunteer filmmakers to help Mayor's Office wi...,304,"Attention all filmmakers, producers, and edito...",,1.0,Strengthening Communities,,,Mayor's Office of Adult Education,43738,1,"100 Gold Street\nNew York, NY 10038\n(40.71053...",NY,10038.0,,/opportunities/5507,ongoing,0,July 07 2011,July 07 2011,July 07 2011,October 06 2012,approved,,,,,,,,


In [9]:
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())

48


In [95]:
# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)

(617, 37)


In [22]:
volunteer_subset.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,,,Bpeace,37026,1,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,10010.0,,/opportunities/5008,onetime,0,January 14 2011,January 25 2011,February 01 2011,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,,,Street Project,3001,1,,NY,10026.0,,/opportunities/5016,onetime,0,January 19 2011,January 21 2011,January 29 2011,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,,,Oxfam America,2170,1,,NY,2114.0,,/opportunities/5022,ongoing,0,January 21 2011,January 25 2011,February 14 2011,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,10455.0,,/opportunities/5055,onetime,0,January 28 2011,February 01 2011,February 05 2011,February 05 2011,approved,,,,,,,,
5,5056,37426,15,0,Queens Stop 'N' Swap,135,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,11372.0,,/opportunities/5056,onetime,0,January 28 2011,January 28 2011,February 12 2011,February 12 2011,approved,,,,,,,,


### Working with data types

In [7]:
# Print the head of the hits column
print(volunteer["hits"].head())

0    737
1     22
2     62
3     14
4     31
Name: hits, dtype: int64


In [8]:
# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype("int")

# Look at the dtypes of the dataset
print(volunteer.dtypes)

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

In [0]:
from sklearn.model_selection import train_test_split

### Class distribution

the original code has NaN

In [0]:
# Create a data with all columns except category_desc
volunteer_X = volunteer.drop("category_desc", axis=1)

# Create a category_desc labels dataset
volunteer_y = volunteer[["category_desc"]]

In [15]:
volunteer_X[:2]

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,,Center For NYC Neighborhoods,4426,1,,NY,,,/opportunities/4996,onetime,0,January 13 2011,June 23 2011,July 30 2011,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,,,Bpeace,37026,1,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,10010.0,,/opportunities/5008,onetime,0,January 14 2011,January 25 2011,February 01 2011,February 01 2011,approved,,,,,,,,


In [17]:
volunteer_y[:5]

Unnamed: 0,category_desc
0,
1,Strengthening Communities
2,Strengthening Communities
3,Strengthening Communities
4,Environment


In [19]:
# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)

# Print out the category_desc counts on the training y labels
print(y_train["category_desc"].value_counts())

ValueError: ignored

changed to subset df without NaN

In [0]:
# Create a data with all columns except category_desc
volunteer_X = volunteer_subset.drop("category_desc", axis=1)

# Create a category_desc labels dataset
volunteer_y = volunteer_subset[["category_desc"]]

In [24]:
# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)

# Print out the category_desc counts on the training y labels
print(y_train["category_desc"].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


## 2. Standardizing Data

This chapter is all about standardizing data. Often a model will make some assumptions about the distribution or scale of your features. Standardization is a way to make your data fit these assumptions and improve the algorithm's performance.

### Standardizing Data

In [40]:
wine = pd.read_csv("wine_types.csv")
print("wine df dimension: ", wine.shape) # (178, 14)
wine.tail()

wine df dimension:  (178, 14)


Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740
174,3,13.4,3.91,2.48,23.0,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840
177,3,14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560


In [0]:
# match dataset
X_cols = ['Proline', 'Total phenols', 'Hue', 'Nonflavanoid phenols']
X = wine[X_cols]
y = wine['Type']

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
y_train.shape

(133,)

In [22]:
X_train.shape

(133, 4)

In [0]:
knn = KNeighborsClassifier(n_neighbors=5)

In [28]:
# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.6222222222222222


### Log normalization

In [29]:
# Print out the variance of the Proline column
print(wine["Proline"].var())

# Apply the log normalization function to the Proline column
wine["Proline_log"] = np.log(wine["Proline"])

# Check the variance of the Proline column again
print(wine["Proline_log"].var())

99166.71735542428
0.17231366191842018


### Scaling data for feature comparison

In [0]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[["Ash", "Alcalinity of ash", "Magnesium"]]

# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)

In [31]:
wine_subset.head()

Unnamed: 0,Ash,Alcalinity of ash,Magnesium
0,2.43,15.6,127
1,2.14,11.2,100
2,2.67,18.6,101
3,2.5,16.8,113
4,2.87,21.0,118


In [33]:
wine_subset_scaled[:5]

array([[ 0.23205254, -1.16959318,  1.91390522],
       [-0.82799632, -2.49084714,  0.01814502],
       [ 1.10933436, -0.2687382 ,  0.08835836],
       [ 0.4879264 , -0.80925118,  0.93091845],
       [ 1.84040254,  0.45194578,  1.28198515]])

### Standardized data and modeling

In [0]:
# match dataset
y = wine['Type']
X = wine

In [42]:
X.head()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [43]:
del X['Type']
X.shape

(178, 13)

In [44]:
y.shape

(178,)

In [45]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.7111111111111111


In [46]:
# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train, y_train)

# Score the model on the test data.
print(knn.score(X_test, y_test))

0.9777777777777777


## 3. Feature Engineering

In this section you'll learn about feature engineering. You'll explore different ways to create new, more useful, features from the ones already in your dataset. You'll see how to encode, aggregate, and extract information from both numerical and textual features.

In [0]:
import json

with open("hiking.json", "r") as read_file:
  hike = json.load(read_file)
  
len(hike)

In [76]:
hiking = pd.read_json("hiking.json")
print("hiking dataframe dimension: ", hiking.shape)
hiking

hiking dataframe dimension:  (33, 11)


Unnamed: 0,Accessible,Difficulty,Length,Limited_Access,Location,Name,Other_Details,Park_Name,Prop_ID,lat,lon
0,Y,,0.8 miles,N,"Enter behind the Salt Marsh Nature Center, loc...",Salt Marsh Nature Trail,<p>The first half of this mile-long trail foll...,Marine Park,B057,,
1,N,Easy,1.0 mile,N,Enter Park at Lincoln Road and Ocean Avenue en...,Lullwater,Explore the Lullwater to see how nature thrive...,Prospect Park,B073,,
2,N,Easy,0.75 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Midwood,Step back in time with a walk through Brooklyn...,Prospect Park,B073,,
3,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Peninsula,Discover how the Peninsula has changed over th...,Prospect Park,B073,,
4,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Waterfall,Trace the source of the Lake on the Waterfall ...,Prospect Park,B073,,
5,N,Various,Various,N,"Park-wide. Check out our <a href=""/park-featur...",Alley Pond Trails,Numerous trails wind through native hardwood (...,Alley Pond Park,Q001,,
6,N,,1.7 miles,N,"Forest Park Drive East, off of Woodhaven Boule...",Blue Trail,Forest Park's numerous trails wind through nat...,Forest Park,Q015,,
7,N,,2.4 miles,N,Memorial Drive and Forest Park Drive East,Orange Trail,This trail will lead you through a 2.4 mile ad...,Forest Park,Q015,,
8,N,,1.0 mile,N,Metropolitan Avenue & Forest Park Drive East,Yellow Trail,,Forest Park,Q015,,
9,N,,3.0 miles,N,Francis Lewis Boulevard and Union Turnpike,South Preserve Trail,Numerous unnamed trails wind through native ha...,Cunningham Park,Q021,,


In [58]:
hiking.dtypes

Accessible         object
Difficulty         object
Length             object
Limited_Access     object
Location           object
Name               object
Other_Details      object
Park_Name          object
Prop_ID            object
lat               float64
lon               float64
dtype: object

### Encoding categorical variables


In [0]:
from sklearn.preprocessing import LabelEncoder

In [61]:
# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])

# Compare the two columns
print(hiking[["Accessible_enc", "Accessible"]].head())

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N


In [64]:
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,,,Center For NYC Neighborhoods,4426,1,,NY,,,/opportunities/4996,onetime,0,January 13 2011,June 23 2011,July 30 2011,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,,,Bpeace,37026,1,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,10010.0,,/opportunities/5008,onetime,0,January 14 2011,January 25 2011,February 01 2011,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,,,Street Project,3001,1,,NY,10026.0,,/opportunities/5016,onetime,0,January 19 2011,January 21 2011,January 29 2011,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,,,Oxfam America,2170,1,,NY,2114.0,,/opportunities/5022,ongoing,0,January 21 2011,January 25 2011,February 14 2011,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,10455.0,,/opportunities/5055,onetime,0,January 28 2011,February 01 2011,February 05 2011,February 05 2011,approved,,,,,,,,


In [65]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

   Education  ...  Strengthening Communities
0          0  ...                          0
1          0  ...                          1
2          0  ...                          1
3          0  ...                          1
4          0  ...                          0

[5 rows x 6 columns]


In [66]:
# Compare the two columns
print(volunteer["category_desc"].head())

0                          NaN
1    Strengthening Communities
2    Strengthening Communities
3    Strengthening Communities
4                  Environment
Name: category_desc, dtype: object


### Engineering numerical features


In [69]:
import io
S = '''
      name  run1  run2  run3  run4  run5
0      Sue  20.1  18.5  19.6  20.3  18.3
1     Mark  16.5  17.1  16.9  17.6  17.3
2     Sean  23.5  25.1  25.2  24.6  23.9
3     Erin  21.7  21.1  20.9  22.1  22.2
4    Jenny  25.8  27.1  26.1  26.7  26.9
5  Russell  30.9  29.6  31.4  30.4  29.9
'''
running_times_5k = pd.read_table(io.StringIO(S), delim_whitespace=True)
running_times_5k

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,name,run1,run2,run3,run4,run5
0,Sue,20.1,18.5,19.6,20.3,18.3
1,Mark,16.5,17.1,16.9,17.6,17.3
2,Sean,23.5,25.1,25.2,24.6,23.9
3,Erin,21.7,21.1,20.9,22.1,22.2
4,Jenny,25.8,27.1,26.1,26.7,26.9
5,Russell,30.9,29.6,31.4,30.4,29.9


In [70]:
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]

# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)

# Take a look at the results
print(running_times_5k)

      name  run1  run2  run3  run4  run5   mean
0      Sue  20.1  18.5  19.6  20.3  18.3  19.36
1     Mark  16.5  17.1  16.9  17.6  17.3  17.08
2     Sean  23.5  25.1  25.2  24.6  23.9  24.46
3     Erin  21.7  21.1  20.9  22.1  22.2  21.60
4    Jenny  25.8  27.1  26.1  26.7  26.9  26.52
5  Russell  30.9  29.6  31.4  30.4  29.9  30.44


In [71]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)

# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### Text classification


In [0]:
import re

In [0]:
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))

In [85]:
hiking["Length"][0]

'0.8 miles'

In [86]:
return_mileage("0.8 miles")

0.8

In [89]:
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())

TypeError: ignored

In [0]:
# match dataset: Subset the volunteer dataset
volunteer = volunteer[volunteer["category_desc"].notnull()]

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [0]:
# Take the title text
title_text = volunteer["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

In [0]:
nb = GaussianNB()

In [103]:
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.5419354838709678


## 4. Selecting features for modeling

This chapter goes over a few different techniques for selecting the most important features from your dataset. You'll learn how to drop redundant features, work with text vectors, and reduce the number of features in your dataset using principal component analysis (PCA).

In [112]:
volunteer = pd.read_csv("volunteer_opportunities.csv")
print("volunteer dimension: ", volunteer.shape) # (665, 35)
# volunteer.tail()
volunteer = volunteer[volunteer["category_desc"].notnull()]
print("volunteer timmed dimension: ", volunteer.shape) # (617, 35)

volunteer dimension:  (665, 35)
volunteer timmed dimension:  (617, 35)


In [113]:
volunteer.columns

Index(['opportunity_id', 'content_id', 'vol_requests', 'event_time', 'title',
       'hits', 'summary', 'is_priority', 'category_id', 'category_desc',
       'amsl', 'amsl_unit', 'org_title', 'org_content_id', 'addresses_count',
       'locality', 'region', 'postalcode', 'primary_loc', 'display_url',
       'recurrence_type', 'hours', 'created_date', 'last_modified_date',
       'start_date_date', 'end_date_date', 'status', 'Latitude', 'Longitude',
       'Community Board', 'Community Council ', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

### Removing redundant features


In [115]:
# Create a list of redundant column names to drop
to_drop = ["category_desc", "created_date", "locality", "region", "vol_requests"]

# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of the new dataset
print(volunteer_subset.head())

   opportunity_id  content_id  event_time  ... BIN  BBL NTA
1            5008       37036           0  ... NaN  NaN NaN
2            5016       37143           0  ... NaN  NaN NaN
3            5022       37237           0  ... NaN  NaN NaN
4            5055       37425           0  ... NaN  NaN NaN
5            5056       37426           0  ... NaN  NaN NaN

[5 rows x 30 columns]


In [0]:
# Print out the column correlations of the wine dataset
print(wine.corr())

# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "Flavanoids"

# Drop that column from the DataFrame
wine = wine.drop(to_drop, axis=1)

### Selecting features using text vectors


In [0]:
# dataset not available

In [0]:
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

In [0]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)

# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]

In [0]:
# Split the dataset according to the class distribution of category_desc
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(train_X, train_y)

# Print out the model's accuracy
print(nb.score(test_X, test_y))

### Dimensionality reduction


In [118]:
wine = pd.read_csv("wine_types.csv")
print("wine df dimension: ", wine.shape) # (178, 14)
wine.tail()

wine df dimension:  (178, 14)


Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740
174,3,13.4,3.91,2.48,23.0,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840
177,3,14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560


In [119]:
wine.shape

(178, 14)

In [120]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)

# Apply PCA to the wine dataset
transformed_X = pca.fit_transform(wine_X)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.98091230e-01 1.73591562e-03 9.49589576e-05 5.02173562e-05
 1.23636847e-05 8.46213034e-06 2.80681456e-06 1.52308053e-06
 1.12783044e-06 7.21415811e-07 3.78060267e-07 2.12013755e-07
 8.25392788e-08]


In [122]:
# sanity check
transformed_X.shape # (178, 13)

(178, 13)

In [0]:
# match dataset
y = wine['Type']

In [125]:
# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X, y)

# Fit knn to the training data
knn.fit(X_wine_train, y_wine_train)

# Score knn on the test data and print it out
print(knn.score(X_wine_test, y_wine_test))

0.6666666666666666


## 5. Putting it all together

Now that you've learned all about preprocessing you'll try these techniques out on a dataset that records information on UFO sightings.

### UFOs and preprocessing


In [127]:
ufo = pd.read_csv("ufo_sightings_large.csv")
print("ufo df dimension: ", ufo.shape)
ufo.tail()

ufo df dimension:  (4935, 11)


Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
4930,7/5/2000 19:30,schnecksville,pa,us,oval,5.0,about 5 seconds,On my bike when i saw a shiny silver oval not ...,7/11/2000,40.6677778,-75.6075
4931,3/18/2008 22:00,gibson,ga,us,triangle,25.0,25 seconds,Three sided stationary object turning clockwi...,3/31/2008,33.2333333,-82.595556
4932,6/15/2005 02:30,kent,wa,us,circle,0.0,early morning,Cicle object over Washington state all differe...,10/30/2006,47.3811111,-122.233611
4933,11/1/1991 03:00,niles,mi,us,triangle,7200.0,2 hours,Triangle zigzagged. Another shined light on u...,9/2/2005,41.8297222,-86.254167
4934,12/10/2005 18:00,phoenix,az,us,other,60.0,1 minutes,Close encounter of the third kind.,12/20/2012,33.4483333,-112.073333


In [128]:
# Check the column types
print(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo[["seconds", "date"]].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object
seconds           float64
date       datetime64[ns]
dtype: object


In [129]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


### Categorical variables and standardization


In [0]:
def return_minutes(time_string):
    
    # We'll use \d+ to grab digits and match it to the column values
    pattern = re.compile(r"\d+")
        
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))

In [133]:
ufo["length_of_time"][0] # weird dataset

'2 weeks'

In [134]:
return_minutes("2 weeks")

2

In [131]:
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)

# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head())

TypeError: ignored

In [132]:
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo["seconds"])

# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())

KeyError: ignored

### Engineering new features


In [135]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == "us" else 0)

# Print the number of unique type values
print(len(ufo["type"].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


In [136]:
# Look at the first 5 rows of the date column
print(ufo["date"].head())

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)

# Take a look at the head of all three columns
print(ufo[["date", "month", "year"]].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
2   2009-09-25 21:00:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010


In [0]:
# subset to remove np.nan
ufo = ufo[ufo["desc"].notnull()]

In [144]:
# Take a look at the head of the desc field
print(ufo["desc"].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo["desc"])

# Look at the number of columns this creates.
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
2    Green&#44 red&#44 and blue pulses of light tha...
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
Name: desc, dtype: object
(4932, 6433)


### Feature selection and modeling


In [146]:
ufo.columns

Index(['date', 'city', 'state', 'country', 'type', 'seconds', 'length_of_time',
       'desc', 'recorded', 'lat', 'long', 'country_enc', 'changing', 'chevron',
       'cigar', 'circle', 'cone', 'cross', 'cylinder', 'diamond', 'disk',
       'egg', 'fireball', 'flash', 'formation', 'light', 'other', 'oval',
       'rectangle', 'sphere', 'teardrop', 'triangle', 'unknown', 'month',
       'year'],
      dtype='object')

In [147]:
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())

# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", 
           "length_of_time", "long", "minutes", 
           "recorded", "seconds", "state"]

KeyError: ignored

In [0]:
# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

Our X dataset has the log-normalized seconds column, the one-hot encoded type columns, as well as the month and year when the sighting took place. The y labels are the encoded country column, where 1 is us and 0 is ca.

In [0]:
# Take a look at the features in the X set of data
print(X.columns)

# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

# Fit knn to the training sets
knn.fit(train_X, train_y)

# Print the score of knn on the test sets
print(knn.score(test_X, test_y))

In [0]:
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

# Split the X and y sets using train_test_split, setting stratify=y 
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit nb to the training sets
nb.fit(train_X, train_y)

# Print the score of nb on the test sets
print(nb.score(test_X, test_y))