# Data Processing

In [144]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
hiking = pd.read_json("hiking.json")
print(hiking.head())

  Prop_ID                     Name  \
0    B057  Salt Marsh Nature Trail   
1    B073                Lullwater   
2    B073                  Midwood   
3    B073                Peninsula   
4    B073                Waterfall   

                                            Location      Park_Name  \
0  Enter behind the Salt Marsh Nature Center, loc...    Marine Park   
1  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
2  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
3  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
4  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   

       Length Difficulty                                      Other_Details  \
0   0.8 miles       None  The first half of this mile-long trail follows...   
1    1.0 mile       Easy  Explore the Lullwater to see how nature thrive...   
2  0.75 miles       Easy  Step back in time with a walk through Brooklyn...   
3   0.5 miles       Easy  Di

In [5]:
print(hiking.columns)

Index(['Prop_ID', 'Name', 'Location', 'Park_Name', 'Length', 'Difficulty',
       'Other_Details', 'Accessible', 'Limited_Access', 'lat', 'lon'],
      dtype='object')


In [84]:
wine = pd.read_csv("wine.csv")
ufo = pd.read_csv("UFO_sightings_data.csv")
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 5:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


# Missing data - rows

In [58]:
# Check how many values are missing in the category_desc column
print(ufo["state"].isnull().sum())

# Subset the volunteer dataset
# Subset the volunteer dataset
ufo_subset = ufo[ufo["state"].notnull()]

# Print out the shape of the subset
print(ufo_subset.shape)

419
(4516, 11)


In [59]:
# Check data types
ufo.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

# Convert column type

In [62]:
# Print the head of the hits column
print(ufo["lat"].head())

# Convert the hits column to type int
ufo["lat"] = ufo["lat"].astype("string")

# Look at the dtypes of the dataset
print(ufo.dtypes)

0    44.9530556
1    41.4994444
2         45.12
3    36.0213889
4     51.083333
Name: lat, dtype: object
date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                string
long              float64
dtype: object


# Train test split
Use stratified sampling for classification problems
To get similar method as table in R use print(y_train["quality"].value_counts())
to get categorical count of quality column values
Stratefied sampling keeps the disribution of the categorical column

In [7]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [10]:
from sklearn.model_selection import train_test_split

# Create a data with all columns except category_desc
wine_X = wine.drop("quality", axis=1)

# Create a category_desc labels dataset
wine_y = wine[["quality"]]

# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(wine_X, wine_y, stratify=wine_y)

# Print out the category_desc counts on the training y labels
print(y_train["quality"].value_counts())

6    1648
5    1093
7     660
8     131
4     122
3      15
9       4
Name: quality, dtype: int64


# Normalizing the data

In [12]:
# Model without normalizing!
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.4685714285714286


  import sys


In [14]:
list(wine.columns)

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

# Log normalization

In [15]:
# Print out the variance of the residual sugar column
print(wine['residual sugar'].var())

# Apply the log normalization function to the residual sugar column
wine["sugar_log"] = np.log(wine['residual sugar'])

# Check the variance of the normalized residual sugar column
print(wine["sugar_log"].var())

25.725770164385768
0.8496147220505799


# Scaling data
Fit KNN with scaled data here

In [51]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides','quality']]

# Create a data with all columns except category_desc
X = wine_subset.drop("quality", axis=1)

# Create a category_desc labels dataset
y = wine_subset[["quality"]]

# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(X)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# fit knn 
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train, y_train)

# Score the model on the test data.
print(knn.score(X_test, y_test))


0.4791836734693877




# Feature engineering Encoding categorical variables
Lets use dummy coding to transfor the Accesible column in the dataframe
One hot encoding when there is more then 2 categories


In [54]:
from sklearn import preprocessing
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,The first half of this mile-long trail follows...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


In [57]:
# Set up the LabelEncoder object
enc = preprocessing.LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])

# Compare the two columns
print(hiking[["Accessible_enc", "Accessible"]].head())

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N


# One hot encoding of train (Name) column

In [58]:
# Transform the category_desc column
category_enc = pd.get_dummies(hiking["Name"])

# Take a look at the encoded columns
print(category_enc.head())

   Alley Pond Trails  Arden Woods Trail  Bayswater Park Hiking Trails  \
0                  0                  0                             0   
1                  0                  0                             0   
2                  0                  0                             0   
3                  0                  0                             0   
4                  0                  0                             0   

   Blue Trail  Bronx River Forest Trail  Canarsie Park Hiking Trail  \
0           0                         0                           0   
1           0                         0                           0   
2           0                         0                           0   
3           0                         0                           0   
4           0                         0                           0   

   Cass Gallagher Nature Trail  Clove Lakes Park Trail  \
0                            0                       0   
1                 

# Engineering numerical features

In [59]:
running_times_5k = pd.read_csv("rundf.csv")
running_times_5k.head()

Unnamed: 0,name,run1,run2,run3,run4,run5
0,Sue,20.1,18.5,19.6,20.3,18.3
1,Mark,16.5,17.1,16.9,17.6,17.3
2,Sean,23.5,25.1,25.2,24.6,23.9
3,Erin,21.7,21.1,20.9,22.1,22.2
4,Jenny,25.8,27.1,26.1,26.7,26.9


In [61]:
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]

# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)

# Take a look at the results
print(running_times_5k)

    name    run1  run2  run3  run4  run5   mean
0      Sue  20.1  18.5  19.6  20.3  18.3  19.36
1     Mark  16.5  17.1  16.9  17.6  17.3  17.08
2     Sean  23.5  25.1  25.2  24.6  23.9  24.46
3     Erin  21.7  21.1  20.9  22.1  22.2  21.60
4    Jenny  25.8  27.1  26.1  26.7  26.9  26.52
5  Russell  30.9  29.6  31.4  30.4  29.9  30.44


# Engineering features dates

In [62]:
ufo = pd.read_csv("UFO_sightings_data.csv")
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 5:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [63]:
# First, convert string column to date column
ufo["date_converted"] = pd.to_datetime(ufo["date"])

# Extract just the month from the converted column
ufo["start_date_month"] = ufo["date_converted"].apply(lambda row: row.month)

# Take a look at the converted and new month columns
print(ufo[["date_converted", "start_date_month"]].head())

       date_converted  start_date_month
0 2011-11-03 19:21:00                11
1 2004-10-03 19:05:00                10
2 2009-09-25 21:00:00                 9
3 2002-11-21 05:45:00                11
4 2010-08-19 12:55:00                 8


# Feature engineering from strings
Manipulate Length column in hiking dataframe

In [11]:
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,The first half of this mile-long trail follows...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


In [12]:
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Prop_ID         37 non-null     object 
 1   Name            37 non-null     object 
 2   Location        37 non-null     object 
 3   Park_Name       37 non-null     object 
 4   Length          31 non-null     object 
 5   Difficulty      31 non-null     object 
 6   Other_Details   34 non-null     object 
 7   Accessible      37 non-null     object 
 8   Limited_Access  37 non-null     object 
 9   lat             0 non-null      float64
 10  lon             0 non-null      float64
dtypes: float64(2), object(9)
memory usage: 3.3+ KB


In [15]:
# Make sure column is manipulateable string
hiking['Length'] = hiking['Length'].astype(str)
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Prop_ID         37 non-null     object 
 1   Name            37 non-null     object 
 2   Location        37 non-null     object 
 3   Park_Name       37 non-null     object 
 4   Length          37 non-null     object 
 5   Difficulty      31 non-null     object 
 6   Other_Details   34 non-null     object 
 7   Accessible      37 non-null     object 
 8   Limited_Access  37 non-null     object 
 9   lat             0 non-null      float64
 10  lon             0 non-null      float64
dtypes: float64(2), object(9)
memory usage: 3.3+ KB


In [16]:
import re

# Use length 
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


# Term frequency inverse document frequency (tfidf)

In [56]:
volunteer = pd.read_csv("volunteering.csv")
volunteer.head()


Unnamed: 0,Requirements,opportunity_id,content_id:,Summary,Category Description,Title,display_url,Recurrence_type,Street Address,2nd Address,...,Website,Borough,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,<p>Volunteers must be 18 years of age or older...,Adult Education and Job Readiness Projects wit...,<p>Planning for a better future starts by beli...,Planning for a better future starts by believi...,,New York Cares,/opportunities/69,ongoing,"65 Broadway, 19th Floor",,...,http://www.newyorkcares.org,MANHATTAN,40.707113,-74.012234,101.0,1.0,13.0,1000809.0,1000210000.0,Battery Park City-Lower Manhattan
1,<p>There will be one day of required volunteer...,Arts and Crafts Volunteers,<p>Arts and Crafts volunteers are needed to as...,Arts and Crafts volunteers are needed to assis...,Communities and Neighbors,Project HAPPY,/opportunities/103,ongoing,425 East 25th Street,,...,http://www.project-happy.org,MANHATTAN,40.737878,-73.97688,106.0,4.0,62.0,1069579.0,1009620000.0,Murray Hill-Kips Bay
2,<p>Previous experience working in an administr...,After-School Program Coordinator,<p>• Provides oral and written reports to the ...,Assist Executive Director in managing after-sc...,Education,Rise Up & Walk Youth Outreach Center Inc,/opportunities/163,ongoing,432 Monroe Street,,...,http://www.riseupnwalk.org,BROOKLYN,40.686592,-73.940747,303.0,36.0,277.0,3051540.0,3018210000.0,Stuyvesant Heights
3,<p>Volunteers must be 18 years of age or older...,Adults and Children with Special Needs Project...,<p>New York Cares offers many programs serving...,New York Cares offers many programs serving ad...,,New York Cares,/opportunities/206,ongoing,"65 Broadway, 19th Floor",,...,http://www.newyorkcares.org,MANHATTAN,40.707113,-74.012234,101.0,1.0,13.0,1000809.0,1000210000.0,Battery Park City-Lower Manhattan
4,<p>Volunteers must be 18 years of age or older...,Animal Care Projects with New York Cares,<p>Socialize with rescued animals and help the...,Socialize with rescued animals and help them g...,,New York Cares,/opportunities/207,ongoing,"65 Broadway, 19th Floor",,...,http://www.newyorkcares.org,MANHATTAN,40.707113,-74.012234,101.0,1.0,13.0,1000809.0,1000210000.0,Battery Park City-Lower Manhattan


In [57]:
print(volunteer["Title"].isnull().sum()) # Just curious

17


In [58]:
volunteer.dtypes

Requirements             object
opportunity_id           object
content_id:              object
Summary                  object
Category Description     object
Title                    object
display_url              object
Recurrence_type          object
Street Address           object
2nd Address              object
City                     object
State                    object
Postcode                 object
Country                  object
Website                  object
Borough                  object
Latitude                float64
Longitude               float64
Community Board         float64
Council District        float64
Census Tract            float64
BIN                     float64
BBL                     float64
NTA                      object
dtype: object

In [66]:
# Removing na values for analysis for both of the columns
volunteer2 = volunteer.dropna(subset =['Title','Category Description'])
volunteer2.shape # there were about 20 missing values

(89, 24)

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Take the title text
title_text = volunteer2['Title']

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

In [78]:

# Split the dataset according to the class distribution of category_desc
y = volunteer2["Category Description"]

print(y)


1      Communities and Neighbors
2                      Education
6          Health and Well-Being
18     Communities and Neighbors
23         Health and Well-Being
                 ...            
197                    Education
198                    Education
199    Communities and Neighbors
200        Health and Well-Being
201                    Education
Name: Category Description, Length: 89, dtype: object


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

# Split the dataset according to the class distribution of category_desc
y = volunteer2["Category Description"]

X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y) # normally would have Stratify = y for classification problems

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.391304347826087


# Feature selection removing redundant features

In [85]:
# Print out the column correlations of the wine dataset
print(wine.corr())


                      fixed acidity  volatile acidity  citric acid  \
fixed acidity              1.000000         -0.022697     0.289181   
volatile acidity          -0.022697          1.000000    -0.149472   
citric acid                0.289181         -0.149472     1.000000   
residual sugar             0.089021          0.064286     0.094212   
chlorides                  0.023086          0.070512     0.114364   
free sulfur dioxide       -0.049396         -0.097012     0.094077   
total sulfur dioxide       0.091070          0.089261     0.121131   
density                    0.265331          0.027114     0.149503   
pH                        -0.425858         -0.031915    -0.163748   
sulphates                 -0.017143         -0.035728     0.062331   
alcohol                   -0.120881          0.067718    -0.075729   
quality                   -0.113663         -0.194723    -0.009209   

                      residual sugar  chlorides  free sulfur dioxide  \
fixed acidity    

In [86]:
# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "residual sugar"

# Drop that column from the DataFrame
wine = wine.drop(to_drop, axis=1)

# Dimensionality Reduction

In [101]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [106]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()

# Apply PCA to the wine dataset
transformed_X = pca.fit_transform(wine)

wine_X = wine.drop("quality", axis=1)
y = wine['quality']

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.18440481e-01 8.02708186e-02 6.84959496e-04 3.27258151e-04
 2.51026207e-04 9.12108069e-06 6.72795564e-06 5.45484376e-06
 3.95923461e-06 1.91651264e-07 1.31778909e-09]


In [107]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)

# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X, y)

# Fit knn to the training data
knn.fit(X_wine_train, y_wine_train)

# Score knn on the test data and print it out
print(knn.score(X_wine_test, y_wine_test))

0.5738775510204082


# UFO's and Preprocessing
Checking column types

In [109]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 5:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [110]:
# Check the column types
print(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo[["seconds", "date"]].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object
seconds           float64
date       datetime64[ns]
dtype: object


In [111]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,2009-09-25 21:00:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


# Dropping missing data

In [112]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


# Extracting numbers from strings!

In [131]:
# Reassign new dataframe
ufo2 = ufo_no_missing

In [117]:
import re

def return_minutes(time_string):
    
    # We'll use \d+ to grab digits and match it to the column values
    pattern = re.compile(r"\d+")
        
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo2["minutes"] = ufo2["length_of_time"].apply(return_minutes)

# Take a look at the head of both of the columns
print(ufo2[["length_of_time", "minutes"]].head())

    length_of_time  minutes
0          2 weeks      2.0
1           30sec.     30.0
3  about 5 minutes      NaN
4                2      2.0
5       10 minutes     10.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Identifying features for standardization
After taking a look at the variances of the seconds and minutes column, you'll see that the variance of the seconds column is extremely high. Because seconds and minutes are related to each other (an issue we'll deal with when we select features for modeling), let's log normlize the seconds column.

In [119]:
# Check the variance of the seconds and minutes columns
print(ufo2[["seconds", "minutes"]].var())

# Log normalize the seconds column
ufo2["seconds_log"] = np.log(ufo2["seconds"])

# Print out the variance of just the seconds_log column
print(ufo2["seconds_log"].var())

seconds    1.545212e+10
minutes    9.466250e+02
dtype: float64
nan


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


# Encoding categorical variables

In [148]:
# Use Pandas to encode us values as 1 and others as 0
ufo2["country_enc"] = ufo2["country"].apply(lambda val: 1 if val == "us" else 0)

# Print the number of unique type values
print(len(ufo2["type"].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo2["type"])

# Concatenate this set back to the ufo DataFrame
ufo2 = pd.concat([ufo2, type_set], axis=1)

21


In [150]:
ufo2.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,...,flash,formation,light,other,oval,rectangle,sphere,teardrop,triangle,unknown
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,...,0,0,0,0,0,0,0,0,0,1
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,...,0,0,0,0,0,0,0,0,0,0
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,...,0,0,0,0,0,0,0,0,1,0
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,...,0,0,0,0,1,0,0,0,0,0
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,...,0,0,1,0,0,0,0,0,0,0


# Features from dates

In [151]:
# Look at the first 5 rows of the date column
print(ufo2["date"].head())

# Extract the month from the date column
ufo2["month"] = ufo2["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo2["year"] = ufo2["date"].apply(lambda row: row.year)

# Take a look at the head of all three columns
print(ufo2[["date", "month", "year"]].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010
5 2012-06-16 23:00:00      6  2012


# Text vectorization

In [153]:
# Take a look at the head of the desc field
print(ufo2["desc"].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo2["desc"])

# Look at the number of columns this creates.
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
5    Dancing lights that would fly around and then ...
Name: desc, dtype: object
(4283, 5754)


# Selecting ideal dataset

In [164]:
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

In [165]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

In [178]:
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo2[["seconds", "seconds_log", "minutes"]].corr())

# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo2.drop(to_drop, axis=1)

# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)

              seconds  seconds_log   minutes
seconds      1.000000     0.174331 -0.009922
seconds_log  0.174331     1.000000  0.111460
minutes     -0.009922     0.111460  1.000000


TypeError: 'dict' object is not callable

In [179]:
ufo_dropped.dtypes

type            object
seconds_log    float64
country_enc      int64
changing         uint8
chevron          uint8
cigar            uint8
circle           uint8
cone             uint8
cross            uint8
cylinder         uint8
diamond          uint8
disk             uint8
egg              uint8
fireball         uint8
flash            uint8
formation        uint8
light            uint8
other            uint8
oval             uint8
rectangle        uint8
sphere           uint8
teardrop         uint8
triangle         uint8
unknown          uint8
month            int64
year             int64
dtype: object

In [182]:
ufo3 = ufo_dropped
ufo3.head()

Unnamed: 0,type,seconds_log,country_enc,changing,chevron,cigar,circle,cone,cross,cylinder,...,light,other,oval,rectangle,sphere,teardrop,triangle,unknown,month,year
0,unknown,14.0058,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,11,2011
1,circle,3.401197,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,10,2004
3,triangle,5.703782,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,11,2002
4,oval,-inf,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,8,2010
5,light,6.39693,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,6,2012


In [232]:
drop = ['type','seconds_log']
ufo3 = ufo3.drop(drop, axis = 1)
ufo3.dtypes

country_enc    int32
changing       int32
chevron        int32
cigar          int32
circle         int32
cone           int32
cross          int32
cylinder       int32
diamond        int32
disk           int32
egg            int32
fireball       int32
flash          int32
formation      int32
light          int32
other          int32
oval           int32
rectangle      int32
sphere         int32
teardrop       int32
triangle       int32
unknown        int32
month          int32
year           int32
dtype: object

In [233]:
ufo3[['country_enc','changing','chevron','cigar','circle','cone','cross','cylinder','diamond','disk','egg','fireball','flash','formation','light','other','oval','rectangle','sphere','teardrop','triangle','unknown','month','year']] = ufo3[['country_enc','changing','chevron','cigar','circle','cone','cross','cylinder','diamond','disk','egg','fireball','flash','formation','light','other','oval','rectangle','sphere','teardrop','triangle','unknown','month','year']].astype(int)
ufo3.dtypes


country_enc    int32
changing       int32
chevron        int32
cigar          int32
circle         int32
cone           int32
cross          int32
cylinder       int32
diamond        int32
disk           int32
egg            int32
fireball       int32
flash          int32
formation      int32
light          int32
other          int32
oval           int32
rectangle      int32
sphere         int32
teardrop       int32
triangle       int32
unknown        int32
month          int32
year           int32
dtype: object

In [234]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo3[['country_enc','changing','chevron','cigar','circle','cone','cross','cylinder','diamond','disk','egg','fireball','flash','formation','light','other','oval','rectangle','sphere','teardrop','triangle','unknown','month','year']].isnull().sum())

country_enc    0
changing       0
chevron        0
cigar          0
circle         0
cone           0
cross          0
cylinder       0
diamond        0
disk           0
egg            0
fireball       0
flash          0
formation      0
light          0
other          0
oval           0
rectangle      0
sphere         0
teardrop       0
triangle       0
unknown        0
month          0
year           0
dtype: int64


In [207]:
# Removing na values for analysis for both of the columns
ufo3 = ufo3.dropna(subset =['country_enc','changing','chevron','cigar','circle','cone','cross','cylinder','diamond','disk','egg','fireball','flash','formation','light','other','oval','rectangle','sphere','teardrop','triangle','unknown','month','year'])
ufo3.shape # there were about 20 missing values

(4283, 25)

In [175]:
from sklearn import preprocessing

In [235]:
# Create a data with all columns except category_de
X = ufo3.drop("country_enc", axis=1)

# Create a category_desc labels dataset
y = ufo3[["country_enc"]]

# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)


In [236]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)


# Fit knn to the training sets
knn.fit(train_X, train_y)

# Print the score of knn on the test sets
print(knn.score(test_X, test_y))

0.8375350140056023
