In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

## AirBnB Price Classifier

In [2]:
# Load the raw Airbnb listings CSV and preview its first rows.
AIRBNB_CSV_PATH = 'AB_NYC_2019.csv'
airbnb_df = pd.read_csv(AIRBNB_CSV_PATH)
airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Narrow the frame to the listing identifier, title, room type and price.
airbnb_keep_cols = ['id', 'name', 'room_type', 'price']
airbnb_df = airbnb_df[airbnb_keep_cols]
airbnb_df.head()

Unnamed: 0,id,name,room_type,price
0,2539,Clean & quiet apt home by the park,Private room,149
1,2595,Skylit Midtown Castle,Entire home/apt,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,Private room,150
3,3831,Cozy Entire Floor of Brownstone,Entire home/apt,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,Entire home/apt,80


In [4]:
# Median listing price (robust to the long right tail of very expensive listings).
airbnb_df.loc[:, 'price'].median()

106.0

In [5]:
# Summary statistics of the price column (count, mean, std, quartiles, extremes).
airbnb_df.loc[:, 'price'].describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [6]:
# Label a listing as expensive when its price is strictly above the mean price.
# The threshold is computed from the data instead of being pasted in from the
# rounded describe() output, so it stays correct if the CSV or the filtering
# above ever changes.
airbnb_price_threshold = airbnb_df['price'].mean()
airbnb_df = airbnb_df.assign(
    is_expensive=airbnb_df['price'] > airbnb_price_threshold
)
airbnb_df.head()

Unnamed: 0,id,name,room_type,price,is_expensive
0,2539,Clean & quiet apt home by the park,Private room,149,False
1,2595,Skylit Midtown Castle,Entire home/apt,225,True
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,Private room,150,False
3,3831,Cozy Entire Floor of Brownstone,Entire home/apt,89,False
4,5022,Entire Apt: Spacious Studio/Loft by central park,Entire home/apt,80,False


In [7]:
# Sanity check: the label column should contain only the two boolean classes.
airbnb_df.loc[:, 'is_expensive'].unique()

array([False,  True])

In [8]:
# TODO: add a distribution/histogram plot here

In [9]:
# Class balance of the target: number of listings per is_expensive value.
airbnb_df.loc[:, 'is_expensive'].value_counts()

False    34016
True     14879
Name: is_expensive, dtype: int64

In [10]:
# Split the frame into features (X) and target (y).
# is_expensive is computed directly from price, so price must not be a
# feature: keeping it leaks the label and lets any model score perfectly
# without learning anything.
# NOTE(review): 'id' and 'name' are still in the features and are close to
# unique per row -- confirm that is intended before one-hot encoding them.
airbnb_df_X = airbnb_df.drop(columns=['is_expensive', 'price'])
airbnb_df_y = airbnb_df['is_expensive']

In [11]:
# Every column of airbnb_df_X is treated as categorical and one-hot encoded.
airbnb_df_X_cat_col = airbnb_df_X.columns

airbnb_X_col_transform = ColumnTransformer(
    transformers=[
        ('one-hot', OneHotEncoder(), airbnb_df_X_cat_col),
    ]
)
airbnb_X = airbnb_X_col_transform.fit_transform(airbnb_df_X)

# Turn the boolean target into integer class ids; the fitted encoder is kept
# so predictions can be mapped back to the original labels.
airbnb_y_label_encoder = LabelEncoder()
airbnb_y = airbnb_y_label_encoder.fit_transform(airbnb_df_y)

## Gold Medal Classifier

In [12]:
# Load the raw athlete events CSV and preview its first rows.
OLYMPIC_CSV_PATH = 'athlete_events.csv'
olympic_df = pd.read_csv(OLYMPIC_CSV_PATH)
olympic_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [13]:
# Narrow the frame to athlete identity, physical attributes, team, sport and
# the medal outcome.
olympic_keep_cols = [
    'ID', 'Name', 'Sex', 'Age',
    'Height', 'Weight', 'Team', 'Sport', 'Medal',
]
olympic_df = olympic_df[olympic_keep_cols]
olympic_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,Sport,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,Judo,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,Speed Skating,


In [14]:
# Keep only rows where Height, Weight and Age are all present. Medal is left
# out of the subset on purpose: a missing medal means "no medal", not bad data.
olympic_required_cols = ['Height', 'Weight', 'Age']
olympic_df = olympic_df.dropna(subset=olympic_required_cols)
olympic_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,Sport,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,Judo,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,Speed Skating,
5,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,Speed Skating,
6,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,Speed Skating,


In [15]:
# Distinct medal values, including NaN for rows with no medal.
olympic_df.loc[:, 'Medal'].unique()

array([nan, 'Bronze', 'Gold', 'Silver'], dtype=object)

In [16]:
# Binary target: True only for gold medals. Silver, bronze and missing medals
# (NaN) all compare unequal to 'Gold' and therefore become False.
# assign() builds a new frame instead of writing a column into the filtered
# slice produced above, which avoids pandas' SettingWithCopyWarning and keeps
# the cell idempotent (same style as the is_expensive column).
olympic_df = olympic_df.assign(Gold=olympic_df['Medal'] == 'Gold')
olympic_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,Sport,Medal,Gold
0,1,A Dijiang,M,24.0,180.0,80.0,China,Basketball,,False
1,2,A Lamusi,M,23.0,170.0,60.0,China,Judo,,False
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,Speed Skating,,False
5,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,Speed Skating,,False
6,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,Speed Skating,,False


In [17]:
# Class balance of the target: the positive (gold) class is heavily outnumbered.
olympic_df.loc[:, 'Gold'].value_counts()

False    195998
True      10167
Name: Gold, dtype: int64

In [18]:
# Split the frame into features (X) and target (y).
# Gold is derived directly from Medal, so Medal must not be a feature:
# keeping it leaks the label and lets any model score perfectly without
# learning anything.
# NOTE(review): 'ID' and 'Name' identify individual athletes and are still in
# the features -- confirm that is intended before one-hot encoding them.
olympic_df_X = olympic_df.drop(columns=['Gold', 'Medal'])
olympic_df_y = olympic_df['Gold']

In [19]:
# Every column of olympic_df_X is treated as categorical and one-hot encoded.
# NOTE(review): this includes the numeric Age/Height/Weight columns -- confirm
# that encoding them as categories is intended.
olympic_df_X_cat_col = olympic_df_X.columns

olympic_X_col_transform = ColumnTransformer(
    transformers=[
        ('one-hot', OneHotEncoder(), olympic_df_X_cat_col),
    ]
)
olympic_X = olympic_X_col_transform.fit_transform(olympic_df_X)

# Turn the boolean target into integer class ids; the fitted encoder is kept
# so predictions can be mapped back to the original labels.
olympic_y_label_encoder = LabelEncoder()
olympic_y = olympic_y_label_encoder.fit_transform(olympic_df_y)