In [69]:
import time
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from itertools import product
import warnings
warnings.filterwarnings('ignore')
from tensorflow import keras
from sklearn import datasets
from tensorflow.keras import utils

# Exercise 7.5

In [25]:
def problem7_5():
    #part 1
    RFC = RandomForestClassifier(bootstrap=True, oob_score=True, warm_start=False)

    titanic = pd.read_csv('train.csv')
    titanic.dropna(inplace=True)
    X = titanic.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)
    X = pd.get_dummies(X, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)
    X = X.dropna()
    y = X['Survived']
    X = X.drop(['Survived'], axis=1)
    print('Part 1:', RFC.fit(X, y).oob_score_, sep='\n')

    #part 2 find the best hyperparameters max_depth and max_features
    max_features_params = ['auto', 'sqrt', 'log2']
    max_depth_params = list(np.arange(35, 60))
    max_features_params += list(np.arange(1, 10))
    #get all possible combinations
    combinations = list(product(max_depth_params, max_features_params))
    best_oob_score = -np.inf
    #iterate through and get best score
    for max_d, max_f in combinations:
        RFC = RandomForestClassifier(max_depth=max_d,
                                     max_features=max_f,
                                     bootstrap=True,
                                     oob_score=True,
                                     warm_start=False)
        curr_oob_score = RFC.fit(X, y).oob_score_
        if curr_oob_score > best_oob_score:
            best_oob_score = curr_oob_score
            best_params = {'max_depth' : max_d, 'max_features' : max_f}

    print()
    print('part 2: ')
    print('best oob score', best_oob_score, sep='\n')
    print('best hyper-parameters', best_params, sep='\n')

    #part 3 build the most optimal random forest classifier
    RFC = RandomForestClassifier(max_depth = best_params['max_depth'],
                                 max_features=best_params['max_features'],
                                 bootstrap=True,
                                 oob_score=True,
                                 warm_start=True)

    #get most important features
    RFC.fit(X, y)
    impurities = RFC.feature_importances_
    #sort indexes by highest to lowest
    min_indexes = np.argsort(impurities)
    max_indexes = np.argsort(impurities)[::-1]

    print()
    print('Part 3:')
    print('most important features:', X.columns[max_indexes[:3]] )
    print('least important features:', X.columns[min_indexes[:3]] )

    return

problem7_5()

Part 1:
0.726775956284153

part 2: 
best oob score
0.7923497267759563
best hyper-parameters
{'max_depth': 38, 'max_features': 6}

Part 3:
most important features: Index(['Age', 'Fare', 'Sex_male'], dtype='object')
least important features: Index(['Embarked_Q', 'Pclass_3', 'Pclass_2'], dtype='object')


# Exercise 7.6

In [71]:
def problem7_6():
    #compltes problem 7.6
    (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()


    input_dim = 784 #28*28
    X_train = X_train.reshape(60000, input_dim)
    X_test = X_test.reshape(10000, input_dim)
    X_train = X_train/255
    X_test = X_test/255
    y_train = utils.to_categorical(y_train, 10)
    y_test = utils.to_categorical(y_test, 10)

    feature_list = list(np.arange(5, 10))
    depth_list = list(np.arange(20, 25))
    combinations = list(product(feature_list, depth_list))

    best_oob_score = -np.inf
    for feat, dep in combinations:
        RFC = RandomForestClassifier(n_estimators= 200,
                                     max_features=feat,
                                     max_depth=dep,
                                     bootstrap=True,
                                     oob_score=True,
                                     warm_start=True)
        RFC.fit(X_train, y_train)
        curr_score = RFC.oob_score_
        if curr_score > best_oob_score:
            best_oob_score = curr_score
            best_hyper_params = {'max_features':feat, 'max_depth': dep}

    print('Best oob score:', best_oob_score, sep='\n' )
    print('Best hyper parameters: ', best_hyper_params, sep='\n')

    return

problem7_6()

Best oob score:
0.8754333333333333
Best hyper parameters: 
{'max_features': 9, 'max_depth': 23}


# Exercise 7.7

In [51]:
def problem7_7():
    boston = datasets.load_boston()
    X = boston.data

    y = boston.target

    features = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
                "RAD", "TAX", "PTRATIO", "B", "LSTAT"]

    X = pd.DataFrame(X, columns=features)



    max_features_params = ['auto', 'sqrt', 'log2']
    max_depth_params = list(np.arange(35, 60))
    max_features_params += list(np.arange(1, 10))
    #get all possible combinations
    combinations = list(product(max_depth_params, max_features_params))
    best_oob_score = -np.inf
    #iterate through and get best score
    for max_d, max_f in combinations:
        RFR = RandomForestRegressor(max_depth=max_d,
                                     max_features=max_f,
                                     bootstrap=True,
                                     oob_score=True,
                                     warm_start=False)
        curr_oob_score = RFR.fit(X, y).oob_score_
        if curr_oob_score > best_oob_score:
            best_oob_score = curr_oob_score
            best_params = {'max_depth' : max_d, 'max_features' : max_f}

    print('part 1: ')
    print('best oob score', best_oob_score, sep='\n')
    print('best hyper-parameters', best_params, sep='\n')

    #part 3 build the most optimal random forest classifier
    RFR = RandomForestRegressor(max_depth = best_params['max_depth'],
                                 max_features=best_params['max_features'],
                                 bootstrap=True,
                                 oob_score=True,
                                 warm_start=True)

    #get most important features
    RFR.fit(X, y)
    impurities = RFR.feature_importances_
    #sort indexes by highest to lowest
    min_indexes = np.argsort(impurities)
    max_indexes = np.argsort(impurities)[::-1]

    print()
    print('Part 2:')
    print('most important features:', X.columns[max_indexes[:3]] )
    print('least important features:', X.columns[min_indexes[:3]] )

    return

problem7_7()

part 1: 
best oob score
0.8925588434244296
best hyper-parameters
{'max_depth': 35, 'max_features': 4}

Part 2:
most important features: Index(['RM', 'LSTAT', 'INDUS'], dtype='object')
least important features: Index(['ZN', 'CHAS', 'RAD'], dtype='object')


# Exercise 7.8

In [70]:
def problem7_8():
    #part 1

    def data_cleaning():
        '''This function cleans the data we will be using
        :return:
        flight_2016: pandas dataframe with the cleaned flight data from 2016
        flight_2017: pandas dataframe with the cleaned fligth data from 2017
        '''
        flight_2016 = pd.read_csv('flight.csv', delimiter=',')
        #drop useless flight data
        flight_2016.drop(['Month', 'Year', 'Day', 'Flight_Date', 'FlightNum',
                      'Departure_Time','Dep_Delay', 'DepDel15', 'Dep_Delay_Groups',
                      'Arrival_Time', 'Arr_Delay_Minutes',
                      'Arr_Del_morethan15', 'Cancelled', 'Diverted',
                      'DistanceGroup', 'UniqueCarrier', 'Carrier_Delay', 'WeatherDelay', 'NAS_Delay',
                      'Security_Delay', 'Late_Aircraft_Delay', 'Top_Carriers', 'Top_Origin',
                      'DEPTIME_GROUP1', 'DEPTIME_GROUP2', 'DEPTIME_GROUP3' , 'Tai_lNum', 'Origin_City_Name', 'Origin_State'], axis=1, inplace=True)

        #change to be rolling departure times
        mask = flight_2016['Scheduled_Departure'] >= 1200
        flight_2016[mask]['Scheduled_Departure'] *= -1
        flight_2016[mask]['Scheduled_Departure'] += 2400

        flight_2017 = pd.read_csv('fl_samp.csv', delimiter=',')
        #drop useless flight data
        flight_2017.drop(['Year', 'Month', 'Day', 'Flight_Date', 'UniqueCarrier', 'Departure_Time',
                      'Scheduled_Arrival', 'Dep_Delay', 'Arr_Del_morethan15', 'DistanceGroup',
                      'Carrier_Delay', 'WeatherDelay', 'NAS_Delay', 'Late_Aircraft_Delay',
                      'DEPTIME_GROUP1', 'DEPTIME_GROUP2', 'DEPTIME_GROUP3' ], axis=1, inplace=True)


        return flight_2016, flight_2017

    flight_2016, _ = data_cleaning()
    flight_2016 = pd.get_dummies(flight_2016, columns=['Origin_Airport'], drop_first=True)

    y = flight_2016['Arrival_Delay']
    X = flight_2016.drop(['Arrival_Delay'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.7,
                                                        random_state=0)

    estimator = list(np.arange(1, 10))
    depth = list(np.arange( 30, 45))
    features = list(np.arange(1, 6))

    combinations = list(product(estimator, depth, features))
    best_oob_score = -np.inf
    for est, dep, feat in combinations:
        RFR = RandomForestClassifier( n_estimators=est,
                                    max_depth=dep,
                                    max_features=feat,
                                    bootstrap=True,
                                    oob_score=True,
                                    warm_start=False)

        curr_oob_score = RFR.fit(X, y).oob_score_
        if curr_oob_score > best_oob_score:
            best_oob_score = curr_oob_score
            best_params = {'n_estimators': est, 'max_depth' : dep, 'max_features' : feat}

    print('part 1:')
    print('Best oob scare', best_oob_score, sep='\n')
    print('Best hyperparameters:', best_params, sep='\n')

    RFC = RandomForestClassifier( n_estimators=best_params['n_estimators'],
                                  max_depth = best_params['max_depth'],
                                  max_features = best_params['max_features'],
                                  bootstrap=True,
                                  oob_score=True,
                                  warm_start=False
                                  )
    RFC.fit(X_train, y_train)
    LinR = LinearRegression()
    LinR.fit(X_train, y_train)
    LR = LogisticRegression()
    LR.fit(X_train, y_train)

    print()
    print('part 2:')
    print('Accuracy of Random Forest Classifier', RFC.score(X_test, y_test), sep='\n')
    print('Accuracy of Logistic Regression', LR.score(X_test, y_test), sep='\n')
    print('Accuracy of Linear Regression', LinR.score(X_test, y_test), sep='\n')

    impurities = RFC.feature_importances_
    #sort indexes by highest to lowest
    min_indexes = np.argsort(impurities)
    max_indexes = np.argsort(impurities)[::-1]

    print()
    print('Part 3:')
    print('most important features:', X.columns[max_indexes[:3]] )
    print('least important features:', X.columns[min_indexes[:3]] )

    return

problem7_8()

part 1:
Best oob scare
0.017423771001866834
Best hyperparameters:
{'n_estimators': 7, 'max_depth': 31, 'max_features': 1}

part 2:
Accuracy of Random Forest Classifier
0.013130615065653075
Accuracy of Logistic Regression
0.013821700069108501
Accuracy of Linear Regression
0.02568467899269289

Part 3:
most important features: Index(['DayOfWeek', 'Scheduled_Arrival', 'Scheduled_Departure'], dtype='object')
least important features: Index(['Origin_Airport_DTW', 'Origin_Airport_EWR', 'Origin_Airport_CLT'], dtype='object')
