# Data Science Nodes

In [1]:
# List the dataset names registered in the data catalog.
# NOTE(review): `catalog` is not defined anywhere in this notebook — presumably it is
# injected by the Kedro Jupyter/IPython session; confirm before running on a plain kernel.
catalog.list()

['companies',
 'reviews',
 'shuttles',
 'my_dataset',
 'preprocessed_companies',
 'preprocessed_shuttles',
 'master_table',
 'parameters']

In [2]:
# Load the `master_table` dataset through the catalog; every later cell works on this frame.
master_table = catalog.load('master_table')

2021-11-11 22:16:20,197 - kedro.io.data_catalog - INFO - Loading data from `master_table` (CSVDataSet)...


In [3]:
# Preview the first rows to see which columns are available for modelling.
master_table.head()

Unnamed: 0,id_x,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,...,review_scores_crew,review_scores_location,review_scores_price,number_of_reviews,reviews_per_month,id_y,company_rating,company_location,total_fleet_count,iata_approved
0,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,False,...,10.0,9.0,10.0,133,1.65,35029,1.0,Niue,4.0,False
1,53260,Niue,Type V5,Quantum,"Banks, Wood and Phillips",1.0,2,strict,1.0,False,...,10.0,9.0,10.0,37,0.48,35029,1.0,Niue,4.0,False
2,51019,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,flexible,1.0,False,...,10.0,9.0,9.0,10,0.15,35029,1.0,Niue,4.0,False
3,53898,Niue,Type V5,Plasma,ThetaBase Services,3.0,5,strict,3.0,False,...,10.0,9.0,10.0,11,0.21,35029,1.0,Niue,4.0,False
4,36260,Anguilla,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,True,...,9.0,9.0,9.0,3,0.09,30292,0.67,Anguilla,6.0,False


In [4]:
# Columns of `master_table` used as model inputs (order is preserved when selecting).
feature_cols = [
    'engines',
    'passenger_capacity',
    'crew',
    'd_check_complete',
    'moon_clearance_complete',
    'iata_approved',
    'company_rating',
    'review_scores_rating',
]

In [5]:
# Show only the selected feature columns to sanity-check the selection.
master_table[feature_cols]

Unnamed: 0,engines,passenger_capacity,crew,d_check_complete,moon_clearance_complete,iata_approved,company_rating,review_scores_rating
0,1.0,2,1.0,False,False,False,1.00,97.0
1,1.0,2,1.0,False,False,False,1.00,98.0
2,1.0,2,1.0,False,False,False,1.00,92.0
3,3.0,5,3.0,False,False,False,1.00,98.0
4,1.0,2,1.0,True,False,False,0.67,90.0
...,...,...,...,...,...,...,...,...
29763,1.0,2,1.0,False,False,True,0.70,100.0
29764,1.0,2,1.0,False,False,True,1.00,100.0
29765,1.0,1,3.0,False,False,False,1.00,100.0
29766,1.0,2,1.0,False,False,False,1.00,100.0


In [16]:
import pandas as pd
from typing import Dict, Tuple
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import logging

In [13]:
# Settings consumed by `split_data`: which columns to use and how to split.
parameters = {
    'features': feature_cols,   # input columns selected from the master table
    'test_size': 0.2,           # passed to train_test_split as `test_size`
    'random_state': 42,         # passed to train_test_split as `random_state`
}

In [14]:
def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple:
    '''Partition ``data`` into training and test sets of features and target.

    Args:
        data: Frame containing the columns listed in ``parameters['features']``
            and a ``price`` column, which is used as the target.
        parameters: Mapping with the keys ``features``, ``test_size`` and
            ``random_state``; the latter two are forwarded to
            ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` in that order.
    '''
    features = data[parameters['features']]
    target = data['price']

    # Gather the split settings after selecting the columns so look-ups happen
    # in the same order as before.
    split_options = {
        'test_size': parameters['test_size'],
        'random_state': parameters['random_state'],
    }
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, **split_options
    )
    return features_train, features_test, target_train, target_test


In [19]:
# Split the master table into train/test features and targets using the settings in `parameters`.
X_train, X_test, y_train, y_test = split_data(master_table, parameters)

In [15]:
def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression:
    '''Fit a linear regression on the training data.

    Args:
        X_train: Training feature data.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    '''
    model = LinearRegression()
    # ``fit`` returns the estimator itself, so returning ``model`` afterwards
    # hands back the same object the original one-liner did.
    model.fit(X_train, y_train)
    return model

In [21]:
# Fit the model on the training split; the bare `regressor` expression on the
# last line makes the cell display the fitted estimator.
regressor = train_model(X_train, y_train)
regressor

LinearRegression()

In [23]:
# Inspect the fitted coefficients of the linear model.
regressor.coef_

array([ 3.60364953e+03,  3.05065204e+03, -8.74222216e+02, -2.34421626e+02,
       -9.09494702e-13, -5.18679591e+02, -5.73049073e+02,  5.42163972e+00])

In [18]:
def evaluate_model(regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    '''Calculates and logs the coefficient of determination (R^2) on the test data.

    Args:
        regressor: Fitted model; only its ``predict`` method is used.
        X_test: Test feature data passed to ``regressor.predict``.
        y_test: True target values corresponding to ``X_test``.

    Returns:
        None. The score is reported only through an INFO-level log record.
    '''
    y_pred = regressor.predict(X_test)
    score = r2_score(y_test, y_pred)
    # Use a named logger instead of the root logger so the record carries its
    # origin and can be filtered or configured independently; it still
    # propagates to the root logger's handlers.
    logger = logging.getLogger(__name__)
    logger.info("Model has a coefficient R^2 of %.3f on test_data", score)

In [24]:
# Score the fitted model on the test split; the result is emitted as a log message, not returned.
evaluate_model(regressor, X_test, y_test)

2021-11-11 22:34:40,929 - root - INFO - Model has a coefficient R^2 of 0.194 on test_data
