In [10]:
import sys

import pandas as pd
import numpy as np
import datetime  as dt

import scipy.stats as stats

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split

%matplotlib inline

from model import Model

In [2]:
def remove_NaN_rows(dataset):
    
    return(dataset.notna())
    

In [3]:
def append_validation_to_training(training_dataset, validation_dataset):
     
    return(training_dataset.append(validation_dataset).reset_index(drop=True))

In [4]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [5]:
def drop_feature(dataset, feature):
    
    return(dataset.drop([feature], axis = 1))

In [6]:
def to_standardize(dataset):
    scaler = preprocessing.StandardScaler()
    dataset[['cargo_value', 'valid_miles','weight']] = \
        dataset.fit_transform(dataset[['cargo_value', 'valid_miles', 'weight']])
   
    return(dataset)
    

In [38]:
def data_preprocessing(dataset):
        
    #pickup_date parsing and creating 2 new features: month and isMorning
    dataset['month'] = pd.DatetimeIndex(dataset['pickup_date']).month_name()
    print(dataset.columns)
    dataset['time'] = pd.to_datetime(dataset['pickup_date']).dt.strftime("%p")
    dataset.loc[dataset['time'] == "AM", 'isMorning'] = 0
    dataset.loc[dataset['time'] == "PM", 'isMorning'] = 1
    
    #create 3d new feature represents how valuable a given cargo
    dataset['cargo_value'] =  ( dataset['rate'] * dataset['valid_miles'] )/ dataset['weight']
    
    #one-hot encoding for categorical features
    features_to_encode = ['transport_type', 'origin_kma', 'destination_kma', 'month']
    for feature in features_to_encode:
        dataset = encode_and_bind(dataset, feature)
        
    #remove useless features
    features_to_drop = ['time', 'pickup_date']
    for feature in features_to_drop:
        dataset = drop_feature(dataset, feature)
        
    #standardization
    dataset = to_standardize(dataset)
    
    return(dataset)
        

In [39]:
def training_and_validation_sets_preparation(training_path, validation_path):
    training_dataset = pd.read_csv(training_path)
    validation_dataset = pd.read_csv(validation_path)
    
    training_dataset = remove_NaN_rows(training_dataset)
    validation_dataset = remove_NaN_rows(validation_dataset)
    
    #variable to store a training dataset size before merging it with validation dataset
    training_size = training_dataset.shape[0]
    
    training_dataset = append_validation_to_training(training_dataset, validation_dataset)
    
    training_dataset = data_preprocessing(training_dataset)
    
    #store prepocessed training set and validation set
    training_dataset.iloc[:training_size,:].to_csv('dataset/preparedTrainingSet.csv', index=False)
    training_dataset.iloc[training_size:,:].to_csv('dataset/preparedValidationSet.csv', index=False)

In [40]:
def accuracy(real_rates, predicted_rates):
    return np.average(abs(predicted_rates / real_rates - 1.0)) * 100.0

In [41]:
def train_and_validate():
    
    training_dataset_path = 'dataset/train.csv'
    validation_dataset_path = 'dataset/validation.csv'
    
    print("training and validation sets were read!")
    
    try:
        training_and_validation_sets_preparation(training_dataset_path, validation_dataset_path)
    except OSError as err:
        print("OS error: {0}".format(err))
    except:
        print("Unexpected error:", sys.exc_info()[0])
        raise
    print("training and validation sets were prepared!")
    
    
    
    df = pd.read_csv('dataset/preparedTrainingSet.csv')
    
    Y = df[['rate']]
    X = drop_feature(df, 'rate')
    
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 
    regressor.fit(X.values, Y.values.ravel())

    df_val = pd.read_csv('dataset/preparedValidationSet.csv')
    
    Y_val = df_val[['rate']]
    X_val = drop_feature(df_val, 'rate')
    predicted_rates = regressor.predict(X_val.values)
    
    predicted_rates = model.predict(df)
    mare = accuracy(Y_val.values.ravel(), predicted_rates)
    mare = np.round(mare, 2)
    return mare

In [42]:
def generate_final_solution():
    # combine train and validation to improve final predictions
    df = pd.read_csv('dataset/train.csv')
    df_val = pd.read_csv('dataset/validation.csv')
    df = df.append(df_val).reset_index(drop=True)

    model = Model()
    model.fit(df, df.rate)

    # generate and save test predictions
    df_test = pd.read_csv('dataset/test.csv')
    df_test['predicted_rate'] = model.predict(df_test)
    df_test.to_csv('dataset/predicted.csv', index=False)


In [43]:
if __name__ == "__main__":
    mare = train_and_validate()
    print("Accuracy of validatTion is %s" %(mare))

    if mare < 13:  # try to reach 13% or less for validation
        generate_final_solution()
        print("'predicted.csv' is generated, please send it to us")

training and validation sets were read!
Index([u'rate', u'valid_miles', u'transport_type', u'weight', u'pickup_date',
       u'origin_kma', u'destination_kma', u'month'],
      dtype='object')
('Unexpected error:', <type 'exceptions.TypeError'>)


TypeError: <type 'bool'> is not convertible to datetime

In [None]:
trainDataSet = pd.read_csv('dataset/train.csv')

In [None]:
validationDataSet = pd.read_csv('dataset/validation.csv')

In [None]:
trainingSize = trainDataSet.shape[0]
validationSize = validationDataSet.shape[0]

In [None]:
trainDataSet = trainDataSet.append(validationDataSet).reset_index(drop=True)
trainDataSet.shape

In [None]:
trainDataSet.shape[0]-validationSize

In [None]:
trainDataSet.head(5)

In [None]:
trainDataSet.shape

In [None]:
trainDataSet.index

In [None]:
trainDataSet.isnull().sum()

In [None]:
trainDataSet = trainDataSet[trainDataSet['weight'].notna()]

# Data pre-processing

## lable encoding

In [None]:
# lable encoding for transport_type column ['VAN', 'REEFER', 'FLATBED'] => [2, 1, 0]
#transportTypeEncoder = preprocessing.LabelEncoder()
#transportTypeEncoder.fit(list(dataSet['transport_type'].unique()))

# lable encoding for origin_kma column
#originKmaEncoder = preprocessing.LabelEncoder()
#originKmaEncoder.fit(list(dataSet['origin_kma'].unique()))

# lable encoding for destination_kma column
#destinationKmaEncoder = preprocessing.LabelEncoder()
#destinationKmaEncoder.fit(list(dataSet['destination_kma'].unique()))

#dataSet['encoded_transport_type'] = transportTypeEncoder.transform(dataSet['transport_type'])
#dataSet['encoded_origin_kma'] = originKmaEncoder.transform(dataSet['origin_kma'])
#dataSet['encoded_destination_kma'] = originKmaEncoder.transform(dataSet['destination_kma'])
trainDataSet['month'] = pd.DatetimeIndex(trainDataSet['pickup_date']).month_name()
trainDataSet['time'] = pd.to_datetime(trainDataSet['pickup_date']).dt.strftime("%p")
trainDataSet.loc[trainDataSet['time'] == "AM", 'isMorning'] = 0
trainDataSet.loc[trainDataSet['time'] == "PM", 'isMorning'] = 1
trainDataSet['cargo_value'] =  ( trainDataSet['rate'] * trainDataSet['valid_miles'] )/ trainDataSet['weight']

In [None]:
features_to_encode = ['transport_type', 'origin_kma', 'destination_kma', 'month']
for feature in features_to_encode:
    trainDataSet = encode_and_bind(trainDataSet, feature)

In [None]:
trainDataSet = trainDataSet.drop(['time'], axis = 1)
trainDataSet = trainDataSet.drop(['pickup_date'], axis = 1)
trainDataSet.head()


In [None]:
finalDataSet = trainDataSet

## normalization standardization

In [None]:
stats.normaltest(finalDataSet.iloc[:,0].values.flatten())

In [None]:
rateDf = finalDataSet[['rate']]
rateDf = rateDf.round({'rate':2})
rateDf.hist(bins = 100, range=[0, 10])

In [None]:
cargo = finalDataSet[['cargo_value']]
cargo = cargo.round({'cargo_value':2})
cargo.hist(bins = 100, range=[0, 0.3])

In [None]:
miles = finalDataSet[['valid_miles']]
miles = miles.round({'valid_miles':2})
miles.hist(bins = 100, range = [0, 1700])

In [None]:
weight = finalDataSet[['weight']]
weight = weight.round({'weight':2})
weight.hist(bins = 100, range=[5000, 45000])

In [None]:
scaler = preprocessing.StandardScaler()
finalDataSet[['cargo_value', 'valid_miles','weight']] = scaler.fit_transform(finalDataSet[['cargo_value', 'valid_miles', 'weight']])
finalDataSet.head()

In [None]:
finalDataSet.iloc[trainingSize-80:,:].to_csv('dataset/preparedValidationSet.csv', index=False)

In [None]:
Y = finalDataSet[['rate']]
X = finalDataSet.drop(['rate'], axis = 1)

In [None]:
cargo = X[['cargo_value']]
cargo = cargo.round({'cargo_value':2})
cargo.hist(bins = 60, range=[-2, 2])

In [None]:
x = X.values
y = Y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [None]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

In [None]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 
regressor.fit(X_train, y_train.ravel())

In [None]:
Y_pred = regressor.predict(X_test)

In [None]:
errors = abs(Y_pred - y_test.ravel())

In [None]:
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

In [None]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.ravel())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
finalDataSet.shape

In [None]:
Y_pred = regressor.predict(x)

## recovering missing data

In [None]:
testDataSet = dataSet[dataSet.isna().any(axis=1)]

In [None]:
trainingDataSet = dataSet[~dataSet.weight.isin(['NaN'])]