In [1]:
#score_model.py (2 points)

"""
When this is called using python score_model.py in the command line, this will 

ingest the .pkl random forest file and 
apply the model to the locally saved scoring dataset csv. 

There must be data check steps and clear commenting for each step inside the .py file. 

The output for running this file is a csv file with the predicted score, 
as well as a png or text file output that contains the model accuracy report 
(e.g. sklearn's classification report or any other way of model evaluation).
"""

"\nWhen this is called using python score_model.py in the command line, this will \n\ningest the .pkl random forest file and \napply the model to the locally saved scoring dataset csv. \n\nThere must be data check steps and clear commenting for each step inside the .py file. \n\nThe output for running this file is a csv file with the predicted score, \nas well as a png or text file output that contains the model accuracy report \n(e.g. sklearn's classification report or any other way of model evaluation).\n"

In [2]:
# Packages: every later step depends on these, so a missing dependency must
# stop the run here instead of surfacing later as an unrelated NameError.
try:
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    import joblib
    print("Packages imported correctly.")
except ImportError:
    # Print the friendly hint, then re-raise so the original traceback
    # (which names the missing package) is not lost.
    print("One or more packages did not load correctly.  Please make sure all needed packages are installed.")
    raise

Packages imported correctly.


In [3]:
#####################
#ingest .pkl file
####################

# Read the serialized model from the working directory.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load a model.pkl that comes from a trusted source.
try:
    model = joblib.load('model.pkl')
    print("Model loaded correctly.")
except Exception:
    # Without a model nothing below can run; report and re-raise rather than
    # continuing to a NameError on `model`.
    print("Error: Model failed to load.")
    raise

# Check the model loaded: print its repr (estimator type and hyperparameters).
print(model)

Model loaded correctly.
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=None, min_impurity_split=1e-07,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)




In [4]:
#############################
# apply model to locally saved scoring dataset
#############################

# Bring in the scoring data from the working directory.
try:
    df_test = pd.read_csv('test.csv')
    print("Test data loaded correctly.")
except Exception:
    # Every later cell needs df_test; report and re-raise so the run stops
    # here with the real cause (missing file, parse error, ...).
    print("Test data did NOT load correctly.")
    raise

# Validate data: print the loaded frame for a visual check.
print(df_test)

Test data loaded correctly.
     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0   

In [5]:
# The scoring file is expected to be 418 rows x 11 columns.
# Here we verify the column names and their order; the row and column counts
# are checked separately.
expected_columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
test_columns = df_test.columns.tolist() == expected_columns

In [6]:
# Set the checkData condition: the scoring frame must have the expected row
# count, the expected column count, and the expected column names/order
# (test_columns). Logical `and` is used because these are plain Python
# booleans, not arrays.
checkData = bool(
    len(df_test) == 418
    and len(df_test.columns) == 11
    and test_columns
)

# Print the result. On failure, stop the run: scoring a frame with the wrong
# shape or columns would produce meaningless predictions.
if checkData:
    print("The test data imported correctly.")
else:
    print("Error: the test data did NOT import correctly.")
    raise ValueError("Error: the test data did NOT import correctly.")

The test data imported correctly.


In [7]:
###################################
# data cleaning and imputation - match same as training data
###################################

# Keep only the columns used as model inputs, then split the categorical
# ones (Sex, Embarked) into dummy indicator columns.
try:
    df_test_sub = df_test[['Pclass', 'Age','Parch', 'Fare', 'Sex', 'SibSp', 'Embarked']]
    df_test_d = pd.get_dummies(df_test_sub)
    print("Test data successfully transformed with dummy columns.")
except Exception:
    # A failure here (e.g. a missing column -> KeyError) leaves df_test_d
    # undefined; report and re-raise instead of continuing.
    print("Test data did not get dummy values correctly.  Please review.")
    raise

Test data successfully transformed with dummy columns.


In [8]:
# Print the dummy-encoded test frame so the new indicator columns can be inspected.
print(df_test_d)

     Pclass   Age  Parch      Fare  SibSp  Sex_female  Sex_male  Embarked_C  \
0         3  34.5      0    7.8292      0           0         1           0   
1         3  47.0      0    7.0000      1           1         0           0   
2         2  62.0      0    9.6875      0           0         1           0   
3         3  27.0      0    8.6625      0           0         1           0   
4         3  22.0      1   12.2875      1           1         0           0   
..      ...   ...    ...       ...    ...         ...       ...         ...   
413       3   NaN      0    8.0500      0           0         1           0   
414       1  39.0      0  108.9000      0           1         0           1   
415       3  38.5      0    7.2500      0           0         1           0   
416       3   NaN      0    8.0500      0           0         1           0   
417       3   NaN      1   22.3583      1           0         1           1   

     Embarked_Q  Embarked_S  
0             1      

In [9]:
# Report the number of missing values per column before imputation.
# In this test data, Age has 86 missing values and Fare has 1.
print("Initial Missing Data")
missing_per_column = df_test_d.isna().sum()
print(missing_per_column)

Initial Missing Data
Pclass         0
Age           86
Parch          0
Fare           1
SibSp          0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64


In [10]:
# Missing test values are imputed with medians from the training data,
# so the training file has to be read here as well.
try:
    df_train = pd.read_csv('train.csv')
    print("Train data loaded succesfully for test data imputation.")
except Exception:
    # Imputation cannot proceed without df_train; report and re-raise so the
    # underlying cause is visible.
    print("Train data did NOT load successfully for test data imputation. Please check that data is available, needed packages are installed, and functions are available.")
    raise

Train data loaded succesfully for test data imputation.


In [11]:
# Impute: fill missing test values with the training-set median of the
# column that has the same name.
try:
    # numeric_only=True is required: df_train also holds string columns, and
    # on pandas >= 2.0 median() raises TypeError on them unless they are
    # excluded explicitly.
    train_medians = df_train.median(numeric_only=True)
    # Reassign instead of inplace=True so re-running the cell is harmless.
    df_test_d = df_test_d.fillna(train_medians)
    print("Imputation of missing values successfull.")
except Exception:
    # Report and re-raise: continuing would pass NaNs on to the model.
    print("Imputation of missing values failed.")
    raise

Imputation of missing values successfull.


In [12]:
# Validation: after imputation, no row of df_test_d should still contain a
# missing value.
try:
    rows_with_missing = int(df_test_d.isnull().any(axis=1).sum())
    if rows_with_missing == 0:
        print("No missing values in df_test_d.")
    else:
        print("Error: Missing values still exist in df_test_d.")
except:
    print("Error: check for missing values in df_test_d failed.")

No missing values in df_test_d.


In [13]:
# Feature engineering: record whether the Cabin value is absent as a 0/1
# indicator column (1 = Cabin is null).
cabin_is_missing = df_test['Cabin'].isna()
df_test_d['CabinMissing'] = cabin_is_missing.astype(int)

In [14]:
# Feature engineering: treat passenger class as a category, not a number,
# by adding one 0/1 indicator column per class (Pclass_1, Pclass_2, Pclass_3).
for pclass_level in (1, 2, 3):
    indicator_name = 'Pclass_{}'.format(pclass_level)
    df_test_d[indicator_name] = (df_test['Pclass'] == pclass_level).astype(int)

In [15]:
# Columns passed to the model, grouped by the feature they come from.
# NOTE(review): presumably this order has to match the one used when the
# model was trained - confirm against train_model.py.
data_cols = (
    ['Pclass_1', 'Pclass_2', 'Pclass_3']
    + ['Age', 'Parch', 'Fare', 'SibSp']
    + ['Sex_female', 'Sex_male']
    + ['Embarked_C', 'Embarked_Q', 'Embarked_S']
    + ['CabinMissing']
)

In [16]:
# Restrict the prepared frame to the model's feature columns, in data_cols order.
X = df_test_d.loc[:, data_cols]

In [17]:
# Make the prediction: apply the loaded model to the feature matrix X.
try:
    y_predict = model.predict(X)
    print("Model prediction step successfull.")
except Exception:
    # Report and re-raise: y_predict would otherwise be undefined and the
    # real cause (e.g. a feature mismatch) would be hidden.
    print("Error: model prediction on test data failed.")
    raise

Model prediction step successfull.


In [18]:
# Check the length of the prediction: there must be exactly one prediction
# per row of the scoring data. The expected count is taken from df_test
# itself rather than hard-coded, so the check stays valid for any input size.
expected_predictions = len(df_test)
if len(y_predict) == expected_predictions:
    print("Model prediction output correct number of predictions.")
else:
    print("Model predicted failed to output correct number of predictions.")
    raise ValueError(
        "Expected {} predictions but got {}.".format(expected_predictions, len(y_predict))
    )

Model prediction output correct number of predictions.


In [19]:
##########################################
# output: csv file with predicted score
###########################################

In [20]:
# Work on a copy: plain assignment would only alias df_test, so adding a
# column to `scored` would also add it to df_test and make the earlier
# 11-column validation fail if those cells are run again.
scored = df_test.copy()

In [21]:
# Attach the model's predictions to the scored frame as a new 'Survived_Prediction' column.
scored['Survived_Prediction'] = y_predict

In [22]:
# Validate the assignment: print the scored frame to confirm the prediction column is present.
print(scored)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  \
0      male  34.5      0      0

In [23]:
# Output: write the scored frame (input columns plus the prediction column)
# to scored.csv in the working directory, without the row index.
try:
    scored.to_csv('scored.csv',index=False)
    print("Scored file successfully saved locally.")
except Exception:
    # Report and re-raise so a failed write (permissions, disk, ...) is not
    # mistaken for a completed run.
    print("Scored file did NOT successfully save locally.")
    raise

Scored file successfully saved locally.


In [24]:
################################################################################
# output: png/text file with model accuracy report: sklearn classification report
#############################################################################

# Note: see train_model.py file for the code that supports this.