In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import sys
import typing


region = boto3.Session().region_name
session = sagemaker.Session()
sm = boto3.Session().client(service_name='sagemaker',region_name=region)

#ins_argument = imp.spec_file_from_file_location("instance", "unit_risk_interns/src/preprocess/helpers/instance.py")
#ins = imp.module_from_spec(ins_argument)
sys.path.extend(['../src/preprocess', '../config'])
import helpers.instance as ins
import helpers.s3 as s3_helper
import helpers.utils as ut
import helpers.athena as at

# READ YAML FILES AND STORE RELEVANT PATHS in dictionaries
dataset_cfg = ins.read_config('../config/datasets.yaml') 
config_cfg = ins.read_config('../config/config.yaml')
sql_cfg = ins.read_config('../config/sql.yaml')

input_files = [dataset_cfg['raw']['repair_base']['input'], dataset_cfg['raw']['base_query']['input'],dataset_cfg['raw']['engine_hours']['input'], 
                dataset_cfg['raw']['postsaleissue']['input']]
output_paths = [dataset_cfg['raw']['repair_base']['output_paths'], dataset_cfg['raw']['base_query']['output_paths'], dataset_cfg['raw']['engine_hours']['output_paths'], 
                dataset_cfg['raw']['postsaleissue']['output_paths']]

In [2]:
cleaned_base_query = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/base_query_prepped')
cleaned_repair_base = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/repair_base_cleaned')
join_on = ['unit_num', 'partition_key']

In [3]:
def left_join(cleaned_df1, cleaned_df2, keys):
    combined_df = pd.merge(cleaned_df1, cleaned_df2, how = 'left', left_on = keys, right_on = keys)
    #new_path = 's3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/base_prepped_joined'
    return combined_df

In [4]:
base_prepped_joined_prepared = left_join(cleaned_base_query, cleaned_repair_base, join_on)
base_prepped_joined_prepared.head()

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,flag_datatype,miles,labor_hours,parts_cost,labor_cost,outside_cost,major_pm,big_repair,accidentsinci,total_repairs
0,100001,201801,44,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,46108,1.0,0.0,49.06,0.0,1.0,0.0,0.0,0.0
1,100001,201802,45,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,46108,0.0,0.0,0.0,13.37,0.0,0.0,0.0,0.0
2,100001,201803,46,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,48116,1.6,7.41,74.82,0.0,1.0,0.0,0.0,0.0
3,100001,201804,47,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,48116,0.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0
4,100001,201805,48,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,50347,2.5,0.0,118.35,0.0,1.0,0.0,0.0,0.0


In [5]:
replacing = ['labor_hours', 'parts_cost', 'labor_cost', 'outside_cost', 'major_pm', 'big_repair', 'accidentsinci', 'total_repairs']



def removing_null(df, column_list):
    '''
    Notes: Using specified columns, replace all empty cells with 0
    '''
    for column in column_list:
        df[column].fillna(0, inplace=True)
    return df

In [6]:
removing_null(base_prepped_joined_prepared, replacing).head()

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,flag_datatype,miles,labor_hours,parts_cost,labor_cost,outside_cost,major_pm,big_repair,accidentsinci,total_repairs
0,100001,201801,44,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,46108,1.0,0.0,49.06,0.0,1.0,0.0,0.0,0.0
1,100001,201802,45,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,46108,0.0,0.0,0.0,13.37,0.0,0.0,0.0,0.0
2,100001,201803,46,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,48116,1.6,7.41,74.82,0.0,1.0,0.0,0.0,0.0
3,100001,201804,47,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,48116,0.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0
4,100001,201805,48,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,model_build,50347,2.5,0.0,118.35,0.0,1.0,0.0,0.0,0.0


In [7]:
base_prepped_joined_prepared['parts_cost'].head()

0    0.00
1    0.00
2    7.41
3    0.00
4    0.00
Name: parts_cost, dtype: float64

In [8]:
cleaned_roadcalls = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/roadcalls_prepped') 
base_repair_roadcall = left_join(base_prepped_joined_prepared, cleaned_roadcalls, join_on)
base_repair_roadcall.head()

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,outside_cost,major_pm,big_repair,accidentsinci,total_repairs,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt
0,100001,201801,44,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,1.0,0.0,0.0,0.0,,,,,
1,100001,201802,45,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,13.37,0.0,0.0,0.0,0.0,,,,,
2,100001,201803,46,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,1.0,0.0,0.0,0.0,,,,,
3,100001,201804,47,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,28.0,0.0,0.0,0.0,0.0,,,,,
4,100001,201805,48,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,1.0,0.0,0.0,0.0,,,,,


In [9]:
join = ['unit_num']
#not totally done
cleaned_job_oil = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/job_oil_cleaned')
base_repair_roadcall_oil = left_join(base_repair_roadcall, cleaned_job_oil, join)
base_repair_roadcall_oil.head()

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,major_pm,big_repair,accidentsinci,total_repairs,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt,critical_oil_sample_percentage
0,100001,201801,44,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,,,,,,
1,100001,201802,45,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,0.0,0.0,0.0,,,,,,
2,100001,201803,46,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,,,,,,
3,100001,201804,47,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,0.0,0.0,0.0,,,,,,
4,100001,201805,48,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,,,,,,


In [10]:
cleaned_job_oil = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/job_oil_cleaned')
base_repair_roadcall_oil_prep = left_join(base_repair_roadcall, cleaned_job_oil, join)
#base_repair_roadcall_oil_prep.head()

In [11]:
replacing_cont = ['partition_key','road_call_cnt', 'downtime', 'handle_time', 'customer_rebilled_cnt', 'po_amt']
base_repair_roadcall_oil_prep = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/base_repair_roadcall_oil_prep')

In [12]:
removing_null(base_repair_roadcall_oil_prep, replacing_cont)

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,major_pm,big_repair,accidentsinci,total_repairs,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt,critical_oil_sample_percentage
0,100001,201801,44,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,100001,201802,45,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,100001,201803,46,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,100001,201804,47,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,100001,201805,48,0832-WESTERN REGION,0642-MOUNTAIN,0756-10-SOUTH LAS VEGAS,0756-10-SOUTH LAS VEGAS,GMC,G33705,2014,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903185,ZSS63809,202101,49,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,HIN,268,2017,...,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,
4903186,ZSS63809,202102,50,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,HIN,268,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4903187,ZSS63809,202103,51,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,HIN,268,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4903188,ZSS63809,202104,52,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,HIN,268,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [13]:
base_repair_roadcall_oil_prep['partition_key']

0          201801
1          201802
2          201803
3          201804
4          201805
            ...  
4903185    202101
4903186    202102
4903187    202103
4903188    202104
4903189    202105
Name: partition_key, Length: 4903190, dtype: int64

In [14]:
base_repair_roadcall_oil_prep = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/base_repair_roadcall_oil_prep').sample(n=100000)

In [15]:
cleaned_pm = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/pm_combined_prepped')
base_repair_roadcall_oil_prep_joined = left_join(base_repair_roadcall_oil_prep, cleaned_pm, join)
base_repair_roadcall_oil_prep_joined

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,big_repair,accidentsinci,total_repairs,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt,critical_oil_sample_percentage,pm_early_ontime_percent
0,91602064,202001,42,0007-T1W,0997-TRUCK ONE WAY,0723-10-ONE WAY,0723-10-ONE WAY,GMC,G33903,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,71.428571
1,690125,202005,75,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6335-10-USED TRUCK STAGING,6335-10-USED TRUCK STAGING,FTL,M2,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.384615,33.333333
2,AL4499,201809,141,0894-SOUTH CENTRAL REGION,0612-CENTRAL,7110-10-DES MOINES,7110-10-DES MOINES,GMC,C6C042,2007,...,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.000000,12.500000
3,9267675,201911,30,0007-T1W,0997-TRUCK ONE WAY,0723-10-ONE WAY,0723-10-ONE WAY,IHC,4300,2018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,
4,8507086,202003,117,0832-WESTERN REGION,0610-NORTHWEST,0522-10-GOLDEN GATE,0522-10-GOLDEN GATE,FTL,X11364ST,2011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,50.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,ABF82271,202109,207,0835-NORTH CENTRAL REGION,0608-MIDWEST,0078-10-CINCINNATI,7611-10-ABF DAYTON,GDT,ALUMVAN,2005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
99996,D2701409,201903,153,0835-NORTH CENTRAL REGION,0608-MIDWEST,5811-10-ELKHART,5811-10-ELKHART,GDT,ALUMVAN,2007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,100.000000
99997,8504744,202106,60,0833-SOUTHEAST REGION,0607-FLORIDA,0727-10-TAMPA PALM RIVER,0356-10-SHADOWLAWN,UTL,ALUMVAN,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,80.000000
99998,673433,201801,57,0834-NORTHEAST REGION,0611-METRO NEW YORK,0449-10-NORTH BERGEN,6550-10-ELMSFORD,ISU,NQR,2013,...,0.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,52.941176,81.250000


In [16]:
print(type(cleaned_pm))
print(type(base_repair_roadcall_oil_prep))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [17]:
cleaned_bill_date = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/bill_date_prepped')
base_repair_roadcall_oil_pm_bill = left_join(base_repair_roadcall_oil_prep_joined, cleaned_bill_date, join)
base_repair_roadcall_oil_pm_bill

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,total_repairs,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt,critical_oil_sample_percentage,pm_early_ontime_percent,last_billed_date,billed_pk
0,91602064,202001,42,0007-T1W,0997-TRUCK ONE WAY,0723-10-ONE WAY,0723-10-ONE WAY,GMC,G33903,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,,71.428571,2019-11-08,201911.0
1,690125,202005,75,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6335-10-USED TRUCK STAGING,6335-10-USED TRUCK STAGING,FTL,M2,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,15.384615,33.333333,2019-08-25,201908.0
2,AL4499,201809,141,0894-SOUTH CENTRAL REGION,0612-CENTRAL,7110-10-DES MOINES,7110-10-DES MOINES,GMC,C6C042,2007,...,6.0,0.0,0.0,0.0,0.0,0.0,0.000000,12.500000,NaT,
3,9267675,201911,30,0007-T1W,0997-TRUCK ONE WAY,0723-10-ONE WAY,0723-10-ONE WAY,IHC,4300,2018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,,2022-02-11,202202.0
4,8507086,202003,117,0832-WESTERN REGION,0610-NORTHWEST,0522-10-GOLDEN GATE,0522-10-GOLDEN GATE,FTL,X11364ST,2011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,50.000000,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,ABF82271,202109,207,0835-NORTH CENTRAL REGION,0608-MIDWEST,0078-10-CINCINNATI,7611-10-ABF DAYTON,GDT,ALUMVAN,2005,...,0.0,0.0,0.0,0.0,0.0,0.0,,,NaT,
99996,D2701409,201903,153,0835-NORTH CENTRAL REGION,0608-MIDWEST,5811-10-ELKHART,5811-10-ELKHART,GDT,ALUMVAN,2007,...,0.0,0.0,0.0,0.0,0.0,0.0,,100.000000,2019-12-27,201912.0
99997,8504744,202106,60,0833-SOUTHEAST REGION,0607-FLORIDA,0727-10-TAMPA PALM RIVER,0356-10-SHADOWLAWN,UTL,ALUMVAN,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,,80.000000,NaT,
99998,673433,201801,57,0834-NORTHEAST REGION,0611-METRO NEW YORK,0449-10-NORTH BERGEN,6550-10-ELMSFORD,ISU,NQR,2013,...,9.0,0.0,0.0,0.0,0.0,0.0,52.941176,81.250000,2019-04-25,201904.0


In [18]:
df= base_repair_roadcall_oil_pm_bill

In [19]:
print(df['partition_key'])

0        202001
1        202005
2        201809
3        201911
4        202003
          ...  
99995    202109
99996    201903
99997    202106
99998    201801
99999    202004
Name: partition_key, Length: 100000, dtype: int64


In [20]:
#combined
def editing_final_data(file):
    df_current = file
    df_current['unit_sold_pk'] = df_current['unit_sold_date'].dt.year *100 + df_current['unit_sold_date'].dt.month
    df_current[df_current['partition_key']<=df_current['unit_sold_pk']]
    df_current['billed_pk'].fillna(0, inplace=True)
    df_current.sort_values(by=['unit_num', 'partition_key'], ascending=(False, False), inplace=True)

    # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
    '''
    - Convert partition_key to date
    - Get timelapse between last record in EOM and current record, for grouping purpose in final query
    '''
    df_current['partition_date'] = pd.to_datetime(df_current['partition_key'], format='%Y%m')
    df_current['last_date'] = df_current.groupby('unit_num')['partition_date'].transform('first')
    df_current['time_lapse_month'] = np.round((df_current['last_date'] - df_current['partition_date'])/np.timedelta64(1,'M'))

    # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
    '''
    Dropping unnecesary columns and filling categorical null values in columns with mode of columns
    '''
    df_current.drop(columns=['last_billed_date', 'last_date', 'partition_date'], inplace=True)
    
    mode_val_fill = ['unit_size', 'unit_fuel_type']
    for col in mode_val_fill:
        df_current[col].fillna(df_current[col].mode()[0],inplace=True)
    combined_df = df_current 
    return combined_df

In [21]:
prefinal_data = editing_final_data(df)
prefinal_data

Unnamed: 0,unit_num,partition_key,vehicle_age_total_months,region,area,district,location,unit_make_code,unit_model,unit_model_year,...,road_call_cnt,downtime,handle_time,customer_rebilled_cnt,po_amt,critical_oil_sample_percentage,pm_early_ontime_percent,billed_pk,unit_sold_pk,time_lapse_month
47361,ZSS63809,202101,49,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,HIN,268,2017,...,0.0,0.0,0.0,0.0,0.0,,,0.0,202102,0.0
36849,ZSC60581,202202,62,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,FOR,F350,2017,...,0.0,0.0,0.0,0.0,0.0,,,0.0,202204,0.0
63444,ZSC60581,202004,40,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,FOR,F350,2017,...,0.0,0.0,0.0,0.0,0.0,,,0.0,202204,22.0
85167,ZSC25977,202204,4,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,FOR,F350,2022,...,0.0,0.0,0.0,0.0,0.0,,,0.0,202204,0.0
57374,ZSB70901,202204,88,0006-CORPORATE,0999-ADMINISTRATION AGGREGATE,6990-10-PENSKE FLEET MANAGEMENT,6990-10-PENSKE FLEET MANAGEMENT,FOR,F250,2015,...,0.0,0.0,0.0,0.0,0.0,,,0.0,202203,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22896,100016,201807,52,0835-NORTH CENTRAL REGION,0608-MIDWEST,0470-10-LOUISVILLE EAST,0456-10-LOUISVILLE,FTL,X12564ST,2015,...,0.0,0.0,0.0,0.0,0.0,7.692308,67.857143,202202.0,202203,0.0
7387,100008,201904,61,0832-WESTERN REGION,0610-NORTHWEST,0522-10-GOLDEN GATE,0691-10-SANTA ROSA,FTL,X12564ST,2015,...,0.0,0.0,0.0,0.0,0.0,7.692308,65.384615,202012.0,202101,0.0
65272,100008,201803,48,0832-WESTERN REGION,0610-NORTHWEST,0522-10-GOLDEN GATE,0691-10-SANTA ROSA,FTL,X12564ST,2015,...,0.0,0.0,0.0,0.0,0.0,7.692308,65.384615,202012.0,202101,13.0
57390,100006,201906,64,0832-WESTERN REGION,0610-NORTHWEST,0522-10-GOLDEN GATE,0685-10-SAN FRANCISCO,FTL,X12564ST,2015,...,0.0,0.0,0.0,0.0,0.0,0.000000,66.666667,202004.0,202006,0.0


In [22]:
%pip install pandasql
from pandasql import sqldf

def finalized_data(file):
    df_current = file
    df_current['total_cost'] = df_current['parts_cost'] + df_current['labor_cost'] + df_current['outside_cost']

    # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
    merged = sqldf('''select 
        unit_num, 
        sum(case when time_lapse_month between 1 and 24 then total_cost else 0 end) as Tot_cost_24,
        sum(case when time_lapse_month between 1 and 12 then total_cost else 0 end) as Tot_cost_12,
        sum(case when time_lapse_month between 1 and 6 then total_cost else 0 end) as Tot_cost_6,
        sum(case when time_lapse_month between 1 and 3 then total_cost else 0 end) as Tot_cost_3,
        avg(case when time_lapse_month between 1 and 24 then total_cost else 0 end) as AvgT_cost_24,
        avg(case when time_lapse_month between 1 and 12 then total_cost else 0 end) as AvgT_cost_12,
        avg(case when time_lapse_month between 1 and 6 then total_cost else 0 end) as AvgT_cost_6,
        avg(case when time_lapse_month between 1 and 3 then total_cost else 0 end) as AvgT_cost_3,
        sum(case when time_lapse_month between 1 and 12 then outside_cost else 0 end) as Outside_cost_12,
        sum(case when time_lapse_month between 1 and 12 then handle_time else 0 end ) as total_time_call_12,
        sum(case when time_lapse_month between 1 and 12 then road_call_cnt else 0 end ) as total_call_12,
        sum(case when time_lapse_month between 1 and 12 then po_amt else 0 end ) as total_po_sum_12,
        sum(case when time_lapse_month between 1 and 12 then customer_rebilled_cnt else 0 end ) as time_customer_billed_12,

        sum(total_cost) as LTD_Total_cost,
        sum(labor_cost) as LTD_Labour_cost,
        sum(outside_cost) as LTD_Outside_cost,
        sum(parts_cost) as LTD_Parts_cost,
        sum(big_repair) as LTD_Big_repair,
        sum(major_pm) as LTD_Major_PM,
        sum(road_call_cnt) as LTD_Road_calls,
        sum(handle_time) as LTD_Handle_time,
        sum(total_repairs) as LTD_TOTAL_REPAIRS,
        sum(customer_rebilled_cnt) as LTD_customer_rebilled,
        sum(po_amt) as LTD_PO_Amount,
        sum(LABOR_HOURS) as LTD_Labour_hours,
        sum(accidentsinci) as LTD_Accients_incidents,
        avg(pm_early_ontime_percent) as avg_pm_ontime_percent,
        avg(critical_oil_sample_percentage) as avg_critical_oil_sample_percent,

        (sum(case when time_lapse_month between 0 and 6 then total_cost else 0 end) - sum(case when time_lapse_month between 7 and 12 then total_cost else 0 end) ) diff_prev6,
        (sum(case when time_lapse_month between 0 and 12 then total_cost else 0 end) - sum(case when time_lapse_month between 13 and 24 then total_cost else 0 end) ) diff_prev12,
        (sum(case when time_lapse_month between 0 and 3 then total_cost else 0 end) - sum(case when time_lapse_month between 4 and 6 then total_cost else 0 end) ) diff_prev3,
        avg(case when time_lapse_month == 0 then miles end) - avg(case when time_lapse_month == 6 then miles end) as milesdiff_1_6,
        (avg(case when time_lapse_month == 0 then total_cost end) - avg(case when time_lapse_month == 6 then total_cost end))/(avg(case when time_lapse_month == 0 then MILES end) - avg(case when time_lapse_month == 6 then MILES end)) as costPermile_1diff6,
        -- avg(case when time_lapse_month == 0 then ENGINE_HRS end) - avg(case when final_rank == 6 then ENGINE_HRS end) as Enginehrs_diff_1_6,
        (sum(case when time_lapse_month between 0 and 6 then total_repairs else 0 end) - sum(case when time_lapse_month between 7 and 12 then total_repairs else 0 end) ) Repairs_diff_prev6,
        -- avg( case when time_lapse_month == 0 then miles end)/avg(case when time_lapse_month == 0 then ENGINE_HRS end) as  cycle_time,

        -- (avg( case when time_lapse_month == 0 then miles end)/avg(case when time_lapse_month == 0 then engine_hrs end) - avg( case when time_lapse_month == 7 then miles end)/avg(case when time_lapse_month == 7 then engine_hrs end) ) as duty_cycle_change6months,
        avg(case when time_lapse_month == 0 then miles end) as Curr_miles,
        -- avg(case when time_lapse_month == 0 then engine_hrs end ) as curr_engine_hrs,
        avg(case when time_lapse_month == 0 then vehicle_age_total_months end ) as cur_vintage,

        sum(case when time_lapse_month between 0 and 24 then major_pm end) as MajorPM_24_sum,
        count(distinct location) as dloc_nt, 
        count(distinct region ) as dregion_cnt, 
        count(distinct area) as darea_cnt ,

        max(case when time_lapse_month = 0 then location end ) as LOCATION,
        max(case when time_lapse_month = 0 then area end ) as AREA,
        max(case when time_lapse_month = 0 then region end) as REGION,
        max(case when time_lapse_month = 0 then district end) as DISTRICT,
        max(case when time_lapse_month = 0 then unit_category end) as UNIT_CATEGORY,
        max(case when time_lapse_month = 0 then unit_size end) as UNIT_SIZE,
        max(case when time_lapse_month = 0 then product_line end) as PRODUCT_LINE,
        max(case when time_lapse_month = 0 then unit_make_code end) as UNIT_MAKE_CODE,
        max(case when time_lapse_month = 0 then unit_model_year end) as UNIT_MODEL_YEAR,
        max(case when time_lapse_month = 0 then unit_fuel_type end ) as UNIT_FUEL_TYPE,
        max(case when time_lapse_month = 0 then unit_body_type end ) as UNIT_BODY_TYPE,
        max(unit_sold_date) as UNIT_SOLD_DATE
    from 
        df_current
    group by unit_num
    ''')

    # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
    finalized_data_df = merged # For this sample code, simply copy input to output
    return finalized_data_df

[0mNote: you may need to restart the kernel to use updated packages.


In [23]:
final_data = finalized_data(prefinal_data)
final_data

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,REGION,DISTRICT,UNIT_CATEGORY,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE
0,100001,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0832-WESTERN REGION,0756-10-SOUTH LAS VEGAS,TRUCK,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16 00:00:00.000000
1,100006,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0832-WESTERN REGION,0522-10-GOLDEN GATE,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26 00:00:00.000000
2,100008,880.68,0.0,0.0,0.0,440.34,0.0,0.0,0.0,0.0,...,0832-WESTERN REGION,0522-10-GOLDEN GATE,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2021-01-29 00:00:00.000000
3,100016,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0835-NORTH CENTRAL REGION,0470-10-LOUISVILLE EAST,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2022-03-22 00:00:00.000000
4,100021,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0006-CORPORATE,6335-10-USED TRUCK STAGING,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2021-04-28 00:00:00.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72671,ZSA96462,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0006-CORPORATE,6990-10-PENSKE FLEET MANAGEMENT,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-04-25 00:00:00.000000
72672,ZSB70901,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0006-CORPORATE,6990-10-PENSKE FLEET MANAGEMENT,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-03-09 00:00:00.000000
72673,ZSC25977,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0006-CORPORATE,6990-10-PENSKE FLEET MANAGEMENT,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2022,GAS,1900 01,2022-04-25 00:00:00.000000
72674,ZSC60581,98.80,0.0,0.0,0.0,49.40,0.0,0.0,0.0,0.0,...,0006-CORPORATE,6990-10-PENSKE FLEET MANAGEMENT,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2017,DIESEL,1300 24,2022-04-25 00:00:00.000000


In [24]:
cleaned_psi = s3_helper.read_parquet_from_path('s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/postsaleissue_cleaned')
with_target = left_join(final_data, cleaned_psi, join)
with_target

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_CATEGORY,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback
0,100001,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16 00:00:00.000000,,
1,100006,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26 00:00:00.000000,,
2,100008,880.68,0.0,0.0,0.0,440.34,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2021-01-29 00:00:00.000000,,
3,100016,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2022-03-22 00:00:00.000000,2022-03-22 00:00:00.000,1.0
4,100021,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2021-04-28 00:00:00.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72671,ZSA96462,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-04-25 00:00:00.000000,,
72672,ZSB70901,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-03-09 00:00:00.000000,,
72673,ZSC25977,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2022,GAS,1900 01,2022-04-25 00:00:00.000000,,
72674,ZSC60581,98.80,0.0,0.0,0.0,49.40,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2017,DIESEL,1300 24,2022-04-25 00:00:00.000000,,


In [25]:
dataset = with_target.query("UNIT_CATEGORY in ('TRUCK', 'TRACTOR', 'TRAILER') ")
dataset

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_CATEGORY,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback
0,100001,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16 00:00:00.000000,,
1,100006,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26 00:00:00.000000,,
2,100008,880.68,0.0,0.0,0.0,440.34,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2021-01-29 00:00:00.000000,,
3,100016,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2022-03-22 00:00:00.000000,2022-03-22 00:00:00.000,1.0
4,100021,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2021-04-28 00:00:00.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72671,ZSA96462,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-04-25 00:00:00.000000,,
72672,ZSB70901,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-03-09 00:00:00.000000,,
72673,ZSC25977,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2022,GAS,1900 01,2022-04-25 00:00:00.000000,,
72674,ZSC60581,98.80,0.0,0.0,0.0,49.40,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2017,DIESEL,1300 24,2022-04-25 00:00:00.000000,,


In [26]:
empty_column = ['flag_cameback']
dataset_no_nulls = removing_null(dataset, empty_column)
dataset_no_nulls

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_CATEGORY,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback
0,100001,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16 00:00:00.000000,,0.0
1,100006,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26 00:00:00.000000,,0.0
2,100008,880.68,0.0,0.0,0.0,440.34,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2021-01-29 00:00:00.000000,,0.0
3,100016,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2022-03-22 00:00:00.000000,2022-03-22 00:00:00.000,1.0
4,100021,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRACTOR,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2021-04-28 00:00:00.000000,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72671,ZSA96462,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-04-25 00:00:00.000000,,0.0
72672,ZSB70901,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2015,DIESEL,1300 24,2022-03-09 00:00:00.000000,,0.0
72673,ZSC25977,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2022,GAS,1900 01,2022-04-25 00:00:00.000000,,0.0
72674,ZSC60581,98.80,0.0,0.0,0.0,49.40,0.0,0.0,0.0,0.0,...,TRUCK,LIGHT,CONTRACT MAINTENANCE - PEG,FOR,2017,DIESEL,1300 24,2022-04-25 00:00:00.000000,,0.0


In [27]:
# pd.to_datetime(dataset_no_nulls['UNIT_SOLD_DATE'], format='%Y-%m-%d %H:%M:%S.%f')
dataset_no_nulls['UNIT_SOLD_DATE'] = pd.to_datetime(dataset_no_nulls['UNIT_SOLD_DATE'], format='%Y-%m-%d')

In [28]:
dataset_no_nulls['UNIT_SOLD_DATE']

0       2019-09-16
1       2020-06-26
2       2021-01-29
3       2022-03-22
4       2021-04-28
           ...    
72671   2022-04-25
72672   2022-03-09
72673   2022-04-25
72674   2022-04-25
72675   2021-02-17
Name: UNIT_SOLD_DATE, Length: 72676, dtype: datetime64[ns]

In [29]:
ranges = pd.date_range('2019-01-01', '2021-01-01')
dataset_no_nulls['unit_sold_flag'] = np.where(dataset_no_nulls['UNIT_SOLD_DATE'].isin(ranges), 1,0)

In [30]:
dataset_no_nulls['unit_sold_flag']

0        1
1        1
2        0
3        0
4        0
        ..
72671    0
72672    0
72673    0
72674    0
72675    0
Name: unit_sold_flag, Length: 72676, dtype: int64

In [31]:
len(dataset_no_nulls['unit_sold_flag'])

72676

In [32]:
dataset_no_nulls['unit_sold_flag']

0        1
1        1
2        0
3        0
4        0
        ..
72671    0
72672    0
72673    0
72674    0
72675    0
Name: unit_sold_flag, Length: 72676, dtype: int64

In [33]:
dataset_no_nulls[dataset_no_nulls['UNIT_SOLD_DATE'].isin(ranges)]

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback,unit_sold_flag
0,100001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16,,0.0,1
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26,,0.0,1
5,100027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-10-05,2020-10-05 00:00:00.000,1.0,1
7,100038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2019-09-24,,0.0,1
8,100041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-04-29,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72571,Z2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2020-03-01,,0.0,1
72572,Z2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-12-01,,0.0,1
72573,Z2057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2007,DIESEL,4002 13,2019-11-14,,0.0,1
72574,Z2058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-11-22,,0.0,1


In [34]:
dataset_set_target = dataset_no_nulls[dataset_no_nulls['UNIT_SOLD_DATE'].isin(ranges)]
dataset_set_target

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback,unit_sold_flag
0,100001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16,,0.0,1
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26,,0.0,1
5,100027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-10-05,2020-10-05 00:00:00.000,1.0,1
7,100038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2019-09-24,,0.0,1
8,100041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-04-29,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72571,Z2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2020-03-01,,0.0,1
72572,Z2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-12-01,,0.0,1
72573,Z2057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2007,DIESEL,4002 13,2019-11-14,,0.0,1
72574,Z2058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-11-22,,0.0,1


In [35]:
dataset_set_target.loc[:, 'UNIT_SOLD_DATE'] = pd.to_datetime(dataset_set_target.loc[:,'UNIT_SOLD_DATE'], format='%Y-%m-%d')
#dataset_set_target['UNIT_SOLD_DATE'] = pd.to_datetime(dataset_set_target['UNIT_SOLD_DATE'], format='%Y-%m-%d')
new_ranges = pd.date_range('2021-09-01', '2021-12-31')
dataset_set_target['outvalidation_flag'] = np.where(dataset_set_target['UNIT_SOLD_DATE'].isin(new_ranges), 1,0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [36]:
dataset_set_target['outvalidation_flag']

0        0
1        0
5        0
7        0
8        0
        ..
72571    0
72572    0
72573    0
72574    0
72579    0
Name: outvalidation_flag, Length: 40846, dtype: int64

In [38]:
#del dataset_set_target['new_unit_sold_flag']

In [40]:
final_dataset_set_target = dataset_set_target
final_dataset_set_target

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback,unit_sold_flag,outvalidation_flag
0,100001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,GMC,2014,GAS,2000 06,2019-09-16,,0.0,1,0
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26,,0.0,1,0
5,100027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2020-10-05,2020-10-05 00:00:00.000,1.0,1,0
7,100038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2019-09-24,,0.0,1,0
8,100041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2020-04-29,,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72571,Z2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2020-03-01,,0.0,1,0
72572,Z2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-12-01,,0.0,1,0
72573,Z2057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2007,DIESEL,4002 13,2019-11-14,,0.0,1,0
72574,Z2058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-11-22,,0.0,1,0


In [41]:
model_train = final_dataset_set_target.loc[final_dataset_set_target['outvalidation_flag'] != 1]
model_train

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback,unit_sold_flag,outvalidation_flag
0,100001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,GMC,2014,GAS,2000 06,2019-09-16,,0.0,1,0
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26,,0.0,1,0
5,100027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2020-10-05,2020-10-05 00:00:00.000,1.0,1,0
7,100038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2019-09-24,,0.0,1,0
8,100041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LEASE,FTL,2015,DIESEL,4002 13,2020-04-29,,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72571,Z2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2020-03-01,,0.0,1,0
72572,Z2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-12-01,,0.0,1,0
72573,Z2057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2007,DIESEL,4002 13,2019-11-14,,0.0,1,0
72574,Z2058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-11-22,,0.0,1,0


In [42]:
del model_train['outvalidation_flag']

In [43]:
train = model_train
train

Unnamed: 0,unit_num,Tot_cost_24,Tot_cost_12,Tot_cost_6,Tot_cost_3,AvgT_cost_24,AvgT_cost_12,AvgT_cost_6,AvgT_cost_3,Outside_cost_12,...,UNIT_SIZE,PRODUCT_LINE,UNIT_MAKE_CODE,UNIT_MODEL_YEAR,UNIT_FUEL_TYPE,UNIT_BODY_TYPE,UNIT_SOLD_DATE,unit_sold_date,flag_cameback,unit_sold_flag
0,100001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LIGHT,LEASE,GMC,2014,GAS,2000 06,2019-09-16,,0.0,1
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,COMMERCIAL RENTAL,FTL,2015,DIESEL,4002 13,2020-06-26,,0.0,1
5,100027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-10-05,2020-10-05 00:00:00.000,1.0,1
7,100038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2019-09-24,,0.0,1
8,100041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,LEASE,FTL,2015,DIESEL,4002 13,2020-04-29,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72571,Z2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2020-03-01,,0.0,1
72572,Z2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-12-01,,0.0,1
72573,Z2057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2007,DIESEL,4002 13,2019-11-14,,0.0,1
72574,Z2058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,HEAVY,CONTRACT MAINTENANCE - PEG,FTL,2006,DIESEL,4002 13,2019-11-22,,0.0,1


In [44]:
train.to_csv('trained_data')