In [1]:
import pandas as pd
import numpy as np
from data_stats import summary_stats
from scipy.stats import ks_2samp,f_oneway,kruskal
import scipy.stats as stats

In [2]:
if __name__=="__main__":
    summ_stat=summary_stats()
    
    ## Read csv file
    df=pd.read_csv('../data/nbl_eval.csv')
    print('Shape Of DataFrame is: ',df.shape)
    print("\n Datatype of each object:\n", df.dtypes)
    
    ## Extract Numeric and Categorical data
    df_quan,df_cat=summ_stat.extract_num_cat(df)

Shape Of DataFrame is:  (10286, 7)

 Datatype of each object:
 truckingcompanyid      int64
productid              int64
distanceinmiles      float64
fuelcost             float64
linehaulcost         float64
source                object
destination           object
dtype: object


##### Numeric:

In [3]:
df_quan_total=summ_stat.summary_quan(df_quan)
summ_stat.summary_quan(df_quan)

Unnamed: 0,Count,% Miss.,Card.,Min,1st Qrt.,Mean,Median,3rd Qrt.,Max,Std. Dev.
distanceinmiles,10286.0,0.0,48,97.32,681.24,1616.187544,1459.8,2433.0,4671.36,1109.212772
fuelcost,10286.0,0.0,574,2.15,4.85,5.499473,5.5,6.16,9.55,0.987558
linehaulcost,10286.0,0.0,9766,7.49,150.4025,830.179412,326.135,771.7375,34845.53,1696.541911


#### Categorical:

In [4]:
summ_stat.summary_cat(df_cat)

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
truckingcompanyid,10286,0,6153,5367,8,0.0777756,4835,7.0,0.0680537
productid,10286,0,24,4,464,4.51099,20,461.0,4.48182
source,10286,0,49,Phoenix,242,2.35271,Miami,237.0,2.3041
destination,10286,0,49,New Orleans,262,2.54715,Las Vegas,249.0,2.42077


## Sampling

In [5]:
sample_size=[500,1000,2500,5000,7000]
df_mean_err,df_std_err=summ_stat.calc_diff_orig_sample(df,sample_size,30)

In [6]:
df_mean_err

Unnamed: 0,Orig_Mean,Mean_Err_500,Mean_Err_1000,Mean_Err_2500,Mean_Err_5000,Mean_Err_7000
distanceinmiles,1616.187544,-9.990728,-9.737696,0.011173,1.563751,-2.030415
fuelcost,5.499473,0.00043,0.000583,0.000994,-0.000112,0.000954
linehaulcost,830.179412,-9.02237,7.879758,7.987509,-2.649026,4.500585


In [7]:
df_std_err

Unnamed: 0,Orig_StdDev,Std_Dev_Err500,Std_Dev_Err1000,Std_Dev_Err2500,Std_Dev_Err5000,Std_Dev_Err7000
distanceinmiles,1109.212772,-1.676786,2.596115,3.30437,0.239005,0.279543
fuelcost,0.987558,0.008273,0.001225,0.001186,2.9e-05,7.6e-05
linehaulcost,1696.541911,53.567606,36.875275,14.123808,10.410023,6.639166


### Kolmogorov-Smirnov (K-S) Test:

In [8]:
df_1=df.sample(7000)
ks_2samp(df.distanceinmiles, df_1.distanceinmiles)
ks_2samp(df.fuelcost, df_1.fuelcost)
ks_2samp(df.linehaulcost, df_1.linehaulcost)

KstestResult(statistic=0.006091330796366756, pvalue=0.997605769962016)

In [9]:
def ks_test(df,df_sample,col_name):
    ks_stat=ks_2samp(df[col_name], df_sample[col_name]).statistic
    ks_pval=ks_2samp(df[col_name], df_sample[col_name]).pvalue
    return ks_stat, ks_pval
    

In [10]:
ks_stat=[]
ks_pval=[]
columns=["distanceinmiles","fuelcost","linehaulcost"]
for i in columns:
    ks_stat.append(ks_test(df,df_1,i)[0])
    ks_pval.append(ks_test(df,df_1,i)[1])

In [11]:
ks_test_df=pd.DataFrame([ks_stat,ks_pval],index=["KS-Stat","P-Value"],columns=columns).transpose()

In [12]:
ks_test_df

Unnamed: 0,KS-Stat,P-Value
distanceinmiles,0.003472,1.0
fuelcost,0.004233,0.999999
linehaulcost,0.006091,0.997606


## Split Training and Test Data

In [13]:
train_idx=df.sample(7000).index
test_idx=df[~df.index.isin(train_idx)].index
train_data=df.iloc[train_idx,:]
test_data=df.iloc[test_idx,:]

In [14]:
train_data.to_csv("../data/training_data.csv",index=False)
test_data.to_csv("../data/test_data.csv",index=False)