In [87]:
import pandas as pd
import warnings
# NOTE(review): no category or module filter is given, so this suppresses
# every warning raised after this cell runs (including pandas deprecation
# notices). Consider narrowing it to the specific warning being hidden.
warnings.filterwarnings('ignore') 

In [88]:
# Read the train and test splits; the `datetime` column is parsed into
# timestamps at load time so the `.dt` accessor can be used on it later.

traindf = pd.read_csv('./dataset/train.csv', sep=',', parse_dates=['datetime'])
testdf = pd.read_csv('./dataset/test.csv', sep=',', parse_dates=['datetime'])
testdf.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [89]:
# datetime transformation

def datetransform(df):
    """Expand the ``datetime`` column of ``df`` into calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime-typed ``datetime`` column (the ``.dt``
        accessor is applied to it).

    Returns
    -------
    pandas.DataFrame
        A new frame whose leading columns are ``year``, ``month``,
        ``dayofweek`` and ``hour``, followed by every column of ``df``
        except ``datetime``. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``datetime`` column.
    """
    stamps = df['datetime']
    dtime_df = pd.DataFrame({
        'year': stamps.dt.year,
        'month': stamps.dt.month,
        'dayofweek': stamps.dt.dayofweek,
        'hour': stamps.dt.hour,
    })

    # drop() without inplace=True returns a new frame, so the caller's
    # DataFrame keeps its `datetime` column instead of being silently altered.
    remaining = df.drop(columns='datetime')

    # Both frames share df's index, so the join lines the rows up one-to-one.
    return dtime_df.join(remaining)

# Swap the raw timestamp column of the test split for its calendar features.
testdf = testdf.pipe(datetransform)
testdf.head()

Unnamed: 0,year,month,dayofweek,hour,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011,1,3,0,1,0,1,1,10.66,11.365,56,26.0027
1,2011,1,3,1,1,0,1,1,10.66,13.635,56,0.0
2,2011,1,3,2,1,0,1,1,10.66,13.635,56,0.0
3,2011,1,3,3,1,0,1,1,10.66,12.88,56,11.0014
4,2011,1,3,4,1,0,1,1,10.66,12.88,56,11.0014


In [90]:
# Translate the integer codes of the calendar and weather columns into
# human-readable labels. Codes missing from a mapping become NaN.

months = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))
weekdays = dict(enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']))
seasons = dict(enumerate(['spring', 'summer', 'fall', 'winter'], start=1))
weather = dict(enumerate(['clear', 'cloudy', 'light rain', 'snowy'], start=1))

for column, labels in (
    ('month', months),
    ('dayofweek', weekdays),
    ('season', seasons),
    ('weather', weather),
):
    testdf[column] = testdf[column].map(labels)

In [91]:
# checking each column for missing values (True means at least one null)

testdf.isna().any()

year          False
month         False
dayofweek     False
hour          False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity      False
windspeed     False
dtype: bool

In [92]:
# Continuous features of the test data set that get min-max normalized below.

contcols = [
    'temp',
    'atemp',
    'humidity',
    'windspeed',
]

def normalize(ttdf, trdf, cols):
    """Min-max scale ``cols`` of ``ttdf`` with the ranges observed in ``trdf``.

    Each listed column is transformed as ``(x - min) / (max - min)`` where
    ``min`` and ``max`` are taken from the same column of ``trdf``, so values
    of ``ttdf`` outside that range land outside ``[0, 1]``.

    Parameters
    ----------
    ttdf : pandas.DataFrame
        Frame whose columns are scaled.
    trdf : pandas.DataFrame
        Reference frame that supplies the per-column minimum and maximum.
    cols : iterable of str
        Names of the columns to scale; each must exist in both frames.

    Returns
    -------
    pandas.DataFrame
        A copy of ``ttdf`` with the listed columns scaled. ``ttdf`` itself
        is not modified.
    """
    # Work on a copy so the caller's frame is not altered behind its back.
    scaled = ttdf.copy()

    for feature in cols:
        minval, maxval = trdf[feature].min(), trdf[feature].max()
        span = maxval - minval
        # A constant reference column has a zero range; dividing by it would
        # produce NaN/inf. Fall back to a unit range (the column is then only
        # shifted by `minval`).
        if span == 0:
            span = 1
        scaled[feature] = (ttdf[feature] - minval) / span

    return scaled

# Scale the continuous test columns with the min/max taken from the training split.
testdf = normalize(ttdf=testdf, trdf=traindf, cols=contcols)
testdf.head()

Unnamed: 0,year,month,dayofweek,hour,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011,Jan,Thu,0,spring,0,1,clear,0.244898,0.237275,0.56,0.456213
1,2011,Jan,Thu,1,spring,0,1,clear,0.244898,0.288064,0.56,0.0
2,2011,Jan,Thu,2,spring,0,1,clear,0.244898,0.288064,0.56,0.0
3,2011,Jan,Thu,3,spring,0,1,clear,0.244898,0.271171,0.56,0.193018
4,2011,Jan,Thu,4,spring,0,1,clear,0.244898,0.271171,0.56,0.193018


In [93]:

# Columns treated as categorical; they are cast to `object` dtype here.
catcols = [
    'year', 'month', 'dayofweek', 'hour',
    'season', 'holiday', 'workingday', 'weather',
]
testdf = testdf.astype(dict.fromkeys(catcols, 'object'))
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        6493 non-null   object 
 1   month       6493 non-null   object 
 2   dayofweek   6493 non-null   object 
 3   hour        6493 non-null   object 
 4   season      6493 non-null   object 
 5   holiday     6493 non-null   object 
 6   workingday  6493 non-null   object 
 7   weather     6493 non-null   object 
 8   temp        6493 non-null   float64
 9   atemp       6493 non-null   float64
 10  humidity    6493 non-null   float64
 11  windspeed   6493 non-null   float64
dtypes: float64(4), object(8)
memory usage: 608.8+ KB


In [94]:
# Keep a copy of the frame as it is before the categorical columns are expanded.
testdf_select = testdf.copy()

# One-hot encode the categorical columns, dropping the first level of each.
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version: pandas >= 2.0 defaults to bool, which would write
# True/False into the exported CSV instead of the 0/1 values shown here.
# NOTE(review): encoding the test split on its own assumes it holds the same
# category levels as the training split - confirm the resulting columns match
# the features the model was trained on.
testdf = pd.get_dummies(testdf, columns=catcols, drop_first=True, dtype='uint8')
testdf.head()

Unnamed: 0,temp,atemp,humidity,windspeed,year_2012,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,...,hour_22,hour_23,season_spring,season_summer,season_winter,holiday_1,workingday_1,weather_cloudy,weather_light rain,weather_snowy
0,0.244898,0.237275,0.56,0.456213,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
1,0.244898,0.288064,0.56,0.0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,0.244898,0.288064,0.56,0.0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
3,0.244898,0.271171,0.56,0.193018,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,0.244898,0.271171,0.56,0.193018,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [95]:
# Write the encoded test features to disk without the row index.
testdf.to_csv(path_or_buf='testall.csv', sep=',', index=False)