In [1]:
import pandas as pd
import warnings
# NOTE(review): with no category given this suppresses every warning for the
# rest of the session, including any pandas warnings raised by later cells —
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore') 

In [2]:
# Load the raw test and train splits; the `datetime` column is parsed into
# timestamps while reading.

read_options = {'sep': ',', 'parse_dates': ['datetime']}
testdf = pd.read_csv('./dataset/test.csv', **read_options)
traindf = pd.read_csv('./dataset/train.csv', **read_options)
testdf.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


##### Filtering features to keep

In [3]:
# datetime transformation

def datetransform(df):
    """Replace the ``datetime`` column with ``year`` and ``hour`` features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``datetime`` column on which the ``.dt`` accessor
        works (i.e. already parsed to timestamps).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``year`` and ``hour`` as the first two columns,
        followed by every column of ``df`` except ``datetime``.

    Notes
    -----
    ``df`` is not modified: the drop is performed on a copy, so the caller's
    frame keeps its ``datetime`` column and can be transformed again.
    """
    stamps = df['datetime']

    # `drop` without `inplace` returns a new frame, leaving the input intact.
    transformed_df = df.drop(columns='datetime')

    # Put the derived features in front, matching the original column order.
    transformed_df.insert(0, 'year', stamps.dt.year)
    transformed_df.insert(1, 'hour', stamps.dt.hour)
    return transformed_df

# Swap the raw timestamp for the derived year/hour features, then preview.
testdf = datetransform(df=testdf)
testdf.head()

Unnamed: 0,year,hour,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011,0,1,0,1,1,10.66,11.365,56,26.0027
1,2011,1,1,0,1,1,10.66,13.635,56,0.0
2,2011,2,1,0,1,1,10.66,13.635,56,0.0
3,2011,3,1,0,1,1,10.66,12.88,56,11.0014
4,2011,4,1,0,1,1,10.66,12.88,56,11.0014


In [4]:
# Features to keep; every other column is discarded.
keep = ['year', 'hour', 'season', 'workingday', 'weather', 'atemp', 'humidity']

# Take an explicit copy so that later column assignments act on an independent
# frame rather than on a slice of the wider one (assigning into such a slice
# is what triggers pandas' SettingWithCopyWarning).
testdf = testdf[keep].copy()
testdf.head()

Unnamed: 0,year,hour,season,workingday,weather,atemp,humidity
0,2011,0,1,1,1,11.365,56
1,2011,1,1,1,1,13.635,56
2,2011,2,1,1,1,13.635,56
3,2011,3,1,1,1,12.88,56
4,2011,4,1,1,1,12.88,56


In [5]:
# Replace the integer category codes with human-readable labels.

seasons = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
weather = {1: 'clear', 2: 'cloudy', 3: 'light rain', 4: 'snowy'}
for column, labels in (('season', seasons), ('weather', weather)):
    # Series.map yields NaN for any code that is missing from the lookup.
    testdf[column] = testdf[column].map(labels)

In [6]:
# Checking every column for missing values (True marks a column with NaNs).

testdf.isna().any()

year          False
hour          False
season        False
workingday    False
weather       False
atemp         False
humidity      False
dtype: bool

In [8]:
# Min-max normalizing the continuous features in the test data set.

# Columns treated as continuous and rescaled in this cell.
contcols = ['atemp', 'humidity']

def normalize(ttdf, trdf, cols):
    """Min-max scale ``cols`` of ``ttdf`` using the ranges observed in ``trdf``.

    Each listed column is mapped with ``(x - min) / (max - min)``, where
    ``min`` and ``max`` are taken from the same column of ``trdf``. Values of
    ``ttdf`` outside that range therefore fall outside ``[0, 1]``.

    Parameters
    ----------
    ttdf : pandas.DataFrame
        Frame whose columns are rescaled.
    trdf : pandas.DataFrame
        Reference frame supplying the per-column minimum and maximum.
    cols : iterable of str
        Names of the columns to rescale; they must exist in both frames.

    Returns
    -------
    pandas.DataFrame
        A copy of ``ttdf`` with the listed columns rescaled. ``ttdf`` itself is
        left unmodified, so calling the function twice on the same input gives
        the same result.
    """
    scaled = ttdf.copy()
    for feature in cols:
        minval, maxval = trdf[feature].min(), trdf[feature].max()
        span = maxval - minval
        if span == 0:
            # Constant reference column: dividing would produce NaN/inf, so
            # only shift the values by the minimum.
            span = 1
        scaled[feature] = (ttdf[feature] - minval) / span

    return scaled

# Rescale the test features with the min/max taken from the training split.
testdf = normalize(ttdf=testdf, trdf=traindf, cols=contcols)
testdf.head()

Unnamed: 0,year,hour,season,workingday,weather,atemp,humidity
0,2011,0,spring,1,clear,0.237275,0.56
1,2011,1,spring,1,clear,0.288064,0.56
2,2011,2,spring,1,clear,0.288064,0.56
3,2011,3,spring,1,clear,0.271171,0.56
4,2011,4,spring,1,clear,0.271171,0.56


In [10]:
# Cast the categorical columns to `object` dtype and report the resulting schema.
catcols = ['year', 'hour', 'season', 'weather']
testdf = testdf.astype(dict.fromkeys(catcols, 'object'))
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        6493 non-null   object 
 1   hour        6493 non-null   object 
 2   season      6493 non-null   object 
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   object 
 5   atemp       6493 non-null   float64
 6   humidity    6493 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 355.2+ KB


In [11]:
# Keep the label-valued frame before it is replaced by its one-hot encoding.
testdf_select = testdf.copy()

# One-hot encode the categorical columns, dropping the first level of each.
# `dtype=int` pins the indicators to 0/1: pandas >= 2.0 otherwise emits
# booleans, which would be written to CSV as True/False.
# NOTE(review): the dummy columns are derived from the test split alone —
# confirm they match the columns the training data is encoded with.
testdf = pd.get_dummies(testdf, columns=catcols, drop_first=True, dtype=int)
testdf.head()

Unnamed: 0,workingday,atemp,humidity,year_2012,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,...,hour_20,hour_21,hour_22,hour_23,season_spring,season_summer,season_winter,weather_cloudy,weather_light rain,weather_snowy
0,1,0.237275,0.56,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0.288064,0.56,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0.288064,0.56,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0.271171,0.56,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0.271171,0.56,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [12]:
# Write the transformed test set to disk, omitting the row index.
testdf.to_csv(
    'test_transformed.csv',
    index=False,
    sep=',',
)