In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from scipy.stats import skew
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

# Not typical libraries
import calendar

In [2]:
class Preprocessing:
    def get_dummies(data, mult_Xlabels):
        return pd.get_dummies(data, columns=mult_Xlabels, drop_first=True)
    
    def get_label_encoder(data, mult_Xlabels):
        update_cols = data[mult_Xlabels].apply(preprocessing.LabelEncoder().fit_transform)
        data[mult_Xlabels] = update_cols
        return data
    
    def get_less_x_labels(data, threshold=10):
        categories = data.apply(lambda x: len(x.unique())).sort_values(ascending=True)
        values, labels = categories.values, categories.index
        values = [v for v in values if v <= treshold]
        labels = labels[:len(values)]
        return pd.DataFrame(values, index=labels)  
    
    def get_similar_cols(data, groupby):
        return data.filter(like=groupby, axis=1).columns.tolist()
    
    def add_similar_cols(data, groupby):
        cols = Preprocessing.get_similar_cols(data, groupby)
        return data[cols].sum(axis=1)
    
    def get_dtypes(data, verbose=False):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numerics_cols = data.select_dtypes(include=numerics).columns
        object_cols = data.select_dtypes(include='object').columns
        if verbose:
            print('There are {0} numeric cols: {1}\nThere are {2} object cols: {3}\nThere are total cols: {4}'.format
                  (len(numerics_cols),numerics_cols, len(object_cols), object_cols, len(data.columns)))
        return [numerics_cols, object_cols]

In [3]:
training_set = pd.read_csv('./archive/train.csv')
testing_set = pd.read_csv('./archive/test.csv')

# One additional we could do is to concantate the training and testing data
all_data = pd.concat((training_set.loc[:,:'windspeed'],
                      testing_set.loc[:,:'windspeed']))

In [4]:
all_data.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0


In [5]:
all_data.shape

(17379, 9)

###  Missing Values

#### Notes:
We saw that there are not any missing values other the predictors in the data dataset.

In [8]:
# Separting the numeric and object column
num_cols, obj_cols = Preprocessing.get_dtypes(all_data, verbose=True)
numeric_df, object_df = all_data[num_cols], all_data[obj_cols]

There are 8 numeric cols: Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed'],
      dtype='object')
There are 1 object cols: Index(['datetime'], dtype='object')
There are total cols: 9


In [9]:
# We need to fix the training set to further analyze any insight
# There are no missing values
all_data[num_cols].apply(lambda x: sum(x.isnull())/len(x), axis=0).sort_values(ascending=False).head(4)

windspeed    0.0
humidity     0.0
atemp        0.0
temp         0.0
dtype: float64

In [10]:
# Lets look at the categorical dtypes for missing values
# There are no missing values
all_data[obj_cols].apply(lambda x: sum(x.isnull())/len(x), axis=0).sort_values(ascending=False)

datetime    0.0
dtype: float64

In [11]:
# We do not have to concat the numeric and columns df, like we would do for other projects
# Since there was no missing values

### Skewness 

In [12]:
# Analyzing if there is any skewness in the data. Meaning, we might have ppl with very large incomes (outliers)
# We can apply the log function to normalize it 
numeric_skew_df = numeric_df.apply(lambda x: skew(x.dropna()))
numeric_skew_df = numeric_skew_df[numeric_skew_df > .75]

# Applying the log function to data that that is skewed more than .75
numeric_df.loc[:, numeric_skew_df.index] = np.log1p(numeric_df.loc[:, numeric_skew_df.index])

In [13]:
all_data = pd.concat([numeric_df, object_df], axis=1)

### Feature Enginnering

In [14]:
# Usually, I'll update the columns with dummy variables. A more appr. tactic 
# would be to rename to the columns to its original meaning and create the dummy variables

# Conditional on time of day: morning=1, afternoon=2, evening=3, night=4
# Conditional on time of day: spring=1, summer=2, fall=3, winter=4
# Conditional on time of day: clear/few clouds=1, light snow/light rain=2, 
#                             light snow/light rain=3, heavy rain/ice pallets=4

def feature_enginnering(data):
    data['datetime'] = pd.to_datetime(data['datetime'])
    data['year'] = data['datetime'].dt.year
    data['year'] = data['year'].astype('object')
    
    data['month'] = data['datetime'].dt.month    
    data['month'] = data['month'].apply(lambda x: calendar.month_abbr[x])
    
    data['day'] = data['datetime'].dt.day
    data['hour'] = data['datetime'].dt.hour
    data['minute'] = data['datetime'].dt.minute
    
    data['dayofweek'] = data['datetime'].dt.dayofweek
    dayofweek_name = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
    data['dayofweek'] = data['dayofweek'].map(dayofweek_name)
    
    data['weekofyear'] = data['datetime'].dt.weekofyear
    data['weekend'] = data['dayofweek'].map(lambda x: "Yes" if int(x in [5,6]) else "No")
    
    # Creating new columns that labels different time of days
    time_of_day_conds = [
    ((data['hour'] >=  5) & (data['hour'] < 12)),
    ((data['hour'] >=  12) & (data['hour'] < 17)),
    ((data['hour'] >=  17) & (data['hour'] < 21))]
    time_of_day_choices = ["morning", "afternoon", "evening"]
    data['time_of_day'] = np.select(time_of_day_conds, time_of_day_choices, default="night")
    
    # Creating a T/F boolean variable to check whether if it workday
    data['rush_workday'] = data['dayofweek'].map(lambda x: "Yes" if int(x in [0,1,2,3,4]) else "No")
    
    # Creating a T/F boolean variable to check whether it is rush_hour on a workday
    rush_hours = [8, 9, 10, 16, 17, 18]
    data['rush_hour'] = data.apply(lambda x: "Yes" if x['rush_workday'] == 1 
                                   and x['hour'] in rush_hours else "No", axis=1)
    
    # Modifying the 'season' column
    seasons_to_rename = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
    data['season'] = data['season'].map(seasons_to_rename)
    
    # Modifying the 'weather' column
    weather_to_rename = {1: 'clear/few clouds', 2: 'mist/cloudy', 
                         3: 'light snow/light rain', 4: 'heavy rain/ice pallets'}
    data['weather'] = data['weather'].map(weather_to_rename)
    
    
    # Modying the holiday/working day
    data['holiday'] = data['holiday'].map({1: "Yes", 0: "No"})
    data['workingday'] = data['workingday'].map({1: "Yes", 0: "No"})
    
    return data


In [15]:
all_data = feature_enginnering(all_data)

In [16]:
# After the modified data, we should view which columns are considered numerical/categorical
num_cols, obj_cols = Preprocessing.get_dtypes(all_data, verbose=True)
numeric_df, object_df = all_data[num_cols], all_data[obj_cols]

There are 8 numeric cols: Index(['temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour', 'minute',
       'weekofyear'],
      dtype='object')
There are 11 object cols: Index(['season', 'holiday', 'workingday', 'weather', 'year', 'month',
       'dayofweek', 'weekend', 'time_of_day', 'rush_workday', 'rush_hour'],
      dtype='object')
There are total cols: 20


In [17]:
# Creating the dummy variables for the categorical columns
obj_dummy_df = pd.get_dummies(object_df, drop_first=True)

# Concat the categorical with dummies and numerical valus
all_data = pd.concat([numeric_df, obj_dummy_df], axis=1)
all_data_cleaned = all_data.copy()

### Feature Scaling

In [18]:
# We will create a new separate df that standarizes the data

def get_standarized_data(data, x_labels):
    features = data[x_labels]
    features = preprocessing.StandardScaler().fit_transform(features.values)
    data[x_labels] = features
    return data

In [19]:
all_data_std = get_standarized_data(all_data, num_cols)
all_data_std.head(5)

Unnamed: 0,temp,atemp,humidity,windspeed,day,hour,minute,weekofyear,season_spring,season_summer,...,month_Sep,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday,time_of_day_evening,time_of_day_morning,time_of_day_night
0,-1.334648,-1.093281,0.947372,-1.553889,-1.670635,-1.670004,0.0,1.691336,1,0,...,0,0,1,0,0,0,0,0,0,1
1,-1.438516,-1.181732,0.895539,-1.553889,-1.670635,-1.525374,0.0,1.691336,1,0,...,0,0,1,0,0,0,0,0,0,1
2,-1.438516,-1.181732,0.895539,-1.553889,-1.670635,-1.380744,0.0,1.691336,1,0,...,0,0,1,0,0,0,0,0,0,1
3,-1.334648,-1.093281,0.63637,-1.553889,-1.670635,-1.236115,0.0,1.691336,1,0,...,0,0,1,0,0,0,0,0,0,1
4,-1.334648,-1.093281,0.63637,-1.553889,-1.670635,-1.091485,0.0,1.691336,1,0,...,0,0,1,0,0,0,0,0,0,1


In [20]:
all_data_cleaned.head(5)

Unnamed: 0,temp,atemp,humidity,windspeed,day,hour,minute,weekofyear,season_spring,season_summer,...,month_Sep,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday,time_of_day_evening,time_of_day_morning,time_of_day_night
0,9.84,14.395,81,0.0,1,0,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
1,9.02,13.635,80,0.0,1,1,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
2,9.02,13.635,80,0.0,1,2,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
3,9.84,14.395,75,0.0,1,3,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
4,9.84,14.395,75,0.0,1,4,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1


In [21]:
# Double checking the sizes of the all_data
all_data_cleaned.shape, training_set.shape, testing_set.shape

((17379, 33), (10886, 12), (6493, 9))

In [26]:
training_set.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [35]:
# Saving predictors (stangenloy enough, using np.log worsen performance
y_log_count = np.log(training_set['count']+1)
y_log_casual = np.log(training_set['casual']+1)
y_log_registered = np.log(training_set['registered']+1)

y_count = training_set['count']
y_casual = training_set['casual']
y_registered = training_set['registered']

In [44]:
# Saving the sets for future use
y_predictors = [y_log_count, y_log_casual, y_log_registered]

# Merging the training set with the predictors
saved_trainset = all_data_cleaned.iloc[:len(training_set)]
saved_trainset = saved_trainset.join(y_predictors)

# Merging the training set with the predictors (training set is standardized)
saved_trainset_std = all_data_std.iloc[:len(training_set)]
saved_trainset_std = saved_trainset_std.join(y_predictors)

# Creating the testing set that will be used
saved_testset = all_data_cleaned.iloc[len(training_set):]
saved_testset = saved_testset.join(testing_set['datetime'])
saved_testset_std = all_data_std.iloc[len(training_set):]
saved_testset_std = saved_testset_std.join(testing_set['datetime'])

# Saving the training set to pickles
saved_trainset.to_pickle("./pickles/saved_trainset.pkl")
saved_trainset_std.to_pickle("./pickles/saved_trainset_std.pkl")
saved_testset.to_pickle("./pickles/saved_testset.pkl")
saved_testset_std.to_pickle("./pickles/saved_testset_std.pkl")


In [45]:
saved_trainset_std.head(5)

Unnamed: 0,temp,atemp,humidity,windspeed,day,hour,minute,weekofyear,season_spring,season_summer,...,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday,time_of_day_evening,time_of_day_morning,time_of_day_night,count,casual,registered
0,-1.334648,-1.093281,0.947372,-1.553889,-1.670635,-1.670004,0.0,1.691336,1,0,...,0,0,0,0,0,0,1,2.833213,1.386294,2.639057
1,-1.438516,-1.181732,0.895539,-1.553889,-1.670635,-1.525374,0.0,1.691336,1,0,...,0,0,0,0,0,0,1,3.713572,2.197225,3.496508
2,-1.438516,-1.181732,0.895539,-1.553889,-1.670635,-1.380744,0.0,1.691336,1,0,...,0,0,0,0,0,0,1,3.496508,1.791759,3.332205
3,-1.334648,-1.093281,0.63637,-1.553889,-1.670635,-1.236115,0.0,1.691336,1,0,...,0,0,0,0,0,0,1,2.639057,1.386294,2.397895
4,-1.334648,-1.093281,0.63637,-1.553889,-1.670635,-1.091485,0.0,1.691336,1,0,...,0,0,0,0,0,0,1,0.693147,0.0,0.693147
