# Feature Engineering Exercise (Core)
- Zach Hanson

## Importing Libraries and Data

### Libraries

In [1]:
#Pandas for exercise
import pandas as pd

### Data

In [2]:
# Read the bike-share training set from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='bikeshare_train - bikeshare_train.csv')

# Display the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Data Cleaning

### Unnecessary Columns

In [3]:
# Remove the casual and registered rider-count columns
df = df.drop(['casual', 'registered'], axis='columns')

# Preview the frame to confirm both columns are gone
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


- Both columns dropped successfully 
- Columns dropped because they are redundant with the `count` column (`casual` + `registered` = `count`, e.g. 3 + 13 = 16 in the first row)

In [4]:
# Inspect each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   count       10886 non-null  int64  
dtypes: float64(3), int64(6), object(1)
memory usage: 850.6+ KB


- The `datetime` column is currently stored as `object`; it will need to be converted to a datetime dtype later

### Unnecessary Rows


#### Duplicates

In [5]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

- No duplicated rows

#### Missing Values

In [6]:
# Count missing (NaN/NaT) entries in each column
df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
count         0
dtype: int64

- No missing data

## Transform Date-Time

In [7]:
# Parse the datetime strings into pandas datetime64 values
df = df.assign(datetime=pd.to_datetime(df['datetime']))

# Re-inspect the dtypes to confirm the conversion took effect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 850.6 KB


- Converted successfully 

### Name of Month

In [8]:
# Derive the month name (e.g. "January") from each timestamp
df = df.assign(month_name=df['datetime'].dt.month_name())

### Name of Day of Week

In [9]:
# Derive the weekday name (e.g. "Saturday") from each timestamp
df = df.assign(day_name=df['datetime'].dt.day_name())

### Hour of Day

In [10]:
# Derive the hour of day (0-23) from each timestamp
df = df.assign(hour=df['datetime'].dt.hour)

### Dropping Old Columns

In [11]:
# Preview the frame with the newly added date-time feature columns
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month_name,day_name,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


In [12]:
# Inspect dtypes and non-null counts after adding month_name, day_name and hour
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
 10  month_name  10886 non-null  object        
 11  day_name    10886 non-null  object        
 12  hour        10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(2)
memory usage: 1.1+ MB


- All three date-time features (`month_name`, `day_name`, `hour`) have been added
- The `hour` column is `int64` and needs to be converted to the `object` datatype

In [13]:
# Re-type hour from int64 to object, then confirm via the dtype listing
df = df.astype({'hour': 'object'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
 10  month_name  10886 non-null  object        
 11  day_name    10886 non-null  object        
 12  hour        10886 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(6), object(3)
memory usage: 1.1+ MB


- 'hour' column updated to object datatype

In [14]:
# Remove the raw datetime column and the season column
df = df.drop(['datetime', 'season'], axis=1)

# Preview the frame to confirm both columns are gone
df.head(n=5)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month_name,day_name,hour
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


- Dropped successfully

## Changing Temperature Columns

### Converting temp and atemp to Fahrenheit

In [15]:
# Convert temp and atemp from Celsius to Fahrenheit (F = 9/5 * C + 32).
# Vectorized arithmetic replaces the per-element lambda passed to .apply and
# keeps the formula in a single place for both columns.
# NOTE: this overwrites the Celsius values, so the cell is not idempotent --
# re-running it without reloading the data applies the conversion a second time.
df[['temp', 'atemp']] = 9/5 * df[['temp', 'atemp']] + 32

In [16]:
# Preview the frame to check the converted temperature values
df.head(n=5)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month_name,day_name,hour
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4


- Converted correctly (e.g. 9.84 °C → 49.712 °F in the first row)

In [17]:
#New column for difference between temp and atemp
df['temp_variance'] = df['atemp'] - df['temp']
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month_name,day_name,hour,temp_variance
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0,8.199
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1,8.307
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2,8.307
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3,8.199
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4,8.199


- Looks to have been added successfully

In [18]:
# Remove the atemp column and preview the resulting frame
df = df.drop('atemp', axis=1)
df.head(n=5)

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,month_name,day_name,hour,temp_variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,8.199
1,0,0,1,48.236,80,0.0,40,January,Saturday,1,8.307
2,0,0,1,48.236,80,0.0,32,January,Saturday,2,8.307
3,0,0,1,49.712,75,0.0,13,January,Saturday,3,8.199
4,0,0,1,49.712,75,0.0,1,January,Saturday,4,8.199
