In [264]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Description

- datetime - hourly date + timestamp
- season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor a holiday
- weather
    - 1: Clear, Few clouds, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

#### <u>Is this a classification or a regression problem?</u>
=> Regression, because the goal is to predict the number of bicycle rentals (a numeric quantity).

In [265]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [266]:
# Global plotting defaults: ggplot theme and a 10x5 default figure size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': [10, 5]})

In [267]:
# Load the training CSV of the bike-sharing-demand dataset from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
# Bare expression: the notebook renders the frame as the cell output.
train_df

In [268]:
# Load the test CSV of the bike-sharing-demand dataset from the Kaggle input directory.
test_df = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')
# Bare expression: the notebook renders the frame as the cell output.
test_df

## 2. **Data Information**

In [269]:
# Print column names, non-null counts and dtypes for both frames,
# separated by a dashed rule so the two reports are easy to tell apart.
train_df.info()
print('-'*50)
test_df.info()

>  -> Need to predict 'casual', 'registered', and 'count'. \
      Identify the features that affect 'casual', 'registered', and 'count' the most.

In [270]:
# Bar chart of how many columns there are of each dtype.
dtype_counts = train_df.dtypes.value_counts()
dtype_counts.plot(kind='bar')
plt.xticks(rotation=0)  # keep the dtype labels horizontal
plt.show()

There are no NULL values, and the columns comprise 8 int, 3 float, and 1 object dtype. \
Note that the `datetime` column is stored as object, so it must be converted to a datetime type during preprocessing.

In [271]:
# Summary statistics of the numeric training columns.
train_df.describe()

In [272]:
# Number of missing values per column.
train_df.isnull().sum()

In [273]:
# Visual check for missing values in the training frame using missingno's matrix plot.
import missingno as msno
msno.matrix(train_df)

> There are no NULL values.

## 3. Preprocessing
Convert the `datetime` column to the datetime type (it was originally an object type).

In [274]:
# Parse the timestamp strings into datetime64 values.
# The values carry a time of day (the hour is extracted from them later), so a
# date-only format such as '%Y-%m-%d' does not describe them; recent pandas
# versions reject it with "unconverted data remains". Let pandas infer the format.
train_df['datetime'] = pd.to_datetime(train_df['datetime'])
train_df.info()

In [275]:
# Derive one calendar column per component of the timestamp
# (weekday: 0 = Monday ... 6 = Sunday).
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'weekday'):
    train_df[part] = getattr(train_df['datetime'].dt, part)
train_df.shape

In [276]:
# Preview the first rows, including the newly derived date columns.
train_df.head()

#weekday : 0-Mon, 1-Tue, 2-Wed, 3-Thu, 4-Fri, 5-Sat, 6-Sun

## 4. EDA (Visualize the amount of rent per time)

In [277]:
# Mean rental count per time unit, one bar chart per unit on a 2x4 grid.
plt.figure(figsize=(18,8))

time_panels = [
    ('year', 'Rent of Year'),
    ('month', 'Rent of Month'),
    ('day', 'Rent of Day'),
    ('hour', 'Rent of Hour'),
    ('minute', 'Rent of Minute'),
    ('second', 'Rent of Second'),
    ('weekday', 'Rent of Weekday'),
]
for position, (column, panel_title) in enumerate(time_panels, start=1):
    plt.subplot(2, 4, position)
    sns.barplot(data=train_df, x=column, y='count')
    plt.title(panel_title)

plt.subplots_adjust(hspace=.4)
plt.show()

* year : Usage in 2012 was about 1.5 times the usage in 2011. \
(Users in 2011 may have promoted and recommended the service to people around them, which could explain the increase in 2012.)
* month : Usage is higher in the warm months than in the cold months.
* day : The data only exists up to the 19th of each month, so `day` cannot be used as a feature; it is for reference only.
* minute : Carries no information; every value is zero.
* second : Carries no information; every value is zero.
* weekday : Rentals are distributed relatively evenly from Monday to Sunday.

In [278]:
# Remove 'day', 'minute' & 'second': 'day' only exists up to the 19th and the
# other two are always zero, so none of them is usable as a feature.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError.
train_df = train_df.drop(columns=['day', 'minute', 'second'], errors='ignore')
train_df.head()

In [279]:
# Distribution of rental counts, overall and split by several categorical
# columns, as vertical box plots on a 2x3 grid.
plt.figure(figsize=(18,8))

box_panels = [
    (None, 'Amount of Rentals'),          # no grouping: overall distribution
    ('season', 'Seasonal Rentals'),
    ('hour', 'Rentals by Hours'),
    ('weekday', 'Rentals by Weekdays'),
    ('workingday', 'Rentals by Workingday'),
]
for position, (group_column, panel_title) in enumerate(box_panels, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(data=train_df, y='count', x=group_column, orient='v')
    plt.title(panel_title)

plt.subplots_adjust(hspace=.4)
plt.show()

* Amount of rentals : Most values are concentrated in a specific range.
* Seasonal rentals : Rental volume is highest in autumn, followed by summer, winter, and spring (autumn > summer > winter > spring).
* Rentals by workingday : The two are similar, but rentals on non-working days are slightly higher than on working days.

In [280]:
# Hourly rental profile, overall and broken down by one categorical column
# per panel, stacked as five rows.
plt.figure(figsize=(18,25))

hourly_panels = [
    (None, 'Rentals by Hours'),
    ('workingday', 'Rentals by Hours on workingdays'),
    ('weekday', 'Rentals by Hours on days'),
    ('weather', 'Rentals by Hours on weather'),
    ('season', 'Rentals by Hours on season'),
]
for position, (hue_column, panel_title) in enumerate(hourly_panels, start=1):
    plt.subplot(5, 1, position)
    sns.pointplot(data=train_df, x='hour', y='count', hue=hue_column)
    plt.title(panel_title)

plt.subplots_adjust(hspace=.4)
plt.show()

### 4-1. correlation heatmap (상관관계 히트맵)

In [281]:
# List the current column names.
train_df.columns

In [282]:
# Select the columns by name instead of by position, and include the target
# 'count' so its correlations appear in the matrix.
# NOTE(review): assuming the column order given in the description section,
# the former positional slice iloc[:, 3:11] stopped at 'registered' and left
# 'count' out, although the discussion below refers to the rental count.
corr_columns = ['workingday', 'weather', 'temp', 'atemp', 'humidity',
                'windspeed', 'casual', 'registered', 'count']
corrMatt = train_df[corr_columns]
corrMatt = corrMatt.corr()
print(corrMatt)

# Boolean mask for the heatmap: True hides a cell. Hide everything strictly
# above the diagonal so each pair is shown once. Building it from booleans
# (rather than from the correlation values themselves) does not depend on the
# correlations being non-zero.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

In [283]:
# Draw the annotated correlation heatmap; cells where `mask` is truthy are hidden.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20,10))
sns.heatmap(corrMatt, mask=mask, annot=True, square=True, vmax=.8, ax=heatmap_ax)

* Temperature, humidity, and wind speed are hardly correlated with one another.

* The variable most strongly correlated with the rental count is `registered`, but the test data does not contain this column.

* `atemp` and `temp` are highly correlated (0.98); since they are the temperature and the "feels like" temperature, they may not be suitable for use as features together.

* 온도, 습도, 풍속은 연관관계가 거의 없다고 볼 수 있다.

* 대여량과 가장 연관이 높은 건 registered로 등록된 대여자가 많지만, test 데이터에는 이 값이 없다.

* atemp와 temp는 0.98로 상관관계가 높지만 온도와 체감온도로 피처를 사용하기에 적합하지 않을 수 있다.

### 4-2. Remove Outliers

* method 1 => $IQR = Q_3 - Q_1$; outliers are the values below $Q_1 - 1.5 \cdot IQR$ or above $Q_3 + 1.5 \cdot IQR$.
* method 2 => outliers are the values outside $\text{mean} \pm 3 \cdot \text{standard deviation}$.

In [284]:
# Box plot of the raw rental counts, before any outlier removal.
original_ax = sns.boxplot(data=train_df, y='count', orient='v')
original_ax.set_title('Amount of Rentals(original)')

In [285]:
# First and third quartiles of the rental count, computed in one call.
count_Q1, count_Q3 = np.percentile(train_df['count'], [25, 75])

In [286]:
# Interquartile range of the rental count (Q3 - Q1); displayed as the cell output.
count_IQR = count_Q3 - count_Q1
count_IQR

In [287]:
# method 1: keep the rows whose count lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
count_lower = count_Q1 - 1.5 * count_IQR
count_upper = count_Q3 + 1.5 * count_IQR
within_fences = (train_df['count'] >= count_lower) & (train_df['count'] <= count_upper)
# .copy() makes train_clean an independent frame: later cells assign to and
# drop columns from it, which on a filtered slice of train_df triggers
# SettingWithCopyWarning.
train_clean = train_df[within_fences].copy()

In [288]:
# Box plot of the rental counts after the IQR-based filter.
method1_ax = sns.boxplot(data=train_clean, y='count', orient='v')
method1_ax.set_title('Amount of Rentals(remove outliers by method1)')

In [289]:
# method 2: keep the rows whose count is within 3 standard deviations of the mean.
# The absolute value must wrap the deviation itself; previously np.abs wrapped
# the whole comparison (a boolean), so only the upper side was being filtered.
count_deviation = np.abs(train_clean['count'] - train_clean['count'].mean())
train_clean_2 = train_clean[count_deviation <= 3 * train_clean['count'].std()]

In [290]:
# Box plot of the rental counts after the 3-standard-deviation filter.
method2_ax = sns.boxplot(data=train_clean_2, y='count', orient='v')
method2_ax.set_title('Amount of Rentals(remove outliers by method2)')

* The outliers have significantly decreased compared to the previous one
* 이상치가 눈에띄게 줄어든 것을 확인 할 수 있다.

### 4-3. Data normalization.

* "Datetime" type data cannot be used in linear regression, so convert it to a numeric type.
* "Datetime" 타입은 선형회귀에서 사용할 수 없으므로 숫자형 타입으로의 변환이 필요하다.

In [291]:
def to_int(datetime):
    """Encode a date as the integer YYYYMMDD.

    Parameters
    ----------
    datetime : object exposing ``year``, ``month`` and ``day`` attributes.

    Returns
    -------
    int
        ``year * 10000 + month * 100 + day``. Only these three attributes are
        read, so any time-of-day component is not represented in the result.
    """
    year_part = datetime.year * 10000
    month_part = datetime.month * 100
    return year_part + month_part + datetime.day

# Convert every timestamp of the cleaned training frame to its YYYYMMDD integer.
datetime_int = train_clean['datetime'].apply(to_int)
datetime_int

In [292]:
# Replace the timestamp column with its integer encoding.
# train_clean comes from a boolean filter of train_df; take an explicit copy
# before assigning so the write goes to an independent frame rather than to a
# possible view (avoids SettingWithCopyWarning). datetime_int is already a
# Series aligned on the same index, so no extra pd.Series wrapper is needed.
train_clean = train_clean.copy()
train_clean['datetime'] = datetime_int
train_clean.info()

In [293]:
# Display the cleaned training frame with the integer-encoded datetime column.
train_clean

In [294]:
# Distribution of the rental count in three renderings, side by side:
# histogram + KDE, KDE only, histogram only.
plt.figure(figsize=(17,7))

distplot_variants = [{}, {'hist': False}, {'kde': False}]
for position, variant_options in enumerate(distplot_variants, start=1):
    plt.subplot(1, 3, position)
    sns.distplot(train_clean['count'], **variant_options)

plt.subplots_adjust(wspace=.3)
plt.show()

In [295]:
# Distribution of log(1 + count).
# Transform only the target column: applying log1p to the whole frame first
# needlessly transforms every column and fails as soon as any column is not
# numeric.
sns.distplot(np.log1p(train_clean['count']))
plt.title('Amount of Rentals')
plt.show()

* To bring the target closer to a normal distribution, we apply a log transform to the data. The result does not follow a perfectly normal distribution, but it is somewhat closer to one than before.

* 정규분포 형태로 만들기 위해 데이터에 로그를 씌워 데이터 값을 변형했고 완벽한 정규분포 형태를 따르지는 않지만 이전보다 약간의 정규분포 형태에 가까워졌다


## 5. Modeling

In [296]:
# Column names of the cleaned training frame, for comparison with test_df.
train_clean.columns

In [297]:
# Column names of the test frame, for comparison with train_clean.
test_df.columns

* Remove the two columns (casual, registered) that are not in test_df, and add the same date-derived columns to test_df so that its columns match the training data.
* test_df에 없는 두 feature(casual, registered)를 제거하고 test_df에도 날짜데이터를 활용한 컬럼을 맞춰준다

In [298]:
# Parse the timestamp strings into datetime64 values.
# The values carry a time of day (the hour is extracted from them below), so a
# date-only format such as '%Y-%m-%d' does not describe them; recent pandas
# versions reject it with "unconverted data remains". Let pandas infer the format.
test_df['datetime'] = pd.to_datetime(test_df['datetime'])
test_df.info()

In [299]:
# Derive the date columns on the test frame
# (weekday: 0 = Monday ... 6 = Sunday).
for field in ('year', 'month', 'hour', 'weekday'):
    test_df[field] = getattr(test_df['datetime'].dt, field)
test_df.shape

In [300]:
# Re-check the training columns before dropping the ones absent from test_df.
train_clean.columns

In [301]:
# Drop the two columns that are not in test_df.
# `columns=` already names the axis, so the extra axis=1 was redundant.
# Rebinding instead of inplace=True avoids mutating a frame that was produced
# by filtering train_df (SettingWithCopyWarning).
train_clean = train_clean.drop(columns=['registered', 'casual'])
train_clean.shape

In [302]:
# Shape of the test frame, to compare its column count with train_clean.
test_df.shape

In [303]:
# Linear regression needs numeric inputs, so encode the test timestamps as
# YYYYMMDD integers with the same helper used for the training data.

test_datetime_int = test_df['datetime'].apply(to_int)
test_df['datetime'] = test_datetime_int
test_df.info()

* Except for the target variable 'count', the number of features and form of training data and test data matched.
* 타켓 변수인 'count'를 제외하고 훈련데이터와 테스트데이터의 feature 수와 형태가 일치한다.

### 5-1. Split Data (데이터셋 나누기)

In [304]:
# Separate the target from the features
train_x = train_clean.drop(columns='count')   # features (the "question")
train_y = train_clean['count']                # target (the "answer")
# Select the test features by the training column names so both matrices have
# the same columns in the same order; a column missing from test_df raises
# KeyError here instead of surfacing later inside the model.
test_x = test_df[train_x.columns]

In [305]:
from sklearn.linear_model import LinearRegression

# Declare the linear regression model
linear_model = LinearRegression()

# Fit the model on the training features and target
linear_model.fit(train_x, train_y)

# Predict on the test features. Keep the result in a variable: as a bare
# mid-cell expression it was computed and then thrown away.
test_pred = linear_model.predict(test_x)

# Evaluate: score on the training data itself, so this measures fit,
# not performance on unseen data.
linear_model.score(train_x, train_y)

In [312]:
# Column names available for building the regression formulas below.
train_clean.columns

In [316]:
from statsmodels.formula.api import ols

# Regress count on every feature column and show the fit summary.
ols_features = [
    'datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
    'humidity', 'windspeed', 'year', 'month', 'hour', 'weekday',
]
full_formula = 'count ~ ' + ' + '.join(ols_features)
full_ols_fit = ols(full_formula, data=train_clean).fit()
full_ols_fit.summary()

In [319]:
# Regress count on a reduced set of four features and show the fit summary.
reduced_ols_fit = ols('count ~ atemp + humidity + windspeed + hour', data=train_clean).fit()
reduced_ols_fit.summary()