# Creation of the Training and Test Data Sets (WML Project)

Let's import all necessary **libraries**!

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import glob

From all the available NYC CitiBike Data (105 months, from June 2013 till February 2022), 80% will be kept for training and 20% for test.

1. Building the first column and the rows of two separate data frames with the right date format: 84 months for the **training set** and 21 months for the **test set**.

In [2]:
# Training window: the first day of every month from June 2013 through May 2020.
dates_trn = pd.date_range("2013-06-01", "2020-05-01", freq="MS")

# Single-column frame with a fresh 0..n-1 row index; later cells add columns to it.
df_training = dates_trn.to_frame(index=False, name="dates")
df_training.head()

Unnamed: 0,dates
0,2013-06-01
1,2013-07-01
2,2013-08-01
3,2013-09-01
4,2013-10-01


In [3]:
# Test window: the first day of every month from June 2020 through February 2022.
dates_test = pd.date_range("2020-06-01", "2022-02-01", freq="MS")

# Single-column frame with a fresh 0..n-1 row index; later cells add columns to it.
df_test = dates_test.to_frame(index=False, name="dates")
df_test.head()

Unnamed: 0,dates
0,2020-06-01
1,2020-07-01
2,2020-08-01
3,2020-09-01
4,2020-10-01


2. Incorporating the **NYC CitiBike data** (number of trips per month) into the corresponding data frames (training and test). Source: *https://s3.amazonaws.com/tripdata/index.html*

In [4]:
# Monthly trip counts for the training window.
# Each CSV in the folder is counted as one month; the number of data rows in a
# file is taken as that month's number of trips.
path_trn = "data_bikes/training/"

# Sorting by file name is what pairs each file with a row of df_training,
# so the file names are presumably chosen to sort chronologically — TODO confirm.
all_files = sorted(glob.glob(path_trn + "*.csv"))

# Only the row count is needed, so parse a single column instead of
# materialising every column of every monthly file.
num_trips = [len(pd.read_csv(filename, usecols=[0])) for filename in all_files]

# Fail with an actionable message when the folder does not hold exactly one
# file per month (pandas would otherwise raise a generic length-mismatch error).
if len(num_trips) != len(df_training):
    raise ValueError(
        f"Found {len(num_trips)} CSV files in {path_trn!r} "
        f"but df_training has {len(df_training)} months"
    )

df_training["trips"] = num_trips
df_training

Unnamed: 0,dates,trips
0,2013-06-01,577703
1,2013-07-01,843416
2,2013-08-01,1001958
3,2013-09-01,1034359
4,2013-10-01,1037712
...,...,...
79,2020-01-01,1240596
80,2020-02-01,1146830
81,2020-03-01,1068457
82,2020-04-01,682762


In [6]:
# Monthly trip counts for the test window.
# Each CSV in the folder is counted as one month; the number of data rows in a
# file is taken as that month's number of trips.
path_tst = "data_bikes/test/"

# Sorting by file name is what pairs each file with a row of df_test,
# so the file names are presumably chosen to sort chronologically — TODO confirm.
all_files_02 = sorted(glob.glob(path_tst + "*.csv"))

# Only the row count is needed, so parse a single column instead of
# materialising every column of every monthly file.
num_trips_tst = [
    len(pd.read_csv(filename_tst, usecols=[0])) for filename_tst in all_files_02
]

# Fail with an actionable message when the folder does not hold exactly one
# file per month (pandas would otherwise raise a generic length-mismatch error).
if len(num_trips_tst) != len(df_test):
    raise ValueError(
        f"Found {len(num_trips_tst)} CSV files in {path_tst!r} "
        f"but df_test has {len(df_test)} months"
    )

df_test["trips"] = num_trips_tst
df_test

Unnamed: 0,dates,trips
0,2020-06-01,1882273
1,2020-07-01,2105808
2,2020-08-01,2329514
3,2020-09-01,2488225
4,2020-10-01,2248869
5,2020-11-01,1736704
6,2020-12-01,1088929
7,2021-01-01,1095346
8,2021-02-01,649983
9,2021-03-01,1531094


4. Reading and editing **NYC monthly historical weather data** (precipitation and temperature).
Source: *https://www.weather.gov/wrh/Climate?wfo=okx*

In [7]:
def _flip_and_promote_first_row(raw):
    """Transpose ``raw`` and use the first transposed row as the column labels.

    After transposing, the first row (the first column of ``raw``) becomes the
    header. The index is then renumbered from 0 and that header row (now at
    position 0) is removed, leaving the remaining rows labelled 1, 2, ...
    """
    flipped = raw.transpose()
    flipped.columns = flipped.iloc[0]
    return flipped.reset_index(drop=True).drop(0)


# Raw monthly weather tables as downloaded.
rain_raw = pd.read_csv("weather_data/monthly_rainfall_average_nyc.csv")
temp_raw = pd.read_csv("weather_data/monthly_average_temp_nyc.csv")

# Same reshaping for both tables.
rain = _flip_and_promote_first_row(rain_raw)
temp = _flip_and_promote_first_row(temp_raw)
rain

Year,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
1,2.76,2.79,5.23,4.41,4.83,2.18,3.58,1.93,2.31,4.29
2,4.25,5.48,2.04,4.4,2.48,5.83,3.14,2.54,5.13,3.23
3,2.9,3.67,4.72,1.17,5.25,5.17,3.87,3.78,3.41,0.0
4,1.31,7.85,2.08,1.61,3.84,5.78,4.55,4.49,2.69,0.0
5,8.0,4.37,1.86,3.75,6.38,3.53,6.82,1.65,4.36,0.0
6,10.1,4.26,4.79,2.6,4.76,3.11,5.46,1.76,2.62,0.0
7,2.84,5.59,3.98,7.02,4.19,7.45,5.77,6.58,11.09,0.0
8,2.85,2.25,2.35,1.97,3.34,8.59,3.7,5.03,10.32,0.0
9,2.95,1.21,3.28,2.79,2.0,6.19,0.95,3.94,10.03,0.0
10,0.36,5.77,3.91,4.15,4.18,3.59,6.15,5.05,5.26,0.0


In [8]:
# Export the transposed ("inverted") weather tables.
# Paths are relative, like the reads from "weather_data/" elsewhere in this
# notebook, so the export no longer depends on one user's home directory.
# Assumes the notebook is run from the project folder — TODO confirm.
rain.to_csv('weather_data/rain_m.csv', index = False, header = True)
temp.to_csv('weather_data/temp_m.csv', index = False, header = True)

5. **Adding** the weather data to the corresponding data frames and **exporting** it.

In [9]:
# Weather tables for the training and test windows.
# Paths are relative, like the reads from "weather_data/" elsewhere in this
# notebook, so the cell no longer depends on one user's home directory.
# Assumes the notebook is run from the project folder — TODO confirm.
rain_training = pd.read_csv('weather_data/rain_training.csv')
rain_test = pd.read_csv('weather_data/rain_test.csv')
# NOTE(review): "temp_traing.csv" looks like a misspelling of "temp_training.csv";
# the name is kept unchanged because it has to match the file on disk — confirm.
temp_training = pd.read_csv('weather_data/temp_traing.csv')
temp_test = pd.read_csv('weather_data/temp_test.csv')

# Column assignment aligns on the row index (0..n-1 on both sides here), so the
# weather files presumably hold one row per month in the same chronological
# order as df_training / df_test — TODO confirm. Rows without a matching index
# in the weather table end up as NaN rather than raising.
df_training['Rain'] = rain_training['month_average_training']
df_training['Temperature'] = temp_training['monthly_av_training']
df_test['Rain'] = rain_test['month_average_test']
df_test['Temperature'] = temp_test['monthly_av_test']

# Final data sets: dates, trips, Rain, Temperature.
df_training.to_csv('training.csv', index = False, header = True)
df_test.to_csv('test.csv', index = False, header = True)