# Data Preprocessing for Visual Crossing API Data

#### Dưới đây là những tỉnh thành đại diện cho các vùng miền trên cả nước Việt Nam:
- Sơn La – Đại diện cho Tây Bắc Bộ
- Lạng Sơn – Đại diện cho Đông Bắc Bộ
- Hà Nội – Đại diện cho Đồng bằng sông Hồng
- Nghệ An – Đại diện cho Bắc Trung Bộ
- Đà Nẵng – Đại diện cho Nam Trung Bộ
- Lâm Đồng – Đại diện cho Tây Nguyên
- Thành phố Hồ Chí Minh – Đại diện cho Đông Nam Bộ
- Bến Tre – Đại diện cho Đồng bằng sông Cửu Long

#### Ý nghĩa của việc tiền xử lý dữ liệu:
Việc tiền xử lý các file CSV từ các tỉnh thành đại diện không chỉ giúp chuẩn hóa dữ liệu mà còn hỗ trợ các bước tiếp theo trong phân tích và khám phá dữ liệu trở nên thuận tiện hơn.

### Import Libraries
Bước đầu tiên trong quy trình tiền xử lý dữ liệu là nhập các thư viện cần thiết. NumPy và Pandas là hai thư viện Python mạnh mẽ và quan trọng, được sử dụng rộng rãi trong lĩnh vực khoa học dữ liệu.

In [1]:
import numpy as np
import pandas as pd

### Import Data
Tiếp theo, chúng ta cần nhập dữ liệu. Vì dữ liệu của chúng ta được lưu dưới dạng file CSV (Comma-Separated Values - các giá trị phân cách bằng dấu phẩy), nên chúng ta sẽ sử dụng hàm read_csv do Pandas cung cấp. Hàm này chỉ cần đường dẫn tới file của chúng ta làm tham số đầu vào.

In [2]:
# Load the raw Hà Nội export and list the column names it contains.
hanoi_input_path = './../data/input_data/HaNoi.csv'
raw_df = pd.read_csv(hanoi_input_path)
print(raw_df.columns)

Index(['Datetime', 'DatetimeEpoch', 'Tempmax', 'Tempmin', 'Temp', 'Dew',
       'Feelslike', 'Precip', 'Precipprob', 'Precipcover', 'Preciptype',
       'Snow', 'Snowdepth', 'Windspeed', 'Windgust', 'Winddir', 'Visibility',
       'Cloudcover', 'Humidity', 'Pressure', 'Solarradiation', 'Solarenergy',
       'Uvindex', 'Sunrise', 'Sunset', 'Moonphase', 'Icon', 'Conditions',
       'Description', 'Stations', 'Address'],
      dtype='object')


In [3]:
# Show the first five rows of the frame loaded in the previous cell.
raw_df.head(n=5)

Unnamed: 0,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Feelslike,Precip,Precipprob,Precipcover,...,Solarenergy,Uvindex,Sunrise,Sunset,Moonphase,Icon,Conditions,Description,Stations,Address
0,2019-01-01,1546275600,13.0,11.0,11.8,5.3,11.8,0.0,0.0,0.0,...,4.3,2.0,06:33:36,17:26:14,0.83,partly-cloudy-day,Partially cloudy,Partly cloudy throughout the day.,"['48820099999', '48823099999', '48825099999', ...",Hanoi
1,2019-01-02,1546362000,17.0,11.5,14.1,7.0,14.1,0.1,100.0,4.17,...,3.8,2.0,06:33:55,17:26:51,0.87,rain,"Rain, Partially cloudy",Partly cloudy throughout the day with late aft...,"['48820099999', '48823099999', '48825099999', ...",Hanoi
2,2019-01-03,1546448400,18.1,13.0,15.3,9.0,15.3,0.01,100.0,4.17,...,3.8,2.0,06:34:13,17:27:29,0.9,rain,"Rain, Partially cloudy",Partly cloudy throughout the day with afternoo...,"['48820099999', '48823099999', '48825099999', ...",Hanoi
3,2019-01-04,1546534800,19.0,13.0,16.3,12.4,16.3,0.878,100.0,4.17,...,3.9,2.0,06:34:30,17:28:08,0.94,rain,"Rain, Partially cloudy",Partly cloudy throughout the day with morning ...,"['48820099999', '48823099999', '48825099999', ...",Hanoi
4,2019-01-05,1546621200,22.0,16.0,19.0,15.1,19.0,0.0,0.0,0.0,...,7.8,4.0,06:34:46,17:28:47,0.97,partly-cloudy-day,Partially cloudy,Partly cloudy throughout the day.,"['48820099999', '48823099999', '48825099999', ...",Hanoi


### Keep necessary features
Các đặc trưng thời tiết được lưu trữ từ file CSV của từng tỉnh bao gồm nhiều thông tin quan trọng, chẳng hạn như nhiệt độ, độ ẩm, áp suất, tốc độ gió,... Mục tiêu của việc tiền xử lý dưới đây là lưu trữ các cột cần thiết từ các file, đồng thời thêm thông tin tên tỉnh thành vào để dễ dàng phân tích.

In [4]:
# Map each province's CSV base name to the display name that is written to
# the "Address" column of its output file.
location_address = {
    'LangSon': 'Lạng Sơn',
    'SonLa': 'Sơn La',
    'HaNoi': 'Hà Nội',
    'NgheAn': 'Nghệ An',
    'HoChiMinh': 'Hồ Chí Minh',
    'BenTre': 'Bến Tre',
    'LamDong': 'Lâm Đồng',
    'DaNang': 'Đà Nẵng',
}
# Base names of the CSV files to process, in processing order.
location_prefix = list(location_address)

# Columns to keep, written with a lower-case first letter.
save_cols = [
    "datetime", "datetimeEpoch", "tempmax",
    "tempmin", "temp", "dew", "humidity",
    "precip", "precipprob", "precipcover",
    "preciptype", "windgust", "windspeed",
    "winddir", "pressure", "cloudcover",
    "visibility", "solarradiation", "solarenergy",
    "uvindex", "moonphase",
]

# Header names used in the output files: the kept columns with their first
# character upper-cased (e.g. "datetimeEpoch" -> "DatetimeEpoch").
output_cols = [col[:1].upper() + col[1:] for col in save_cols]

for location in location_prefix:
    raw_df = pd.read_csv(f'./../data/input_data/{location}.csv')

    # Normalise the capitalisation of every header in a single pass so the
    # selection below matches whether the input uses "Temp" or "temp".
    raw_df = raw_df.rename(columns=lambda col: col[:1].upper() + col[1:])

    # Keep only the required columns, in save_cols order. The copy makes the
    # result an independent frame before it is modified in place below.
    raw_df = raw_df[output_cols].copy()

    # Put the province name in a new first column. A scalar is broadcast to
    # every row, so the result does not depend on the frame's index labels.
    raw_df.insert(0, 'Address', location_address[location])

    raw_df.to_csv(f'./../data/output_data/{location}.csv', index=False)

### Result
Kết quả sau khi lưu trữ các đặc trưng thời tiết của những tỉnh thành đại diện cho các vùng miền trên cả nước Việt Nam là 8 file CSV:

In [5]:
# Read the processed Hà Nội file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/HaNoi.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Hà Nội,2019-01-01,1546275600,13.0,11.0,11.8,5.3,64.5,0.0,0.0,...,26.3,19.4,13.3,1028.4,88.3,10.0,50.8,4.3,2.0,0.83
1,Hà Nội,2019-01-02,1546362000,17.0,11.5,14.1,7.0,62.8,0.1,100.0,...,22.3,17.5,0.9,1026.9,86.5,9.4,43.1,3.8,2.0,0.87
2,Hà Nội,2019-01-03,1546448400,18.1,13.0,15.3,9.0,66.3,0.01,100.0,...,16.6,13.0,39.0,1024.5,80.4,10.0,45.5,3.8,2.0,0.9
3,Hà Nội,2019-01-04,1546534800,19.0,13.0,16.3,12.4,78.1,0.878,100.0,...,19.1,10.3,53.0,1022.3,82.1,8.5,47.4,3.9,2.0,0.94
4,Hà Nội,2019-01-05,1546621200,22.0,16.0,19.0,15.1,79.5,0.0,0.0,...,18.4,11.2,38.8,1020.5,76.3,7.3,89.3,7.8,4.0,0.97


In [6]:
# Read the processed Nghệ An file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/NgheAn.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Nghệ An,2019-01-01,1546275600,15.0,12.9,14.0,10.3,78.7,0.0,0.0,...,36.0,20.9,335.4,1026.4,100.0,19.9,52.1,4.6,2.0,0.83
1,Nghệ An,2019-01-02,1546362000,15.8,12.2,14.3,10.8,79.5,0.9,100.0,...,32.4,18.7,335.4,1025.8,99.9,15.0,38.4,3.3,2.0,0.87
2,Nghệ An,2019-01-03,1546448400,17.7,14.5,16.1,13.1,82.6,0.4,100.0,...,27.7,15.5,332.2,1023.7,99.1,20.0,63.8,5.5,3.0,0.9
3,Nghệ An,2019-01-04,1546534800,20.1,16.2,17.9,15.2,84.1,0.1,100.0,...,23.8,13.0,331.4,1021.6,99.6,20.0,50.2,4.3,2.0,0.94
4,Nghệ An,2019-01-05,1546621200,21.7,17.9,19.5,17.1,85.8,0.0,0.0,...,22.3,12.6,338.2,1019.8,99.4,20.0,44.1,3.9,2.0,0.97


In [7]:
# Read the processed Hồ Chí Minh file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/HoChiMinh.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Hồ Chí Minh,2019-01-01,1546275600,28.0,24.0,26.1,18.9,64.9,3.1,100.0,...,35.6,21.1,2.7,1012.2,59.8,10.3,72.0,6.3,4.0,0.83
1,Hồ Chí Minh,2019-01-02,1546362000,25.4,24.0,24.7,21.0,79.7,16.1,100.0,...,47.2,16.2,11.1,1011.7,61.5,9.4,52.4,4.5,3.0,0.87
2,Hồ Chí Minh,2019-01-03,1546448400,30.6,23.4,27.1,21.7,72.9,0.1,100.0,...,59.8,23.1,64.1,1011.9,58.6,10.1,154.4,13.3,6.0,0.9
3,Hồ Chí Minh,2019-01-04,1546534800,34.0,25.5,29.4,21.8,65.3,0.2,100.0,...,56.2,29.5,95.3,1012.5,43.4,10.3,209.4,18.2,7.0,0.94
4,Hồ Chí Minh,2019-01-05,1546621200,33.6,25.0,29.1,21.0,63.4,0.0,0.0,...,30.6,25.5,109.2,1012.3,32.6,10.3,233.9,20.2,8.0,0.97


In [8]:
# Read the processed Bến Tre file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/BenTre.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Bến Tre,2019-01-01,1546275600,27.7,24.0,25.8,19.2,66.9,2.7,100.0,...,36.4,18.4,3.2,1012.5,57.3,10.9,81.2,7.0,3.0,0.83
1,Bến Tre,2019-01-02,1546362000,25.0,23.5,24.1,20.8,81.5,26.5,100.0,...,56.9,14.4,3.5,1012.0,66.1,9.0,38.0,3.2,1.0,0.87
2,Bến Tre,2019-01-03,1546448400,28.3,23.7,26.2,21.8,77.1,0.1,100.0,...,62.6,17.8,47.6,1012.0,65.6,9.7,109.7,9.5,5.0,0.9
3,Bến Tre,2019-01-04,1546534800,32.1,25.5,28.3,22.6,72.4,1.2,100.0,...,60.1,30.3,88.8,1012.6,50.4,10.7,196.2,17.0,7.0,0.94
4,Bến Tre,2019-01-05,1546621200,32.6,25.0,28.2,22.0,70.2,0.0,0.0,...,45.4,27.3,97.8,1012.6,36.5,10.9,223.0,19.2,8.0,0.97


In [9]:
# Read the processed Lâm Đồng file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/LamDong.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Lâm Đồng,2019-01-01,1546275600,18.3,16.4,17.4,16.4,93.9,8.8,100.0,...,55.8,20.2,29.1,1017.2,100.0,,52.9,4.6,2.0,0.83
1,Lâm Đồng,2019-01-02,1546362000,19.4,16.9,17.6,16.7,94.7,5.4,100.0,...,56.9,15.8,43.2,1017.2,100.0,,78.0,6.7,3.0,0.87
2,Lâm Đồng,2019-01-03,1546448400,22.1,17.7,19.5,17.8,90.2,3.3,100.0,...,56.9,21.6,70.6,1017.8,96.8,,184.7,15.9,7.0,0.9
3,Lâm Đồng,2019-01-04,1546534800,21.5,18.1,19.5,17.8,90.5,3.1,100.0,...,46.8,16.2,78.1,1018.9,74.9,,173.0,14.9,6.0,0.94
4,Lâm Đồng,2019-01-05,1546621200,24.5,18.0,20.4,17.4,84.0,0.0,0.0,...,43.2,13.7,73.5,1018.3,61.1,,220.1,18.9,8.0,0.97


In [10]:
# Read the processed Đà Nẵng file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/DaNang.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Đà Nẵng,2019-01-01,1546275600,21.2,18.0,20.1,17.6,85.8,0.462,100.0,...,37.1,22.3,351.0,1022.0,83.9,7.8,25.9,2.3,1.0,0.83
1,Đà Nẵng,2019-01-02,1546362000,20.0,18.0,19.0,18.1,94.1,2.03,100.0,...,32.8,9.4,314.5,1020.9,76.1,5.9,37.7,3.1,2.0,0.87
2,Đà Nẵng,2019-01-03,1546448400,23.0,19.0,20.4,18.3,88.2,2.553,100.0,...,29.9,7.6,304.4,1020.2,78.8,8.0,34.5,3.1,1.0,0.9
3,Đà Nẵng,2019-01-04,1546534800,25.0,19.4,22.0,19.1,84.3,0.134,100.0,...,25.9,11.1,310.5,1019.6,74.0,8.7,63.9,5.6,3.0,0.94
4,Đà Nẵng,2019-01-05,1546621200,26.6,21.0,23.1,20.4,85.3,0.408,100.0,...,25.6,12.9,349.3,1018.3,69.3,9.4,84.9,7.1,3.0,0.97


In [11]:
# Read the processed Sơn La file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/SonLa.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Sơn La,2019-01-01,1546275600,14.9,9.2,11.6,8.1,78.9,0.0,0.0,...,24.8,8.3,136.3,1026.7,95.1,18.8,86.9,7.5,4.0,0.83
1,Sơn La,2019-01-02,1546362000,16.6,10.4,13.0,9.4,79.4,0.0,0.0,...,24.1,7.2,135.3,1025.7,90.5,17.5,119.1,10.1,4.0,0.87
2,Sơn La,2019-01-03,1546448400,19.6,11.9,14.5,11.4,81.6,0.0,0.0,...,23.4,7.2,138.2,1023.6,86.3,8.8,118.8,10.3,4.0,0.9
3,Sơn La,2019-01-04,1546534800,22.2,12.0,16.3,12.9,81.3,0.0,0.0,...,18.0,7.6,129.1,1021.8,89.1,15.5,101.6,8.9,5.0,0.94
4,Sơn La,2019-01-05,1546621200,25.4,13.7,18.5,14.7,79.9,0.0,0.0,...,19.8,7.9,114.4,1019.8,93.2,18.0,136.8,11.8,6.0,0.97


In [12]:
# Read the processed Lạng Sơn file back and preview its first five rows.
raw_df = pd.read_csv('./../data/output_data/LangSon.csv')
raw_df.head(5)

Unnamed: 0,Address,Datetime,DatetimeEpoch,Tempmax,Tempmin,Temp,Dew,Humidity,Precip,Precipprob,...,Windgust,Windspeed,Winddir,Pressure,Cloudcover,Visibility,Solarradiation,Solarenergy,Uvindex,Moonphase
0,Lạng Sơn,2019-01-01,1546275600,8.1,5.8,7.0,3.6,79.2,0.1,100.0,...,45.7,20.9,26.3,1030.0,99.0,10.8,70.7,6.0,3.0,0.83
1,Lạng Sơn,2019-01-02,1546362000,16.2,6.6,9.9,5.2,73.0,0.0,0.0,...,41.4,19.1,20.7,1028.3,89.4,11.8,76.2,6.6,3.0,0.87
2,Lạng Sơn,2019-01-03,1546448400,11.7,8.5,9.9,8.2,89.4,0.499,100.0,...,28.1,12.2,30.1,1025.7,99.8,8.0,24.8,2.0,1.0,0.9
3,Lạng Sơn,2019-01-04,1546534800,13.8,9.9,11.9,10.9,93.9,5.192,100.0,...,18.7,8.6,25.3,1023.4,100.0,9.3,32.9,2.8,1.0,0.94
4,Lạng Sơn,2019-01-05,1546621200,16.0,11.7,13.5,12.4,93.2,0.904,100.0,...,25.6,13.0,15.2,1021.9,99.9,8.9,48.7,4.1,2.0,0.97
