In [35]:
# Daily weather data analysis using decision tress classifier 
# Importing the necessary libraries
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 

In [11]:
data=pd.read_csv("../data/data_weather.csv")
print("Columns are: ", data.columns)
print("data: \n", data)
print("Null data: \n", data[data.isnull().any(axis=1)])

Columns are:  Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')
data: 
       number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
0          0        918.060000     74.822000              271.100000   
1          1        917.347688     71.403843              101.935179   
2          2        923.040000     60.638000               51.000000   
3          3        920.502751     70.138895              198.832133   
4          4        921.160000     44.294000              277.800000   
...      ...               ...           ...                     ...   
1090    1090        918.900000     63.104000              192.900000   
1091    1091        918.710000     49.568000              241.600000   
1092    1092        916.600000     71.096

## Daily weather data description
The file daily_weather.csv is a comma separated file that contains weather data. This data comes from a weather station. The weather station is equipped with censors that capture weather related measurements such as air temperature, air pressure and relative humidity. Data was collected for a period of 3 years, from september 2011 to September 2014, to ensure that sufficient data for different seasons and weather conditions is captured.
Let us now check all teh columns of data.
* Each row in data_weather.csv captures weather data for a separate day.
* Sensor measurements from a weather station were captured at one-minute intervals. These measurements were then processed to generate values to describe daily weather. Since this dataset was created to classify low-humidity days Vs non-low-humidity days(that is, days with normal or high humidity), teh variables included are weather measurements in the morning, with one measurements namely relatively humidity, in the afternoon. The idea is to use the morning weather values to predict whether the day will be low humidity or not based on the afternoon measurements of relative humidity.
* Each row consists of the following variables:
    * number: unique number for each row
    * air_pressure_9am: air pressure averaged over a period from 8:55am to 9:04am(Unit: hectopascals)
    * air_temp_9am: air temperature averaged over a period from 8:55am to 9:04am(unit:fahreinheit)
    * air_wind_direction_9am: wind direction averaged over a period from 8:55am to 9:04am(Unit: degrees, with 0 means coming from the north, and increasing clockwise)
    * air_wind_speed_9am: wind speed averaged over a period from 8:55am to 9:04am(Unit:miles per hour)
    * max_wind_direction_9am: wind gust direction averaged over a period from 8:55am to 9:10am(Unit:degrees, with 0 means coming from the north, and increasing clockwise)
    * max_wind_speed_9am: wind gust speed averaged over a period from 8:55am to 9:04am (Unit: miles per hour)
    * rain_accumulation_9am: amount of rain accumulated in the 24 hours prior to 9am (Unit: millimeters)
    * rain_duration_9am: amount of time rain was recorded in the 24 hours prior to 9am (Unit: seconds)
    

## Data Cleaning Steps

In [12]:
# not need the number column
del data["number"]
# drop the null values
before_rows=data.shape[0]
print("before rows: ",before_rows)

data=data.dropna()
after_rows=data.shape[0]
print("after rows: ", after_rows)

print("total rows dropped= ", before_rows-after_rows)

before rows:  1095
after rows:  1064
total rows dropped=  31


In [19]:
# Convert to a classification task
# binarize the relative_humidity_3pm to 0 or 1

clean_data=data.copy()
clean_data["high_humidity_label"]=(clean_data["relative_humidity_3pm"]>24.99)*1

# print("Converted high humidity label:\n", clean_data["high_humidity_label"])

# target is stored in 'y'
y=clean_data[['high_humidity_label']].copy()
clean_data["relative_humidity_3pm"].head()

print("Y data:\n", y.head())

Y data:
    high_humidity_label
0                    1
1                    0
2                    0
3                    0
4                    1


### Use 9am sensor signals as features to predict humidity at 3pm



In [22]:
morning_features=["air_pressure_9am", "air_temp_9am", "avg_wind_direction_9am", "avg_wind_speed_9am", "max_wind_direction_9am", "max_wind_speed_9am", "rain_accumulation_9am", "rain_duration_9am"]

X=clean_data[morning_features].copy()
print("columns in X: ", X.columns)
print("Columns in Y: ", y.columns)

columns in X:  Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')
Columns in Y:  Index(['high_humidity_label'], dtype='object')


### Perform train and test splits
* In the training phase the learning algorithm uses the training data to adjust the model's parameters to minimize errors. At the end of the training phase, we get the trained model
* In the testing phase, the trained model is applied to the test data. Test data is separate from the training data, and is previously unseen by the model. The model is then evaluated on how it performs in test data. The goal in building a classifier model is to have the model perform well on training as well as test data.

In [26]:
X_train, X_test, Y_train, Y_test=train_test_split(X, y, test_size=0.33, random_state=324)
print("X_train is as under: ")
print(X_train.head())
print("X_test is as under: ")
print(X_test.head())
print("Y_train is as under: ")
print(Y_train.head())
print("Y_test is as under: ")
print(Y_test.head())

X_train is as under: 
     air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
841        918.370000     72.932000              184.500000   
75         920.100000     53.492000              186.100000   
95         927.610000     54.896000               55.000000   
895        919.235153     65.951112              194.343333   
699        919.888128     68.687822              228.517730   

     avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  \
841            2.013246              186.700000            2.773806   
75            13.444009              193.800000           15.367778   
95             4.988376               53.400000            7.202947   
895            2.942019              216.569792            3.658810   
699            3.960858              247.954028            5.185547   

     rain_accumulation_9am  rain_duration_9am  
841                    0.0                0.0  
75                     0.0                0.0  
95                     0.0  

In [31]:
print("let us describe y train: ")
Y_train.describe()

# Fit on Training set
humidity_classifier=DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

humidity_classifier.fit(X_train, Y_train)
type(humidity_classifier)

let us describe y train: 


sklearn.tree._classes.DecisionTreeClassifier

### Predict on the test set

In [32]:
predictions=humidity_classifier.predict(X_test)
print("Sample predictions: \n", predictions[:10])
print("Sample Y Test(Actual Data):\n", Y_test[:10])

Sample predictions: 
 [0 0 1 1 1 1 0 0 0 1]
Sample Y Test(Actual Data):
       high_humidity_label
456                     0
845                     0
693                     1
259                     1
723                     1
224                     1
300                     1
442                     0
585                     1
1057                    1


In [37]:
# Measure accuracy of the classifier
print("Accuracy: \n", accuracy_score(y_true=Y_test, y_pred=predictions))

Accuracy: 
 0.8153409090909091
