## Importing the libraries

In [None]:
# pandas is used to read the CSV data and clean it
import pandas as pd
# accuracy_score is used to compute the model accuracy
from sklearn.metrics import accuracy_score
# train_test_split is used to split the dataset into train and test sets
from sklearn.model_selection import train_test_split
# StandardScaler is used to standardize the features
from sklearn.preprocessing import StandardScaler
# DecisionTreeClassifier is used to implement the decision tree model
from sklearn.tree import DecisionTreeClassifier

## Importing the dataset

[DataSet Link](https://drive.google.com/file/d/1xnpN_7GnfVQc285x6gkjVWvbyNrCn5g1/view?usp=sharing)

In [None]:
# Load the daily weather observations from daily_weather.csv into `data`.
# NOTE(review): the path is absolute at the filesystem root -- confirm this is
# where the CSV lives in the environment running the notebook.
weather_csv_path = '/daily_weather.csv'
data = pd.read_csv(weather_csv_path)

In [None]:
# Show the column names of the dataset.
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [None]:
# Preview the first five rows of the dataset.
data.head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


## Data preprocessing

In [None]:
# Preview the first rows that contain at least one null cell.
rows_with_nulls = data.isna().any(axis='columns')
data.loc[rows_with_nulls].head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
16,16,917.89,,169.2,2.192201,196.8,2.930391,0.0,0.0,48.99,51.19
111,111,915.29,58.82,182.6,15.613841,189.0,,0.0,0.0,21.5,29.69
177,177,915.9,,183.3,4.719943,189.9,5.346287,0.0,0.0,29.26,46.5
262,262,923.596607,58.380598,47.737753,10.636273,67.145843,13.671423,0.0,,17.990876,16.461685
277,277,920.48,62.6,194.4,2.751436,,3.869906,0.0,0.0,52.58,54.03


In [None]:
# Drop the 'number' column from the dataset.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# keeps this cell re-runnable: `del data['number']` raises KeyError the second
# time the cell is executed because the column is already gone.
data = data.drop(columns=['number'], errors='ignore')

In [None]:
# Record the number of rows before deleting the rows that have null cells.
before_rows = len(data)
print(before_rows)

1095


In [None]:
# Keep only complete rows: any row with at least one null cell is removed.
data = data.dropna(axis='index', how='any')

In [None]:
# Record the number of rows after deleting the rows that have null cells.
after_rows = len(data)
print(after_rows)

1064


In [None]:
# Number of rows removed because they contained null cells.
before_rows - after_rows

31

In [None]:
# Preview the 3pm relative humidity values.
data['relative_humidity_3pm'].head()

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [None]:
# Derive the binary target on a copy of the data: 1 when the 3pm relative
# humidity is above the threshold, 0 otherwise.
humidity_threshold = 24.99
clean_data = data.copy()
is_high_humidity = clean_data['relative_humidity_3pm'].gt(humidity_threshold)
clean_data['high_humidity_label'] = is_high_humidity.astype('int64')
clean_data['high_humidity_label'].head()

0    1
1    0
2    0
3    0
4    1
Name: high_humidity_label, dtype: int64

In [None]:
# Use the label column as the target y, kept as a one-column DataFrame.
y = clean_data.loc[:, ['high_humidity_label']].copy()
y.head()

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


In [None]:
# Preview the 3pm relative humidity values in clean_data.
clean_data['relative_humidity_3pm'].head()

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [None]:
# Preview the first rows of the label frame y.
y.head()

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


In [None]:
# Names of the 9am measurement columns used as model features.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
    'relative_humidity_9am',
]

In [None]:
# Build the feature frame x from the morning feature columns.
x = clean_data.loc[:, morning_features].copy()
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [None]:
# Show the column names of the label frame y.
y.columns

Index(['high_humidity_label'], dtype='object')

## Splitting the dataset into the Training set and Test set

In [None]:
# Split x and y into train and test sets, holding out 33% of the samples for
# testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

## Feature scaling


In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split, so the test
# data does not influence the preprocessing.
# StandardScaler is imported with the other libraries at the top of the notebook.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the Decision Tree Classification model on the Training set

In [None]:
# Fit a decision tree limited to 10 leaf nodes on the training split.
# fit() returns the fitted estimator, so construction and training are chained.
tree_settings = {'max_leaf_nodes': 10, 'random_state': 0}
humidity_classifier = DecisionTreeClassifier(**tree_settings).fit(X_train, y_train)
humidity_classifier

DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

In [None]:
# Show the type of humidity_classifier.
type(humidity_classifier)

sklearn.tree._classes.DecisionTreeClassifier

In [None]:
# Predict labels for the test features and store them in y_predicted.
y_predicted = humidity_classifier.predict(X_test)

In [None]:
# Show the first ten predicted labels.
y_predicted[:10]

array([0, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [None]:
# Show the first ten true labels from y_test for comparison with the predictions.
y_test['high_humidity_label'].head(10)

456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: high_humidity_label, dtype: int64

In [None]:
# Compute the test-set accuracy as a percentage.
accuracy_score(y_test,y_predicted)*100

90.05681818181817