###  Weather data for Seattle, WA from 2016
Data retrieved from the NOAA Climate Data Online tool.

In [59]:
# Dependencies
# pandas supplies the DataFrame used for all data handling in this notebook
import pandas as pd
# Load the temperature records from the CSV file and preview the top five rows
# (head() returns five rows by default)
df = pd.read_csv('temps.csv')
df.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


- The information is in the tidy data format with each row forming one observation, with the variable values in the columns.


Following are explanations of the columns:
- year: 2016 for all data points
- month: number for month of the year
- day: number for day of the month
- week: day of the week as a character string
- temp_2: max temperature 2 days prior
- temp_1: max temperature 1 day prior
- average: historical average max temperature
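- actual: max temperature measurement
- forecast_noaa, forecast_acc, forecast_under: max temperature forecasts from three external sources (the suffixes suggest NOAA, AccuWeather and Weather Underground — confirm against the data source)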
- friend: your friend’s prediction, a random number between 20 below the average and 20 above the average

In [80]:
# Check the shape of the data, i.e. how many rows and columns we have
print('The shape of our features is:', df.shape)
# On the freshly loaded frame this is 348 rows and 12 columns. The (348, 16) shown
# in the captured output comes from running this cell (In [80]) after later cells
# had already one-hot encoded `week` and dropped two columns from df.

The shape of our features is: (348, 16)


In [61]:
# Count missing (NaN) entries per column; every count is 0, so the data
# has no missing values and needs no imputation
df.isna().sum()

year              0
month             0
day               0
week              0
temp_2            0
temp_1            0
average           0
actual            0
forecast_noaa     0
forecast_acc      0
forecast_under    0
friend            0
dtype: int64

In [62]:
# Summarise the column dtypes and non-null counts of the data
df.info()
# There is one categorical (object) column: `week`.
# A regression model needs numeric inputs, so it will be converted to
# numeric indicator (dummy) columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            348 non-null    int64  
 1   month           348 non-null    int64  
 2   day             348 non-null    int64  
 3   week            348 non-null    object 
 4   temp_2          348 non-null    int64  
 5   temp_1          348 non-null    int64  
 6   average         348 non-null    float64
 7   actual          348 non-null    int64  
 8   forecast_noaa   348 non-null    int64  
 9   forecast_acc    348 non-null    int64  
 10  forecast_under  348 non-null    int64  
 11  friend          348 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 32.8+ KB


In [63]:
# One-hot encode the categorical `week` column with pandas get_dummies.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.0 switched the default dummy dtype to bool), so the frame stays
# fully numeric when it is later converted to a NumPy array.
df = pd.get_dummies(df, dtype=int)
# Preview the first 5 rows from column position 5 onward (the last 13 columns)
df.iloc[:, 5:].head(5)

Unnamed: 0,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [64]:
# Re-check the shape after encoding: the single `week` column has been replaced
# by seven week_* indicator columns
df.shape

(348, 18)

In [65]:
# NumPy supplies the array type handed to scikit-learn
import numpy as np

# Target vector: the observed max temperature we want to predict
labels = np.array(df['actual'])

# Feature table: remove the target and the NOAA forecast in one column-wise drop.
# Note: df is overwritten here, so re-running this cell without reloading the data
# fails because 'actual' is no longer present.
df = df.drop(columns=['actual', 'forecast_noaa'])

# Every remaining column becomes part of the feature matrix
features = np.array(df)

In [66]:
# Keep the feature column names (in column order) so the exported trees
# can label their split nodes later on
df_list = df.columns.tolist()
df_list

['year',
 'month',
 'day',
 'temp_2',
 'temp_1',
 'average',
 'forecast_acc',
 'forecast_under',
 'friend',
 'week_Fri',
 'week_Mon',
 'week_Sat',
 'week_Sun',
 'week_Thurs',
 'week_Tues',
 'week_Wed']

In [67]:
# scikit-learn helper that partitions arrays into training and testing subsets
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [68]:
# Report the dimensions of each split: features and labels must agree on row count
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, array.shape)

Training Features Shape: (261, 16)
Training Labels Shape: (261,)
Testing Features Shape: (87, 16)
Testing Labels Shape: (87,)


# Preparing our Model

In [69]:
# Random forest regressor from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestRegressor
# Build a forest of 1000 decision trees (seeded for repeatability) and train it
# on the training split; fit() returns the estimator itself, so it can be chained
rf = RandomForestRegressor(n_estimators=1000, random_state=42).fit(train_features, train_labels)
# Show the fitted estimator as the cell output
rf

RandomForestRegressor(n_estimators=1000, random_state=42)

In [70]:
# Run the fitted forest on the held-out test features to get predicted temperatures
predictions = rf.predict(test_features)

In [71]:
# Score the model on both splits (for a regressor, score() is the R^2 coefficient).
# A training score well above the testing score indicates some overfitting.
test_score = rf.score(test_features, test_labels)
train_score = rf.score(train_features, train_labels)
print("Score for testing data =", test_score)
print("Score for training data =", train_score)

Score for testing data = 0.8149088174655048
Score for training data = 0.9746126233418512


In [72]:
# Mean absolute error (MAE) between the true test labels and the predictions
from sklearn import metrics
mae = metrics.mean_absolute_error(test_labels, predictions)
print(mae)

# The same number can be computed by hand without sklearn:
#   np.mean(abs(predictions - test_labels))

3.859586206896551


## Visualizing a Single Decision Tree

In [73]:

# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Select a single fitted decision tree from the forest to inspect;
# index 5 is an arbitrary choice among the forest's estimators
tree_index = 5
tree = rf.estimators_[tree_index]
tree

DecisionTreeRegressor(max_features='auto', random_state=1201263687)

In [74]:
# Write the tree structure to a Graphviz .dot file, labelling the split nodes
# with the feature names and showing values to one decimal place
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=df_list,
    rounded=True,
    precision=1,
)

In [75]:
# Parse the .dot file back into a pydot graph; the single-element unpacking
# requires the file to contain exactly one graph
[graph] = pydot.graph_from_dot_file('tree.dot')

In [76]:
# Render the graph to a PNG image file on disk
graph.write_png('tree.png')

As we can see, the full tree is too large to visualize comfortably, so let's build a new model with fewer, shallower trees.

In [77]:
# Build a deliberately small forest (10 trees, each at most 3 levels deep) so that
# a single tree is small enough to render legibly.
# random_state is fixed, as for the main forest, so the tree exported afterwards is
# the same on every Restart & Run All instead of changing with each fit.
rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
rf_small.fit(train_features, train_labels)


RandomForestRegressor(max_depth=3, n_estimators=10)

In [78]:
# Take one tree (index 5 of the 10) from the small forest for plotting
tree_small = rf_small.estimators_[5]

In [79]:
# Export the small tree to a .dot file, labelling split nodes with the feature names
export_graphviz(
    tree_small,
    out_file='small_tree.dot',
    feature_names=df_list,
    rounded=True,
    precision=1,
)
# Parse the .dot file (exactly one graph expected) and render it as a PNG image;
# this reuses and overwrites the `graph` variable
[graph] = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')