###  Weather data for Seattle, WA from 2016
retrieved from NOAA Climate Data Online tool

## We will predict the temperature in Seattle, WA based on this data

In [46]:
# Load the weather observations
# pandas handles the tabular data manipulation
import pandas as pd

# Parse the CSV file into a DataFrame and preview its first five rows
df = pd.read_csv('temps.csv')
df.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [47]:
# Report the dimensions of the data as (number of rows, number of columns)
data_shape = df.shape
print('The shape of our features is:', data_shape)
# For this dataset the output shows 348 rows and 12 columns

The shape of our features is: (348, 12)


In [48]:
# Count the missing (NaN) entries in every column
df.isna().sum()
# Every count in the output is 0, so there are no missing values in the data

year              0
month             0
day               0
week              0
temp_2            0
temp_1            0
average           0
actual            0
forecast_noaa     0
forecast_acc      0
forecast_under    0
friend            0
dtype: int64

In [49]:
# Summarize the DataFrame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            348 non-null    int64  
 1   month           348 non-null    int64  
 2   day             348 non-null    int64  
 3   week            348 non-null    object 
 4   temp_2          348 non-null    int64  
 5   temp_1          348 non-null    int64  
 6   average         348 non-null    float64
 7   actual          348 non-null    int64  
 8   forecast_noaa   348 non-null    int64  
 9   forecast_acc    348 non-null    int64  
 10  forecast_under  348 non-null    int64  
 11  friend          348 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 32.8+ KB


- The information is in the tidy data format with each row forming one observation, with the variable values in the columns.


Following are explanations of the columns:
- year: 2016 for all data points
- month: number for month of the year
- day: number for day of the month
- week: day of the week as a character string
- temp_2: max temperature 2 days prior
- temp_1: max temperature 1 day prior
- average: historical average max temperature
- actual: max temperature measurement
- friend: your friend’s prediction, a random number between 20 below the average and 20 above the average

In [50]:
# numpy provides the array type used for the model inputs
import numpy as np

# The labels (y) are the values we want to predict: the 'actual' column
labels = df['actual'].to_numpy(copy=True)

In [51]:
# Separate the predictors from the target: remove the label column ('actual')
# together with the other columns that are not used as features.
# A single drop(columns=...) call removes them all at once.
# NOTE: df is rebound here, so re-running this cell on the already-trimmed
# frame raises a KeyError because the columns no longer exist.
columns_to_drop = [
    'actual',
    "forecast_noaa",
    'friend',
    'forecast_acc',
    'forecast_under',
    'average',
    'week',
]
df = df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0,year,month,day,temp_2,temp_1
0,2016,1,1,45,45
1,2016,1,2,44,45


In [52]:
# Keep the remaining column (feature) names, in order, as a plain list;
# they are reused later as labels when exporting the decision trees
df_list = df.columns.tolist()
df_list

['year', 'month', 'day', 'temp_2', 'temp_1']

In [53]:
# X: the feature matrix, i.e. the remaining columns as a 2-D numpy array
features = df.to_numpy(copy=True)

## Splitting the data 

In [54]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (a 75/25 split, since test_size = 0.25).
# The numpy `features` array is passed instead of the DataFrame so the model is
# fitted on plain arrays, which matches the plain-list input handed to
# rf.predict further down. random_state pins the shuffle so the split is
# reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size = 0.25, random_state = 42
)

In [55]:
# Show the dimensions of each piece of the train/test split
for caption, part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, part.shape)

Training Features Shape: (261, 5)
Training Labels Shape: (261,)
Testing Features Shape: (87, 5)
Testing Labels Shape: (87,)


# Preparing our Model

In [56]:
# The model: a random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# Build a forest of 1000 decision trees; the fixed seed makes training repeatable
rf = RandomForestRegressor(
    n_estimators = 1000,
    random_state = 42,
)
# Fit the forest on the training split
rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [57]:
# Use the forest's predict method on the held-out test data;
# the predictions are compared against test_labels in the error check below
predictions = rf.predict(test_features)

In [58]:
# Score the model on both splits (for a scikit-learn regressor, score() is R^2).
# Comparing the two numbers shows how much better the model fits the data it
# was trained on than the data it has not seen.
test_score = rf.score(test_features, test_labels)
train_score = rf.score(train_features, train_labels)
print("Score for testing data =", test_score)
print("Score for training data =", train_score)

Score for testing data = 0.8077867523474935
Score for training data = 0.9716043782231992


In [59]:
# Mean absolute error between the true test labels and the predictions
from sklearn import metrics

mae = metrics.mean_absolute_error(test_labels, predictions)
print(mae)


4.0294252873563225


In [65]:
# Feature order expected by the model: [year, month, day, temp_2, temp_1]
sample_day = [[2018, 5, 3, 46, 48]]
our_predict = rf.predict(sample_day)[0]
our_predict
# Interpretation of the result:
# for 3 May 2018, if the temperature two days before was 46 and the day before
# was 48, the model predicts a temperature of about 49.9 for that day



49.886

## Visualizing a Single Decision Tree

In [66]:
# Visualization tools: export_graphviz writes a tree as a .dot file,
# pydot reads that file back
from sklearn.tree import export_graphviz
import pydot

# Take a single estimator (index 5) out of the fitted forest and display it
forest_trees = rf.estimators_
tree = forest_trees[5]
tree

DecisionTreeRegressor(max_features='auto', random_state=1201263687)

In [67]:
# Write the selected tree to 'tree.dot', labelling the features with the
# column names collected in df_list
export_graphviz(
    tree,
    out_file = 'tree.dot',
    feature_names = df_list,
    rounded = True,
    precision = 1,
)

In [68]:
# Read the .dot file back; the unpacking requires it to hold exactly one graph
[graph] = pydot.graph_from_dot_file('tree.dot')

In [69]:
# Render the graph loaded from 'tree.dot' and save it as the image 'tree.png'
graph.write_png('tree.png')

As we can see, tree no. 5 is very dense, so let's limit the depth of the trees and plot a smaller graph.

In [70]:
# A smaller forest used only for visualization: 10 trees, each limited to a
# depth of 3 levels.
# random_state is fixed (as for the main model) so that the forest, and
# therefore the tree later extracted from it for plotting, is the same on
# every run.
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state = 42)
rf_small.fit(train_features, train_labels)


RandomForestRegressor(max_depth=3, n_estimators=10)

In [73]:
# Extract one tree (index 5) from the small, depth-limited forest
tree_small = rf_small.estimators_[5]

In [72]:
# Export the small tree to a .dot file, read it back, and save it as a PNG image
export_graphviz(
    tree_small,
    out_file = 'small_tree.dot',
    feature_names = df_list,
    rounded = True,
    precision = 1,
)
[graph] = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
