In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import classification_report,confusion_matrix
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('weatherHistory.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
Formatted Date              96453 non-null object
Summary                     96453 non-null object
Precip Type                 95936 non-null object
Temperature (C)             96453 non-null float64
Apparent Temperature (C)    96453 non-null float64
Humidity                    96453 non-null float64
Wind Speed (km/h)           96453 non-null float64
Wind Bearing (degrees)      96453 non-null float64
Visibility (km)             96453 non-null float64
Loud Cover                  96453 non-null float64
Pressure (millibars)        96453 non-null float64
Daily Summary               96453 non-null object
dtypes: float64(8), object(4)
memory usage: 8.8+ MB


In [4]:
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [5]:
data.describe()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


In [6]:
data = data.dropna()

In [7]:
data.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')

In [8]:
data.rename(columns = {'Temperature (C)':'temperature', 'Apparent Temperature (C)':'apparentTemp', 'Wind Speed (km/h)': 'windSpeed', 'Visibility (km)':'visibility', 'Pressure (millibars)':'pressure'}, inplace = True)

In [9]:
data.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'temperature',
       'apparentTemp', 'Humidity', 'windSpeed', 'Wind Bearing (degrees)',
       'visibility', 'Loud Cover', 'pressure', 'Daily Summary'],
      dtype='object')

In [10]:
le = LabelEncoder()
precip = data['Precip Type'].values
data['PrecipType2']= le.fit_transform(precip)

summary = data['Summary'].values
data['Summary2'] = le.fit_transform(summary)




In [11]:
y = data['apparentTemp']
x = data[['Humidity', 'windSpeed', 'visibility', 'pressure', 'PrecipType2', 'Summary2', 'Wind Bearing (degrees)']]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, shuffle = True)

In [13]:
lr = LinearRegression()
lr.fit(x_train, y_train)
print(lr.coef_)
print(lr.intercept_*100)

[-2.85763243e+01 -3.31782779e-01  1.77231834e-01 -1.07161454e-03
 -1.47501660e+01  5.40313573e-02  3.08347108e-03]
3489.975150161605


In [14]:
yPrediction = lr.predict(x_test)
score = round(lr.score(x_test, y_test), 4)
print('Accuracy of Model:', score*100,'%')

Accuracy of Model: 59.61 %


So now we see that our Linear Regression Model wasn't that accurate. However, to note, the score being given is the percent of time that the predicted value was exactly what was in the training model. So in the real world, while we may not be extremely accurate at predicting the exact value for the weather, we may have a much higher accuracy if we were to guess values within a range and get our estimates within the ball park

In [40]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [41]:
iterations = 10000
mlp = MLPRegressor(hidden_layer_sizes=(100, 100),max_iter = iterations, early_stopping = True)
mlp.fit(x_train, y_train)
predictions = mlp.predict(x_test)
score = round(mlp.score(x_test, y_test), 4)
print('Accuracy after', iterations,'iterations:', score*100, '%')


Accuracy after 10000 iterations: 73.02 %
