script.py
#! /usr/bin/python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dtime
import scipy.stats as st
import statsmodels.api as sm
from scipy.stats import t
def predictions(dataframe, cost_history_filename=None, plots=False):
    '''
    Build the feature matrix and target vector for predicting the hourly
    ridership of the NYC subway (ENTRIESn_hourly) from the turnstile/weather
    dataframe. The regression itself (OLS in main(), or gradient descent via
    gradient_descent()) is run by the caller on the arrays returned here.
    '''
    # Select features (try different features!)
dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
# days = map(lambda date_string: \
# dtime.datetime.strptime(date_string,"%m-%d-%y").weekday(), \
# dataframe.loc[:,'DATEn'])
dummy_days = pd.get_dummies(dataframe['day_week'], prefix='day')
dummy_hours = pd.get_dummies(dataframe['hour'], prefix='hour')
dummy_conds = pd.get_dummies(dataframe['conds'], prefix='conds')
features = dataframe[['precipi','fog','wspdi','pressurei']]
features, mu, sigma = normalize_features(features)
features['pressure2']=features['pressurei']*features['pressurei']
features = features[['precipi','fog','wspdi','pressure2']]
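    # join the one-hot encoded categorical features (day of week, hour,
    # remote unit, weather conditions) onto the normalized weather features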
features = features.join(dummy_days).join(dummy_hours).join(dummy_units).join(dummy_conds)
# print "Not normalized features"
# print features.head()
# print "Normalized features"
# print features.head()
values = dataframe[['ENTRIESn_hourly']]
m = len(values)
if plots:
for feat in features.columns:
plt.figure()
# plt.scatter( values, features['precipi'])
plt.scatter(features[feat],values)
plt.savefig('scatterplots/'+feat)
plt.close()
features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
# Convert features and values to numpy arrays
features_array = np.array(features)
values_array = np.array(values).flatten()
return features_array, values_array
def normalize_features(array):
    """
    Normalize the features in the data set (zero mean, unit standard deviation).
    """
    mu = array.mean()
    sigma = array.std()
    array_normalized = (array - mu) / sigma
    return array_normalized, mu, sigma
def compute_cost(features, values, theta):
"""
Compute the cost function given a set of features / values,
and the values for our thetas.
This can be the same code as the compute_cost function in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
cost = sum_of_square_errors / (2*m)
return cost
def gradient_descent(features, values, theta, alpha, num_iterations):
"""
Perform gradient descent given a data set with an arbitrary number of features.
This can be the same gradient descent code as in the lesson #3 exercises,
but feel free to implement your own.
"""
m = len(values)
cost_history = []
X = np.matrix(features)
Y = np.matrix(values).transpose()
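    # batch gradient descent update: theta := theta - (alpha/m) * X^T (X*theta - y),
    # recording the cost before each update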
for i in range(num_iterations):
TH = np.matrix(theta).transpose()
XTY = X*TH-Y
cost = compute_cost(features, values, theta)
cost_history.append(cost)
TH = TH - alpha/m * (X.transpose()*XTY)
theta = np.array(TH.transpose())[0]
return theta, pd.Series(cost_history)
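# Example (not run by main(), which uses statsmodels OLS instead): gradient
# descent could be applied to the arrays returned by predictions() roughly as
# follows (alpha and num_iterations are arbitrary illustrative values):
#   features, values = predictions(turnstile_weather)
#   theta0 = np.zeros(features.shape[1])
#   theta, cost_history = gradient_descent(features, values, theta0,
#                                           alpha=0.1, num_iterations=75)
#   plot_cost_history(0.1, cost_history, "cost_history.png")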
def plot_cost_history(alpha, cost_history, filename):
"""This function is for viewing the plot of your cost history."""
cost_df = pd.DataFrame({
'Cost_History': cost_history,
'Iteration': range(len(cost_history))
})
plt.figure()
plt.plot(cost_df['Iteration'], cost_df['Cost_History'])
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.savefig(filename)
def compute_r_squared(data, predictions):
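    """
    Coefficient of determination computed as SSreg / SStot. For in-sample
    predictions from an OLS fit with an intercept (the 'ones' column added in
    predictions()) this matches the usual 1 - SSres / SStot definition; on
    held-out data the two can differ.
    """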
SST = ((data-np.mean(data))**2).sum()
SSReg = ((predictions-np.mean(data))**2).sum()
r_squared = SSReg / SST
return r_squared
def plot_hourly_entries(turnstile_weather, filename):
    '''
    Build visualizations from the turnstile_weather dataframe (the MTA
    turnstile and weather data used in assignment #3):
      * average hourly entries per unit, split into rainy vs. clear conditions
      * average daily entries by day of week
      * total entries per date (saved separately as "daily entries.png")
    The assignment suggests ggplot (https://pypi.python.org/pypi/ggplot/);
    matplotlib is used here instead.
    All columns and data points of the turnstile_weather dataframe can be
    inspected at:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    Note that, due to the limitations of the course's Amazon EC2 server, the
    dataframe contains only about 1/3 of the actual data.
    '''
    df = turnstile_weather[['DATEn','hour','rain','ENTRIESn_hourly']]
    # note: this per-date sum is overwritten by the next statement and is not
    # used for the hourly plot (daily totals are recomputed further below)
    agg = df.groupby(['DATEn','hour','rain'], as_index=False).aggregate(np.sum)
    # average entries per unit over the whole period, grouped by 4-hour
    # interval and rain flag
    agg = df.groupby(['hour', 'rain'], as_index=False).aggregate(np.mean)
rain = agg[agg.rain==1][['ENTRIESn_hourly','hour']]
clear = agg[agg.rain==0][['ENTRIESn_hourly','hour']]
plt.subplots_adjust(hspace=.4)
plt.subplot(211)
plt.title('Average hourly entries per unit')
w = .35
intervals = ['00-04','04-08','08-12','12-16','16-20','20-00']
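    # the series are rotated one slot so that the hour-0 rows line up with the
    # '20-00' label (i.e. each 'hour' value is treated as the end of its interval)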
hourly_rain = np.array(rain['ENTRIESn_hourly'])
hourly_rain = np.concatenate([hourly_rain[1:],[hourly_rain[0]]])
hourly_clear = np.array(clear['ENTRIESn_hourly'])
hourly_clear = np.concatenate([hourly_clear[1:],[hourly_clear[0]]])
plt.bar(np.array(range(len(intervals))) + w/2,
hourly_rain,
color='b', label='rainy', width=w)
plt.bar(np.array(range(len(intervals))) + w+w/2,
hourly_clear,
color='y', label='clear', width=w)
# print rain[['ENTRIESn_hourly','hour']]
# print hourly_rain
plt.xticks(np.array(range(len(intervals)))+w+w/2,intervals)
# plt.plot(clear['hour'], clear['ENTRIESn_hourly'], color='y', label='clear')
plt.xlabel('hour interval')
plt.ylabel('Average Entries')
plt.legend(loc=0)
plt.subplot(212)
plt.title('Average daily entries')
df = turnstile_weather[['DATEn','day_week','ENTRIESn_hourly']]
agg = df.groupby(['DATEn','day_week'], as_index=False).aggregate(np.sum)
agg2 = agg.groupby(['day_week'], as_index=False).aggregate(np.mean)
# rain = agg[agg.rain==1][['ENTRIESn_hourly','day_week']]
# clear = agg[agg.rain==0][['ENTRIESn_hourly','day_week']]
data = agg2[['ENTRIESn_hourly','day_week']]
day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.bar(np.array(data['day_week'])+w/2, data['ENTRIESn_hourly'], width=w, color='b')
# plt.bar(np.array(clear['day_week'])+w, clear['ENTRIESn_hourly'], width=w, color='y', label='clear')
plt.xticks(np.array(range(7))+w,day)
plt.ylabel("Entries")
#clear['day_week'], clear['ENTRIESn_hourly']\
plt.savefig(filename)
plt.close()
plt.figure()
plt.title("Total Number of Entries by Day")
plt.xlabel("Date")
plt.ylabel("Total entries")
plt.bar(range(len(agg)), agg['ENTRIESn_hourly'])
plt.xticks(np.array(range(len(agg)))+0.5, agg['DATEn'], rotation='vertical')
plt.subplots_adjust(bottom=0.2, left = 0.15)
(x1,x2,y1,y2) = plt.axis()
plt.axis((0,31,y1,y2))
plt.savefig("daily entries.png")
plt.close()
#agg['hour'], agg['ENTRIESn_hourly']
def plot_entries_histogram(clear_w,rainy_w, n_bins):
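    '''
    Plot histograms of hourly entries for clear vs. rainy observations:
    subplot (a) shows raw counts, subplot (b) weights each group so its bars
    sum to 1, making the two differently sized samples comparable.
    '''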
plt.subplots_adjust(hspace=0.4)
plt.subplot(211)
plt.hist((clear_w, rainy_w), bins = n_bins, label=('clear','rainy'), color=('y','b'))
plt.title("Histogram of Hourly Entries (a)")
plt.xlabel("Entries")
plt.ylabel("count")
plt.legend()
plt.subplot(212)
    normed_clear = np.ones_like(clear_w, dtype=float)/len(clear_w)
    normed_rainy = np.ones_like(rainy_w, dtype=float)/len(rainy_w)
    plt.hist((clear_w, rainy_w), weights=(normed_clear, normed_rainy), bins=n_bins, label=('clear','rainy'), color=('y','b'))
    plt.title("Histogram of Hourly Entries (b)")
    plt.xlabel("Entries")
    plt.ylabel("Fraction")
plt.legend()
plt.savefig("histogram of hourly entries")
plt.close()
def CV_experiment(features, values, log):
    '''
    k-fold cross validation of the OLS model: fit on k-1 folds, compute
    R squared on the held-out fold, and log the mean, standard deviation
    and 95% confidence interval of the per-fold R squared values.
    '''
    indices = np.arange(len(features))
    np.random.seed(1)
    np.random.shuffle(indices)
    k_cv = 10
    test_set_len = len(features) // k_cv  # fold size (integer division)
    R_sq_array = []
    for k in range(k_cv):
        train_i = np.concatenate((indices[:k*test_set_len],
                                  indices[(k+1)*test_set_len:]))
        test_i = indices[k*test_set_len:(k+1)*test_set_len]
model = sm.OLS(values[train_i], features[train_i])
results = model.fit()
predicted_values = results.predict(features[test_i])
r_sq = compute_r_squared(values[test_i], predicted_values)
R_sq_array.append(r_sq)
log.write(str(r_sq)+'\n')
Rm = np.mean(R_sq_array)
Rsig = np.std(R_sq_array)
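    # 95% interval based on the spread of the per-fold R squared values;
    # a confidence interval for the mean would use scale=Rsig/np.sqrt(len(R_sq_array))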
conf_interval = t.interval(.95,len(R_sq_array)-1,loc=Rm, scale=Rsig)
log.write("\nAverage R squared\n")
log.write(str(Rm))
log.write("\nR squared STD\n")
log.write(str(Rsig))
log.write("\nR squared 95% confidence interval:\n")
log.write(str(conf_interval))
def main():
log = open("NYC stats.txt", 'w')
log.write("NYC statistics\n\n")
turnstile_weather = pd.read_csv("turnstile_weather_v2.csv")
turnstile_weather['datetime'] = turnstile_weather['DATEn'] + ' ' + turnstile_weather['TIMEn']
clear_w = turnstile_weather[turnstile_weather.rain==0]['ENTRIESn_hourly']
rainy_w = turnstile_weather[turnstile_weather.rain==1]['ENTRIESn_hourly']
n_bins=40
plot_entries_histogram(clear_w, rainy_w,n_bins)
log.write("saved histogram of hourly entries, with "+str(n_bins)+" bins\n=============\n\n")
u,p = st.mannwhitneyu(clear_w, rainy_w)
log.write("Mann Whitney test (two tailed test):\np="+ str(2*p)+ "\nU="+ str(u)+"\n\n")
log.write("mean_clear="+ str(np.mean(clear_w)) + "\n")
log.write("mean_rainy="+ str(np.mean(rainy_w)) + "\n\n")
plot_hourly_entries(turnstile_weather, "hourly entries.png")
#set plots=True in predictions to generate all scatterplots
features, values = predictions(turnstile_weather, plots=False)
model = sm.OLS(values, features)
results = model.fit()
predicted_values = results.predict(features)
r_squared = compute_r_squared(turnstile_weather['ENTRIESn_hourly'], predicted_values)
    log.write("R2 value: "+str(r_squared)+'\n\n')
# Theta values
theta = results.params
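    # only the coefficients of the four continuous weather features are logged;
    # the dummy-variable coefficients (day, hour, unit, conds) are omitted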
log.write("Theta values:\n")
log.write('precipi: '+str(theta[0])+'\n')
log.write('fog: '+str(theta[1])+'\n')
log.write('wspdi: '+str(theta[2])+'\n')
log.write('pressure2: '+str(theta[3])+'\n\n')
# k fold cross validation
log.write("cross validation experiment\n\nR squared:\n")
# uncomment the following line to start the experiment
# CV_experiment(features, values, log)
plt.figure()
plt.title("Model Accuracy")
plt.scatter(values, predicted_values)
plt.xlabel("Real Data (Number of entries)")
plt.ylabel("Modeled Data (Predicted number of entries)")
plt.plot(range(35000),range(35000), color='r')
plt.savefig("Accuracy.png")
plt.close()
plt.figure()
plt.title("Residuals")
plt.scatter(values, (values-predicted_values))
plt.xlabel("Real Data (Number of Entries)")
plt.ylabel("Residuals")
plt.savefig("Residuals")
plt.close()
log.close()
if __name__ == '__main__':
main()