# Prepare Data

In [15]:
# Maps each field name to the position used to index a comma-split data row.
# NOTE(review): no name is assigned to position 8, and two keys
# ('YearMonthDay,Time' and 'Visibility,Weather Type') themselves contain a
# comma, so they may each span two physical columns -- verify these positions
# against the header line of the data file.
field_indexes = dict([
    # station / observation identification
    ('Wban Number', 0),
    ('YearMonthDay,Time', 1),
    ('StationType', 2),
    ('MaintenanceIndicator', 3),
    # sky and visibility
    ('SkyConditions', 4),
    ('Visibility,Weather Type', 5),
    # temperature and humidity
    ('DryBulb Temp', 6),
    ('DewPoint Temp', 7),
    ('WetBulb Temp', 9),
    ('% Relative Humidity', 10),
    # wind
    ('WindSpeed (kt)', 11),
    ('WindDirection', 12),
    ('WindChar. Gusts (kt)', 13),
    ('Valfor Wind Char.', 14),
    # pressure
    ('StationPressure', 15),
    ('PressureTendency', 16),
    ('SeaLevel Pressure', 17),
    # record bookkeeping and precipitation
    ('RecordType', 18),
    ('Precip.Total', 19),
])

In [71]:
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import metrics 

import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [72]:
# Open the raw hourly weather file (path is relative to the notebook's working
# directory) in binary mode; the handle is read in the following cell.
# NOTE(review): the file name suggests October 1996 hourly observations -- confirm.
f = open('./WeatherDirs/199610hourly.txt', 'rb')

In [73]:
# Read the whole file and split it into lines.  The `with` block closes the
# handle as soon as the contents are in memory (even if the read fails), so the
# open file is not leaked for the lifetime of the kernel.  Re-running this cell
# without re-opening `f` now raises instead of silently yielding empty data.
with f:
    new_line = f.read().split('\n')

In [74]:
# Skip the header (first) line, then split every remaining non-blank line into
# its comma-separated fields.  Filtering on content -- instead of blindly
# slicing off the last element -- gives the same result when the file ends with
# a newline, but no longer discards the final record when it does not, and it
# also ignores stray blank lines.
rows = [line.split(',') for line in new_line[1:] if line.strip()]

In [75]:
def update_field(data, index, value):
    """Normalise one column of ``data`` in place.

    Every row's entry at position ``index`` is either replaced by ``value``
    (when it is a missing-data marker) or converted to ``float``.

    Parameters
    ----------
    data : list of list
        Table to modify; each row must be indexable at ``index``.
    index : int
        Column position to normalise.
    value : object
        Replacement stored wherever the entry is a missing-data marker
        (empty string, ``'-'`` or ``'***'``, ignoring surrounding whitespace).

    Returns
    -------
    None
        The rows of ``data`` are mutated in place.

    Raises
    ------
    ValueError
        If an entry is neither a missing-data marker nor convertible to float.
    """
    missing_markers = ('', '-', '***')
    # Operate on the table that was passed in, not on a notebook-level global.
    for row in data:
        cell = row[index]
        # Entries that were already converted to numbers have no .strip();
        # leaving them untouched here keeps the function safe to re-run.
        marker = cell.strip() if hasattr(cell, 'strip') else cell
        if marker in missing_markers:
            row[index] = value
        else:
            row[index] = float(cell)

In [76]:
# update fields
# Names of the columns that are kept for the model.
keeper_fields = [
    'WetBulb Temp',
    '% Relative Humidity',
    'WindSpeed (kt)',
    'WindDirection',
    'StationPressure',
    'PressureTendency',
]

# Clean each kept column in turn, passing 0 as the replacement value.
for field in keeper_fields:
    column = field_indexes[field]
    update_field(rows, column, 0)

In [77]:
# Column positions of the kept fields, in the same order as keeper_fields.
keeper_field_indexes = [field_indexes[name] for name in keeper_fields]

In [78]:
# delete bad fields
for row in rows:
    # Rebuild everything after the first column in place, keeping only the
    # entries whose original position is one of the keeper columns.
    row[1:] = [
        cell
        for pos, cell in enumerate(row[1:], 1)
        if pos in keeper_field_indexes
    ]

    # The first column is always discarded, whatever the keeper list says.
    del row[0]

# ML to get WetBulb Temperature

## Format Data

In [79]:
# The first remaining column of each cleaned row is the regression target;
# all other columns form the feature vector.
answers = np.array([cleaned[0] for cleaned in rows])
data = np.array([cleaned[1:] for cleaned in rows])

## Split Data

In [80]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    answers,
    test_size=0.2,
    random_state=42
)

## Machine Learning

In [81]:
# Linear regression estimator from scikit-learn, created with its default settings.
lin = linear_model.LinearRegression()

In [82]:
# Fit the regression on the training split; the fitted estimator's repr is this cell's output.
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [83]:
# Predict the held-out targets and report how much of their variance the model explains.
lin_pred = lin.predict(X_test)
# explained_variance_score is a regression metric (best possible value 1.0),
# not a classification accuracy, so the printed figure is labelled accordingly.
print("Explained variance: " + str(metrics.explained_variance_score(y_test, lin_pred) * 100) + "%")

Accuracy: 74.8853606202%


## Sample Prediction

``` Python
X_test[1] = [33.5, 45, 4, 0, 29.58] 
```
should yield 36.0 (the actual value of `y_test[1]`, printed below)

In [84]:
# Show the true target of the second held-out sample, then the model's estimate for it.
print(y_test[1])
print(lin.predict([X_test[1]]))

36.0
[ 33.5635409]


As can be seen, the prediction (about 33.6) is fairly close to the actual value (36.0) with this method. Not perfect, but in the right ballpark.