In [1]:
import pandas as pd
import numpy as np
import re, glob
from skimage import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.image import extract_patches_2d

In [2]:
csv_path =r'/home/garfield/Desktop/cmpt318/project/yvr-weather'
image_path =r'/home/garfield/Desktop/cmpt318/project/katkam-scaled'
reg_pattern = 'katkam\W([\d]+)'

def printValueCountsInEachColumn(df):
    headers = list(df)
    for header in headers:
        print(header + "\n")
        print(str(df[header].value_counts().index) + "\n\n")
        

def extract_date(path):
    match_reg_pattern = re.search(reg_pattern, path)
    if match_reg_pattern:
        return match_reg_pattern.group(1)

    
def extract_filename(path):
    match_reg_pattern = re.search(reg_pattern + '.jpg', path)
    if match_reg_pattern:
        return match_reg_pattern.group(0)


In [3]:
# The following two functions are adapted from 
# http://blog.yhat.com/posts/image-processing-with-scikit-image.html
def display_image(images_rgb):
#    plt.clf()
    plt.figure()
    
    i = 0
    for image in images_rgb:
        plt.subplot(1, len(images_rgb), i + 1)
        plt.axis('off')
        plt.imshow(image)
        i = i + 1
    
    plt.show()

    
def separate_image_layers(image_rgb):
    image_r, image_g, image_b = image_rgb.copy(), image_rgb.copy(), image_rgb.copy()
    # switch off other color layers to show isolated r, g, b layers
    image_r[:,:,(1,2)] = 0
    image_g[:,:,(0,2)] = 0
    image_b[:,:,(0,1)] = 0
    
    return image_r, image_g, image_b

In [4]:
def add_image(imagepath):
    image = io.imread(imagepath)
    image_r, image_g, _ = separate_image_layers(image)
    average = np.average([image_r, image_g])
    return average

In [5]:
# this function divides the image to random patches and takes the average value 
def add_image2(imagepath):
    image = io.imread(imagepath)
    patches = extract_patches_2d(image, (24, 32), max_patches=8, random_state=np.random.RandomState(0))
    average = np.average(patches)
    return average

In [6]:
# this crops out the sky and takes average
def add_image3(imagepath):
    image = io.imread(imagepath)
    sky = image[:48,:256,:]
    average = np.average(sky)
    return average

In [7]:
# this crops out the sky and takes average
def add_image4(imagepath):
    image = io.imread(imagepath)
    sky = image[:96,:256,:]
    patches = extract_patches_2d(sky, (12, 16), 
        max_patches=8, random_state=np.random.RandomState(0))
    road = image[150:,:50,:]
    trees = image[144:175,210:,:]
    sea = image[125:,100:200,:]
#    bottom = image[96:, 128:, :]
    
    average1 = np.average(patches)
    average2 = np.average(trees)
    average3 = np.average(road)
    average4 = np.average(sea)
    total = average1 + average2 + average3 + average4
    return total

In [8]:
# remove certainty items in weather column
def clean_description(string):    
    string = string.replace('Drizzle', 'Rain')
    
    remove_words = ['Heavy','Moderate','Mostly','Mainly',
        'Showers','Pellets','Fog','Freezing','nan']
    
    for i in range(2):
        if i == 1:
            words = string.replace(' ', ',').split(sep=',')

        else:
            words = string.split(sep=',')
            
    
        for word in words:        
            for r in remove_words:
                if word == r:
                    words.remove(r)

        if len(words) > 1 and (words[0] == words[1]):
            words.remove(words[1])
        string = ','.join(words)
    
    if string == "" or string =='Fog':
        return None
    else:
        return string

In [9]:
csv_files = glob.glob(csv_path + '/*.csv')
dataframes = []

for csv_file in csv_files:
    table = pd.read_csv(csv_file, sep=',', 
                        skiprows=16, parse_dates=[0])
    dataframes.append(table)
df = pd.concat(dataframes)
#printValueCountsInEachColumn(df)
#df

In [10]:
image_files = glob.glob(image_path + '/*.jpg')
image_df = pd.DataFrame({'path' : image_files})
image_df['filename'] = image_df['path'].apply(extract_filename)

image_df['Date/Time'] = pd.to_datetime(
    image_df['path'].apply(extract_date),
    infer_datetime_format=True
)

image_df['image'] = image_df['path'].apply(add_image3)

#image_df

In [11]:
# clean data
df.dropna(axis=1, how='all', inplace=True)
df = df.select(lambda x: not re.search('Quality|Chill|Hmdx', x), axis=1)
df = image_df[['Date/Time', 'image']].merge(df, how='left', on='Date/Time')
df.drop(labels=['Date/Time'], axis=1, inplace=True)
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour # keep hour only

# change column names
df.columns = ['image', 'year', 'month', 'day', 'hour',
              'temp', 'dew_temp', 'rel_hum', 'wind_dir',
              'wind_speed', 'visibility', 'pressure', 'weather']

# cast weather descriptions as strings
df['weather'] = df['weather'].astype('str') 

In [12]:
df['weather'] = df['weather'].apply(clean_description)

In [13]:
weather_described = df.dropna(axis=0, how='any') #select rows without null values

In [14]:
# create machine learning sets
X = weather_described[['image', 'month', 'hour', 'temp',
                       'dew_temp','rel_hum','wind_dir',
                       'wind_speed','visibility','pressure']]
y = weather_described['weather']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
bayes = make_pipeline(
    StandardScaler(),
    GaussianNB()
)

bayes.fit(X_train, y_train)
print(bayes.score(X_test,y_test))

0.709677419355


In [16]:
svc = make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', C=5)
    )
svc.fit(X_train, y_train)
print(svc.score(X_test,y_test))

0.752688172043


In [20]:
k_neighbour = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5)
)

k_neighbour.fit(X_train, y_train)
k_neighbour.score(X_test,y_test)

0.75268817204301075