In [1]:
import pandas as pd
import numpy as np
import re, glob
from skimage import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
csv_path =r'/home/garfield/Desktop/cmpt318/project/yvr-weather'
image_path =r'/home/garfield/Desktop/cmpt318/project/katkam-scaled'
reg_pattern = 'katkam\W([\d]+)'

def printValueCountsInEachColumn(df):
    headers = list(df)
    for header in headers:
        print(header + "\n")
        print(str(df[header].value_counts().index) + "\n\n")
        

def extract_date(path):
    match_reg_pattern = re.search(reg_pattern, path)
    if match_reg_pattern:
        return match_reg_pattern.group(1)

    
def extract_filename(path):
    match_reg_pattern = re.search(reg_pattern + '.jpg', path)
    if match_reg_pattern:
        return match_reg_pattern.group(0)


In [3]:
def add_image(imagepath):
    image = io.imread(imagepath)
    #titles = "red layer", "green layer", "blue layer"
    image_r, image_g, _ = separate_image_layers(image)
    #display_image(image_rgb_layers,titles)
    average = np.average([image_r, image_g])
    #return image_r
    return average

In [4]:
# The following two functions are adapted from 
# http://blog.yhat.com/posts/image-processing-with-scikit-image.html
def display_image(images_rgb, titles):
#    plt.clf()
    plt.figure()
    
    i = 0
    for image, title in zip(images_rgb, titles):
        plt.subplot(1, len(images_rgb), i + 1)
        plt.title(title)
        plt.axis('off')
        plt.imshow(image)
        i = i + 1
    
    plt.show()

    
def separate_image_layers(image_rgb):
    image_r, image_g, image_b = image_rgb.copy(), image_rgb.copy(), image_rgb.copy()
    # switch off other color layers to show isolated r, g, b layers
    image_r[:,:,(1,2)] = 0
    image_g[:,:,(0,2)] = 0
    image_b[:,:,(0,1)] = 0
    
    return image_r, image_g, image_b

In [5]:
csv_files = glob.glob(csv_path + '/*.csv')
dataframes = []

for csv_file in csv_files:
    table = pd.read_csv(csv_file, sep=',', 
                        skiprows=16, parse_dates=[0])
    dataframes.append(table)
df = pd.concat(dataframes)
#printValueCountsInEachColumn(df)
#df

In [6]:
image_files = glob.glob(image_path + '/*.jpg')
image_df = pd.DataFrame({'path' : image_files})
image_df['filename'] = image_df['path'].apply(extract_filename)

image_df['Date/Time'] = pd.to_datetime(
    image_df['path'].apply(extract_date),
    infer_datetime_format=True
)

#image_df

In [7]:
df.dropna(axis=1, how='all', inplace=True)

In [8]:
df = df.select(lambda x: not re.search('Quality|^(Year|Month|Day|Time)|Chill|Hmdx', x), axis=1)

In [9]:
# clean data in dataframe
def cleanData(df):
    # change column names
    df.columns = ['path', 'filename', 'date', 'temp', 'dew_temp', 
                  'rel_hum', 'wind_dir','wind_speed',
                  'visibility','pressure','weather']
    
    # cast weather descriptions as strings
    df['weather'] = df['weather'].astype('str') 
    
    return df

In [10]:
df = image_df.merge(df, how='left', on='Date/Time')
df = cleanData(df)

In [11]:
# remove certainty items in weather column
def clean_description(string):    
    string = string.replace('Drizzle', 'Rain')
    
    remove_words = ['Heavy','Moderate','Mostly','Mainly',
        'Showers','Pellets','Fog','Freezing','nan']
    
    for i in range(2):
        if i == 1:
            words = string.replace(' ', ',').split(sep=',')

        else:
            words = string.split(sep=',')
            
    
        for word in words:        
            for r in remove_words:
                if word == r:
                    words.remove(r)

        if len(words) > 1 and (words[0] == words[1]):
            words.remove(words[1])
        string = ','.join(words)
    
    if string == "" or string =='Fog':
        return None
    else:
        return string

In [12]:
#print(clean_description(df['weather'].iloc[0]))
#print(len(clean_description(df['weather'].iloc[5039])))

#df['weather'].iloc[5039]
#df['weather']
df['weather'] = df['weather'].apply(clean_description)
#df['weather'].values

In [13]:
#pd.isnull(df).sum() > 0

In [14]:
df
#df['weather'].value_counts()

Unnamed: 0,path,filename,date,temp,dew_temp,rel_hum,wind_dir,wind_speed,visibility,pressure,weather
0,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170411140000.jpg,2017-04-11 14:00:00,10.1,6.1,76.0,29.0,30.0,48.3,101.39,
1,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20160608210000.jpg,2016-06-08 21:00:00,12.9,7.2,68.0,10.0,16.0,32.2,101.17,
2,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170218110000.jpg,2017-02-18 11:00:00,5.8,3.7,86.0,28.0,11.0,24.1,99.64,
3,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170530180000.jpg,2017-05-30 18:00:00,12.6,10.7,88.0,11.0,21.0,24.1,101.41,
4,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170327060000.jpg,2017-03-27 06:00:00,6.8,6.1,95.0,10.0,12.0,19.3,101.32,Rain
5,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20160611120000.jpg,2016-06-11 12:00:00,15.4,8.2,62.0,17.0,4.0,24.1,102.17,
6,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170603170000.jpg,2017-06-03 17:00:00,16.1,8.6,61.0,19.0,22.0,48.3,101.61,
7,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170113160000.jpg,2017-01-13 16:00:00,0.4,-4.4,70.0,20.0,3.0,48.3,103.14,Cloudy
8,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20170323170000.jpg,2017-03-23 17:00:00,10.9,2.3,55.0,11.0,18.0,48.3,101.23,Rain
9,/home/garfield/Desktop/cmpt318/project/katkam-...,katkam-20161121080000.jpg,2016-11-21 08:00:00,8.8,6.6,86.0,8.0,10.0,24.1,101.11,


In [15]:
#select rows without null values
weather_described = df.dropna(axis=0, how='any')

In [16]:
weather_described['image'] = weather_described['path'].apply(add_image)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
#weather_described['date'].dt.time.value_counts().index
#weather_described['date'].dt.month.value_counts().index


In [18]:
X = weather_described.drop(labels=['filename', 'path', 'weather'], axis=1)

#s = add_image(X['path'].iloc[10])
#print("The height of s is " + str(len(s)))
#print("The width of s is " + str(len(s[0])))

#print(s[0])

In [19]:
X.drop(['date'], axis=1, inplace=True)

y = weather_described['weather']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
bayes = make_pipeline(
    StandardScaler(),
    GaussianNB()
)

bayes.fit(X_train, y_train)
print(bayes.score(X_test,y_test))

0.707885304659


In [22]:
svc = make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', C=10)
    )
svc.fit(X_train, y_train)
print(svc.score(X_test,y_test))

0.736559139785


In [23]:
k_neighbour = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5)
)

k_neighbour.fit(X_train, y_train)
k_neighbour.score(X_test,y_test)

0.69892473118279574