In [176]:
%matplotlib inline
import math
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns; sns.set(style='ticks', color_codes=True)

# from pandas.plotting import parallel_coordinates
from sklearn import preprocessing
from operator import itemgetter

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib

# pd.set_option('display.max_columns', 150)
pd.options.display.max_seq_items = 200

In [177]:
# Input and output CSV locations (relative paths).
TRAIN_DATA = 'data/train.csv'
TEST_DATA = 'data/test.csv'
CLEAN_TRAIN_DATA = 'data/clean_train_data.csv'

# Columns removed from the training frame before any further processing.
NOISE_COLUMNS = [
    'Key', 'No', 'District', 'City', 'Hour', 'Minute', 'Date', 'Month', 'Year',
    'JunctionControl', 'dayInWeek', 'IsDayOrNight', 'CollisionDirection', 'DetailCause',
    'Tricycle', 'Weather', 'AccidentCause',
]

# The test frame drops the same columns as the training frame, minus 'No'.
NOISE_TEST_COLUMNS = list(NOISE_COLUMNS)
NOISE_TEST_COLUMNS.remove('No')

# Numeric columns that are rescaled to the [0, 1] range.
NORMALIZE_COLUMNS = [
    'DriversKilled', 'DriversInjured', 'PassengerInjured', 'PassengerKilled', 'PedestrianKilled',
    'PedestrianInjured', 'NumPedestrianVictim', 'Bus', 'Car', 'Jeepney', 'FxTaxi', 'Van', 'Truck',
    'Train', 'UnknownVehicle',
]

# Categorical columns that are expanded into one-hot indicator columns.
ONE_HOT_COLUMNS = ['CollisionType', 'JunctionType']

In [178]:
#Normalization

def normalize_df(df_base, columns):
    """Return a copy of ``df_base`` with the given columns rescaled.

    Each column named in ``columns`` is replaced by the result of
    ``normalize_max_unknown`` applied to that column. ``df_base`` itself is
    left untouched because all work happens on a copy.

    Parameters:
        df_base: source DataFrame.
        columns: iterable of column labels to rescale.

    Returns:
        A new DataFrame with the listed columns rescaled.
    """
    normalized = df_base.copy()
    for name in columns:
        normalized[name] = normalize_max_unknown(normalized[name])

    return normalized

def normalize_max_unknown(vector):
    """Min-max scale ``vector`` into the [0, 1] range.

    Each element ``x`` becomes ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are taken from ``vector`` itself.

    Parameters:
        vector: sized iterable of numbers (list, numpy array, pandas Series).

    Returns:
        A list of floats with the same length as ``vector``. An empty input
        yields an empty list. When every element is equal (zero range) the
        result is all ``0.0`` instead of dividing by zero.
    """
    # np.min / np.max raise on empty input, so short-circuit first.
    if len(vector) == 0:
        return []

    # Local names deliberately avoid shadowing the builtins min() and max().
    lowest = float(np.min(vector))
    span = float(np.max(vector)) - lowest

    # A constant column has no spread; dividing would raise ZeroDivisionError
    # for Python numbers or yield NaN for numpy scalars.
    if span == 0:
        return [0.0 for _ in vector]

    return [(x - lowest) / span for x in vector]

In [179]:
#Data Visualization

def parallel_lines(df, target_category):
    """Draw a parallel-coordinates plot of ``df`` grouped by ``target_category``.

    A new 30x10 inch matplotlib figure is opened and the plot is drawn on it.
    The figure is only displayed, not saved to disk.

    Parameters:
        df: DataFrame to plot.
        target_category: name of the column holding the class labels.
    """
    # Imported locally: the module-level import of parallel_coordinates is
    # commented out, so calling this function used to raise NameError.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        # Older pandas releases expose the helper under pandas.tools.plotting.
        from pandas.tools.plotting import parallel_coordinates

    plt.pyplot.figure(figsize=(30, 10))

    parallel_coordinates(df, target_category)
    
def scatter_plot(df, target_category):
    """Build a seaborn pair plot of ``df`` and save it as a PNG.

    Points are coloured by the ``target_category`` column, and the figure is
    written to ``'<target_category>_scatter.png'``.

    Parameters:
        df: DataFrame to plot.
        target_category: column name used for the ``hue`` of the pair plot.
    """
    pair_grid = sns.pairplot(df, hue=target_category)
    output_name = '%s_scatter.png' % target_category
    pair_grid.savefig(output_name)

In [180]:
def generate_arr(dataset, classification):
    """Split ``dataset`` into a feature array and a label array.

    Parameters:
        dataset: DataFrame holding both features and the label column.
        classification: name of the label column.

    Returns:
        ``(dataset_arr, classification_arr)``: the values of every column
        except ``classification``, and the values of ``classification``.

    The input frame is not modified. The label column is excluded through
    ``drop`` (which returns a new frame) rather than deleted in place, so the
    function can be called repeatedly on the same frame.
    """
    classification_arr = dataset[classification].values
    dataset_arr = dataset.drop(classification, axis=1).values

    return dataset_arr, classification_arr

def impute_columns(df_base, columns):
    """Return a copy of ``df_base`` with missing values replaced by 0.0.

    NOTE: ``columns`` is accepted for interface compatibility but does not
    restrict the fill; every column of the frame is filled, not only the
    ones listed.

    Parameters:
        df_base: source DataFrame (left unmodified).
        columns: currently unused, see the note above.

    Returns:
        A new DataFrame in which every missing value is 0.0.
    """
    # Frame-level fillna returns a new object, so the input stays untouched.
    return df_base.fillna(0.0)

In [181]:
# Load the training CSV and discard the columns listed in NOISE_COLUMNS.
mmda_df = pd.read_csv(TRAIN_DATA).drop(NOISE_COLUMNS, axis=1)

# Load the test CSV and discard its own list of unused columns.
mmda_df_test = pd.read_csv(TEST_DATA).drop(NOISE_TEST_COLUMNS, axis=1)

In [182]:
# Number of training rows; used below to split the combined frame back apart.
mmda_df_len = len(mmda_df.index)

# Stack the training rows on top of the test rows. pd.concat is used instead
# of DataFrame.append, which is deprecated and no longer exists in pandas 2.x.
mmda_df_combined = pd.concat([mmda_df, mmda_df_test])

# One-hot encode on the combined frame so both splits end up with the same
# set of indicator columns.
mmda_df_one_hot = pd.get_dummies(mmda_df_combined, prefix=ONE_HOT_COLUMNS, columns=ONE_HOT_COLUMNS)

# Split back by position: the first mmda_df_len rows are the training rows.
# NOTE: this rebinds mmda_df / mmda_df_test, so the cell cannot be re-run
# without first re-running the cell that loads the CSV files.
mmda_df = mmda_df_one_hot.iloc[:mmda_df_len, :]
mmda_df_test = mmda_df_one_hot.iloc[mmda_df_len:, :]

In [183]:
# Clean the training frame: zero-fill missing values, then rescale the
# NORMALIZE_COLUMNS. (An sklearn Imputer with strategy='most_frequent' was
# tried here previously and is not used.)
mmda_df_clean = normalize_df(
    impute_columns(mmda_df, NORMALIZE_COLUMNS),
    NORMALIZE_COLUMNS,
)

# Persist the cleaned training data (the frame index is written as well).
mmda_df_clean.to_csv(CLEAN_TRAIN_DATA)

# Optional exploration, kept disabled:
# mmda_df_clean = pd.read_csv(CLEAN_TRAIN_DATA)
# corr_matrix = mmda_df_clean.corr()
# corr_matrix['Classification_1'].sort_values(ascending=False)

In [184]:
# Feature matrix: every cleaned training column except the label.
mmda_arr = mmda_df_clean.drop('Classification', axis=1).values

# Label vector taken from the 'Classification' column.
classification_arr = mmda_df_clean['Classification'].values

# Disabled alternative that binarized the labels:
# labels = [[x] for x in classification_arr]
# binary_labels = MultiLabelBinarizer().fit_transform(labels)

In [185]:
# Fit a Gaussian naive Bayes classifier on the training arrays (fit returns
# the estimator itself), then persist the fitted model to disk.
clf = GaussianNB().fit(mmda_arr, classification_arr)
joblib.dump(clf, 'mmda_predictor.pkl')

['mmda_predictor.pkl']

In [187]:
# Prepare the test rows for prediction: zero-fill missing values first.
# (An sklearn Imputer with strategy='most_frequent' was tried here previously
# and is not used.)
mmda_df_clean_test = impute_columns(mmda_df_test, NORMALIZE_COLUMNS)

# Rescale with the minimum and range of the *training* rows. The classifier
# was fitted on features scaled with training statistics; scaling the test
# rows with their own min/max would put the same raw value on a different
# scale than the one the model learned.
_train_imputed = impute_columns(mmda_df, NORMALIZE_COLUMNS)
for _column in NORMALIZE_COLUMNS:
    _train_min = float(_train_imputed[_column].min())
    _train_span = float(_train_imputed[_column].max()) - _train_min
    if _train_span == 0:
        # Column is constant in training: only shift, never divide by zero.
        _train_span = 1.0
    mmda_df_clean_test[_column] = (mmda_df_clean_test[_column] - _train_min) / _train_span

# Feature matrix for prediction: every column except the label column.
mmda_test_arr = mmda_df_clean_test.drop('Classification', axis=1).values

In [204]:
# Predict a class for every prepared test row.
pred = clf.predict(mmda_test_arr)

mmda_arr_len = len(mmda_arr)
mmda_test_arr_len = len(mmda_test_arr)

# Keys are generated, not read from the test file: they continue counting
# right after the number of training rows.
# NOTE(review): assumes the test 'Key' values (dropped on load) follow this
# numbering - confirm against the raw test CSV.
key_arr = list(range(mmda_arr_len + 1, mmda_arr_len + mmda_test_arr_len + 1))

# NOTE: this rebinds classification_arr, which an earlier cell uses for the
# training labels; re-run that cell before refitting the model.
classification_arr = [int(x) for x in pred]

# Sanity checks. Single-argument print(...) prints the same on Python 2 and 3.
print(key_arr[0])
print(key_arr[-1])
print(len(key_arr))
print(len(classification_arr))

# Column order of the output file.
index = ['Key', 'Classification']
d = {'Key': key_arr, 'Classification': classification_arr}

pred_df = pd.DataFrame(d)
pred_df.to_csv('mmda_prediction.csv', index=False, header=True, columns=index)

940864
957348
16485
16485
