# Sklearn classifier

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

## Prepare

### Prepare data

- Country data:

    Merge the `data/weo.csv` and `data/bli.csv` files to produce a data table like the following:

    <table width="80%" border="1" align="left" cellpadding="0" cellspacing="0">
        <thead>
            <tr>
                <td>Country</td>
                <td>GDP per capita</td>
                <td>Life satisfaction</td>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Australia</td>
                <td>50961.87</td>
                <td>7.3</td>
            </tr>
            <tr>
                <td>...</td>
                <td>...</td>
                <td>...</td>
            </tr>
        </tbody>
    </table>

In [None]:
import pandas as pd
import numpy as np

from os import curdir, path
from sklearn.datasets import load_iris

# --- Country data -----------------------------------------------------------
# Both CSV inputs are expected in a "data" folder under the current directory.
data_dir = path.abspath(path.join(curdir, 'data'))

oecd_bli_dir = path.join(data_dir, 'bli.csv')
gdp_per_capita_dir = path.join(data_dir, 'weo.csv')

# Parse "1,234"-style numbers as numeric and treat the literal "n/a" as missing.
oecd_bli = pd.read_csv(oecd_bli_dir, thousands=',', na_values="n/a")
gdp_per_capita = pd.read_csv(gdp_per_capita_dir, thousands=',', na_values="n/a")
print('* load "bli.csv" and "weo.csv"')

# Keep only the rows whose INEQUALITY value is 'TOT'.
oecd_bli = oecd_bli.loc[oecd_bli['INEQUALITY'].eq('TOT')]
print('* filter "bli.csv" with "INEQUALITY" colume is "TOT"')

# Reshape to one row per Country and one column per Indicator.
oecd_bli = oecd_bli.pivot(index='Country', columns='Indicator', values='Value')
print('* make pivo-table based on "bli.csv" data, every "Country", each "Indicator" as colume')

# The '2015' column of weo.csv is used as the GDP-per-capita figure.
gdp_per_capita = gdp_per_capita.rename(columns={'2015': 'GDP per capita'})
print('* rename "2015" field to "GDP per capita" in "weo.csv" ')

# Index by Country so both tables can be joined on their indexes.
gdp_per_capita = gdp_per_capita.set_index("Country")
print('* set "Country" field as index in "weo.csv" ')

# Join the two tables on the Country index (merge defaults to an inner join).
full_country_stats = oecd_bli.merge(gdp_per_capita, left_index=True, right_index=True)
print('* merge two data tables')

# Only these two columns are needed by the model below.
country_stats = full_country_stats.loc[:, ['GDP per capita', 'Life satisfaction']]
print('* keep "GDP per capita" and "Life satisfaction" columns')

# Disabled: optionally drop a fixed set of rows by position.
# remove_indices = [0, 1, 6, 8, 33, 34, 35]
# keep_indices = list(set(range(36)) - set(remove_indices))
# country_stats = country_stats.iloc[keep_indices]

print('* country data load completed')

# --- Iris data --------------------------------------------------------------
# Feature matrix, integer class labels and the class names they index into.
iris_dataset = load_iris()
iris_data, iris_labels, iris_classes = (
    iris_dataset.data,
    iris_dataset.target,
    iris_dataset.target_names,
)

print('* iris data loaded')

## Linear regression

In [None]:
from sklearn import linear_model

# Preview the first rows of the table the model is trained on.
print(country_stats.head(20))

# scikit-learn expects 2-D inputs, so turn each column from shape (n,) into
# shape (n, 1).
gdp_per_capita = country_stats['GDP per capita'].values.reshape(-1, 1)
life_satisfaction = country_stats['Life satisfaction'].values.reshape(-1, 1)

# Fit life satisfaction as a linear function of GDP per capita
# (`fit` returns the estimator itself).
clf = linear_model.LinearRegression().fit(gdp_per_capita, life_satisfaction)

# Predict for a single GDP value; the result has shape (1, 1).
new_gdp = 22587
new_life_satisfaction = clf.predict([[new_gdp]])
print('* GDP "{}" may got life satisfaction {:.2f}'.format(new_gdp, new_life_satisfaction[0, 0]))

## Linear SVC classifier

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

import random as rdm

# Hold out 10% of the iris samples for evaluation; the fixed random_state keeps
# the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(iris_data, iris_labels, test_size=0.10, random_state=2)

# Alternative estimator, kept for reference:
# clf = svm.LinearSVC(multi_class='ovr', random_state=0, C=1, max_iter=2500)
# NOTE(review): probability=True is not used in this cell; keep it only if
# predict_proba is needed elsewhere -- confirm.
clf = svm.SVC(kernel='linear', probability=True, random_state=0, C=1, max_iter=2500)
clf.fit(x_train, y_train)

# Accuracy on the held-out set: 1 where the prediction matches the label, else 0.
results = clf.predict(x_test)
correct_prediction = (y_test == results).astype(int)
print('* correct prediction is: {:.2f}'.format(np.mean(correct_prediction)))

# Spot-check a single held-out sample. A dedicated, seeded generator is used
# instead of the module-level `random` state so the chosen sample -- and
# therefore the printed output -- is the same on every fresh run.
sample_rng = rdm.Random(2)
random_feature = sample_rng.choice(list(zip(x_test, y_test)))
actual_label = clf.predict([random_feature[0]])
print('* expected label is "{}" and actual_label is "{}"'.format(random_feature[1], actual_label[0]))
print('* class name is "{}"'.format(iris_classes[actual_label[0]]))

## KNN Classifier

In [None]:
from sklearn import neighbors as nb
from sklearn.model_selection import train_test_split

import random as rdm

# Hold out 10% of the iris samples for evaluation; the fixed random_state keeps
# the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(iris_data, iris_labels, test_size=0.10, random_state=2)

# 1-nearest-neighbour classifier using a KD-tree index and the Minkowski metric.
clf = nb.KNeighborsClassifier(n_neighbors=1, algorithm='kd_tree', metric='minkowski')
clf.fit(x_train, y_train)

# Accuracy on the held-out set: 1 where the prediction matches the label, else 0.
results = clf.predict(x_test)
correct_prediction = (y_test == results).astype(int)
print('* correct prediction is: {:.2f}'.format(np.mean(correct_prediction)))

# Spot-check a single held-out sample. A dedicated, seeded generator is used
# instead of the module-level `random` state so the chosen sample -- and
# therefore the printed output -- is the same on every fresh run.
sample_rng = rdm.Random(2)
random_feature = sample_rng.choice(list(zip(x_test, y_test)))
actual_label = clf.predict([random_feature[0]])
print('* expected label is "{}" and actual_label is "{}"'.format(random_feature[1], actual_label[0]))
print('* class name is "{}"'.format(iris_classes[actual_label[0]]))