In [9]:
import numpy as np
import pandas as pd
# GridSearchCV is imported from sklearn.model_selection: the old
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20, and
# this notebook already relies on model_selection for train_test_split.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Load the two CSV files from the notebook's working directory.
train = pd.read_csv("train_set.csv")
test = pd.read_csv("test_set.csv")

In [10]:
# Summarize the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 19 columns):
amount_tsh               59400 non-null float64
days_since_recorded      59400 non-null int64
funder                   59400 non-null object
installer                59400 non-null object
basin                    59400 non-null object
population               59400 non-null int64
public_meeting           59400 non-null object
scheme_management        59400 non-null object
permit                   59400 non-null object
construction_year        59400 non-null object
extraction_type_class    59400 non-null object
payment                  59400 non-null object
water_quality            59400 non-null object
quantity                 59400 non-null object
source                   59400 non-null object
source_class             59400 non-null object
waterpoint_type          59400 non-null object
waterpoint_type_group    59400 non-null object
status_group             59400 non-null object

In [11]:
# Columns to expand into one-hot indicator columns (all are object dtype in
# the raw training frame).
dummy_cols = [
    'funder', 'installer', 'basin', 'public_meeting', 'scheme_management',
    'permit', 'construction_year', 'extraction_type_class', 'payment',
    'water_quality', 'quantity', 'source', 'source_class', 'waterpoint_type',
    'waterpoint_type_group',
]

# Fixed seed so a fresh Restart & Run All reproduces the same row order.
SHUFFLE_SEED = 42

train = pd.get_dummies(train, columns=dummy_cols)
# frac=1 draws every row once, i.e. a full shuffle; the index is then rebuilt
# as 0..n-1 so it no longer carries the pre-shuffle positions.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [13]:
# (rows, columns) of the training frame after one-hot encoding.
train.shape

(59400, 94)

In [14]:
# One-hot encode the test frame using the same column list as the training frame.
test = pd.get_dummies(test, columns=dummy_cols)

# get_dummies only creates indicator columns for the category values present
# in the frame it is given, so encoding train and test separately can produce
# different column sets. Align test to the (already encoded) training columns:
# indicators missing from test are added as all-zero, indicators unseen in
# train are dropped, and the column order matches train. The label column is
# kept only if the test frame actually has one.
aligned_cols = [col for col in train.columns
                if col != 'status_group' or col in test.columns]
test = test.reindex(columns=aligned_cols, fill_value=0)

In [15]:
# (rows, columns) of the one-hot encoded test frame.
test.shape

(59400, 94)

In [24]:
# Preview the first two rows of the encoded test frame.
test.head(2)

Unnamed: 0,amount_tsh,days_since_recorded,population,status_group,funder_danida,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,installer_commu,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,6000.0,995,109,functional,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.0,272,280,functional,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [25]:
# Fixed seed so the same rows land in the train/validation parts on every run.
SPLIT_SEED = 42

# Separate the label column from the predictor columns.
target = train.status_group
features = train.drop('status_group', axis=1)

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, train_size=0.8, random_state=SPLIT_SEED)

In [28]:
def model(X_train, X_val, y_train, y_val, test):
    """Grid-search a standardized LinearSVC and report its validation score.

    Builds a ``StandardScaler`` -> ``LinearSVC`` pipeline, searches over the
    regularization strength ``C`` and ``class_weight`` with ``GridSearchCV``
    (all available cores), then prints the score on the validation data and
    the best parameter combination.

    Parameters
    ----------
    X_train, y_train
        Features and labels the grid search is fitted on.
    X_val, y_val
        Held-out features and labels, used only for the final score.
    test
        Not used by this function; accepted so existing calls keep working.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so the caller can reuse it (e.g. to
    predict). Earlier versions returned nothing.
    """
    # NOTE: the body used to sit under ``if __name__ == '__main__':``. Inside
    # a function that guard only made the call a silent no-op whenever this
    # code ran from anything other than the main module, so it was removed.
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', LinearSVC())])

    # The ``clf__`` prefix routes each parameter to the pipeline's 'clf' step.
    param_grid = {'clf__C': [0.001, 0.01, 0.1, 1.0],
                  'clf__class_weight': [None, 'balanced']}

    estimator = GridSearchCV(estimator=pipe_svc,
                             param_grid=param_grid,
                             n_jobs=-1)
    estimator.fit(X_train, y_train)

    validation_accuracy = estimator.score(X_val, y_val)
    print('Validation accuracy: ', validation_accuracy)
    print(estimator.best_params_)

    return estimator

In [None]:
# Run the grid search on the train/validation split; prints the validation
# accuracy and the best parameter combination.
model(X_train, X_val, y_train, y_val, test)