# Classification

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,age,workclass,education,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,under50k
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,under50k
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,under50k
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,under50k
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,under50k


## 독립변수

In [3]:
train.columns

Index(['age', 'workclass', 'education', 'marital', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

In [4]:
indep = train.columns[:-1]
indep

Index(['age', 'workclass', 'education', 'marital', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

## 형에 따른 구분 

In [5]:
discrete = []
continuous = []
for v in indep:
    if train[v].dtype == 'object':
        discrete.append(v)
    else:
        continuous.append(v)

In [6]:
discrete

['workclass', 'marital', 'occupation', 'relationship', 'race', 'sex']

In [7]:
continuous

['age', 'education', 'capital_gain', 'capital_loss', 'hours_per_week']

In [8]:
dummy = pd.get_dummies(train[discrete]) # 명목형 변수 Dummy 화 
X = pd.concat([train[continuous], dummy], axis=1)

In [9]:
X.head()

Unnamed: 0,age,education,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
0,39,13,2174,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,50,13,0,0,13,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,38,9,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,53,7,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,28,13,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## 종속변수

In [10]:
dep = train.columns[-1]
y = train[dep]
y.head()

0    under50k
1    under50k
2    under50k
3    under50k
4    under50k
Name: income, dtype: object

## 데이터 분할

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [13]:
from sklearn import metrics

    Lasso 를 기준. 최소한 이것보단 높게.
    [[ 376  253]
     [ 125 1746]]
    accuracy: 0.8488
    precision: 0.750499001996
    recall: 0.597774244833
    f1: 0.665486725664

In [14]:
def getResult(y_test,y_pred):
    return metrics.f1_score(y_test, y_pred, pos_label='over50k')

## Logistic Regression : ridge

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
def logi(x,y): # x 패널티, y C값.
    LR = LogisticRegression(penalty=x,C=y)
    LR.fit(X_train,y_train)
    y_ridge = LR.predict(X_test)
    return LR,getResult(y_test,y_ridge)

In [20]:
result = []

In [21]:
penal = ['l1','l2']
for i in range(10,100,5):
    for j in penal:
        result.append(logi(j,i))

In [27]:
pd.DataFrame(result).sort_values(1).head(3)

Unnamed: 0,0,1
31,"LogisticRegression(C=85, class_weight=None, du...",0.658385
13,"LogisticRegression(C=40, class_weight=None, du...",0.658407
33,"LogisticRegression(C=90, class_weight=None, du...",0.659574


## Decision Tree
### Random Forest (랜덤포레스트)

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
def randomLoop(x):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train,y_train)
    y_rf = rf.predict(X_test)
    
    return rf,getResult(y_test,y_rf)

In [30]:
for i in range(100,1000,100):
    result.append(randomLoop(i))

In [36]:
pd.DataFrame(result).sort_values(1,ascending=False).head(3)

Unnamed: 0,0,1
38,"(DecisionTreeClassifier(class_weight=None, cri...",0.672269
36,"(DecisionTreeClassifier(class_weight=None, cri...",0.671756
42,"(DecisionTreeClassifier(class_weight=None, cri...",0.671164


## Gradient Boosting Tree

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
def gbt(x):
    gb = GradientBoostingClassifier(n_estimators=x)
    gb.fit(X_train,y_train)
    y_gb = gb.predict(X_test)
    return gb,getResult(y_test,y_gb)

In [39]:
for i in range(500,2000,100):
    result.append(gbt(i))

In [40]:
pd.DataFrame(result).sort_values(1,ascending=False).head(3)

Unnamed: 0,0,1
54,([DecisionTreeRegressor(criterion='friedman_ms...,0.719658
55,([DecisionTreeRegressor(criterion='friedman_ms...,0.719523
48,([DecisionTreeRegressor(criterion='friedman_ms...,0.719044


## SVM

In [41]:
from sklearn.svm import SVC

In [42]:
def run_model(kernel,penalty,cache):
    model = SVC(kernel=kernel, C=penalty,cache_size=cache)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model,getResult(y_test,y_pred)

In [43]:
for i in range(1,10):
    result.append(run_model('rbf',i,1000))

In [44]:
pd.DataFrame(result).sort_values(1,ascending=False).head(3)

Unnamed: 0,0,1
54,([DecisionTreeRegressor(criterion='friedman_ms...,0.719658
55,([DecisionTreeRegressor(criterion='friedman_ms...,0.719523
48,([DecisionTreeRegressor(criterion='friedman_ms...,0.719044


In [46]:
pd.DataFrame(result).sort_values(1,ascending=False)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [53]:
df = pd.DataFrame(result).sort_values(1,ascending=False)

In [54]:
df.head(3)

Unnamed: 0,0,1
54,([DecisionTreeRegressor(criterion='friedman_ms...,0.719658
55,([DecisionTreeRegressor(criterion='friedman_ms...,0.719523
48,([DecisionTreeRegressor(criterion='friedman_ms...,0.719044


## Best prediction 

In [55]:
df.ix[54,0] 

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1400, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [56]:
df.ix[54,1]

0.71965811965811965