In [69]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score,classification_report,average_precision_score
from sklearn.preprocessing import Normalizer,MinMaxScaler,StandardScaler,normalize
from sklearn.cross_validation import train_test_split

In [2]:
pd.set_option('max_columns',1000)

In [3]:
#############################################
# PANDAS HELPERS
#############################################

def remove_column_from_data_frame(col_to_remove, data_frame):

    if col_to_remove in list(data_frame.columns):
        data_frame.drop(col_to_remove, axis=1, inplace=True)

        
def remove_columns_from_data_frame(cols_to_remove, data_frame):

    column_dict = {x: None for x in list(data_frame.columns)}

    cols_to_remove = [x for x in cols_to_remove if x in column_dict]

    data_frame.drop(labels=cols_to_remove, axis=1, inplace=True)
    

def remove_columns_like(column_pattern, data_frame):
    
    for column in list(data_frame.columns):
        if column_pattern in column:
            data_frame.drop(column, axis=1, inplace=True)


def fill_nas(value, data_frame):
    
    data_frame.fillna(0, inplace=True)

In [47]:
#############################################
# DATA RETRIEVAL HELPERS
#############################################

def get_data(n_rows=None):

    if n_rows is not None:
        df = pd.get_dummies(pd.read_csv('final_feats_without_dummies.csv', low_memory=False, nrows=n_rows))
        df_y = pd.get_dummies(pd.read_csv('final_outs.csv', low_memory=False, nrows=n_rows))
    else:
        df = pd.get_dummies(pd.read_csv('final_feats_without_dummies.csv', low_memory=False))
        df_y = pd.get_dummies(pd.read_csv('final_outs.csv', low_memory=False))
    
    
    # Drop labels and a redundant column
    remove_columns_from_data_frame(['Unnamed: 0', 'Unnamed: 0.1' 'dissent', 'dissentdummy'], df)
    
    # Extras -- for analysis
    # CASE 1: REMOVE TOP 2
    # CASE 2: REMOVE ALL 'DISS'
    
#     remove_columns_from_data_frame(['type', 'turnonthresh'], df)
#     remove_columns_from_data_frame(['type1', 'last3'], df)
#     remove_columns_like('diss', df)
    
    return df, df_y


def get_x_y(n_rows=None):
    
    df, df_y = get_data(n_rows)

    fill_nas(0, df)
    
    return df.values, df_y.ix[:,1].values


def get_columns():
    
    df = pd.get_dummies(pd.read_csv('final_feats_without_dummies.csv', low_memory=False, nrows=2))
    return list(df.columns)


def print_report(y, y_pred):

    print classification_report(y, y_pred)
    


In [96]:
#############################################
# MODEL HELPERS
#############################################

def grid_search(X, y, clf, param_grid):
    
#     param_dict={'average': 'weighted'}
    scorer = make_scorer(average_precision_score)


    gridclf = GridSearchCV(clf, paramgrid, scoring=scorer, cv=3, verbose=1)

    gridclf.fit(X, y)

    print gridclf.best_params_
    print gridclf.best_estimator_

    print_report(y_test, gridclf.predict(X_test))
    

def get_top_n(n, arr, col_names, prev_list=[]):
    
    if n <= 0:
        return []
    
    most_imp = -1
    most_imp_index = -1

    for i in range(len(arr)):

        if i in prev_list:
            continue

        if arr[i] > most_imp:
            most_imp = arr[i]
            most_imp_index = i

    prev_list.append(most_imp_index)

    return [ (col_names[most_imp_index], most_imp) ] + get_top_n(n - 1, arr, col_names, prev_list)

In [77]:
#############################################
# Read data into X and y
#############################################

X, y = get_x_y(1000)

In [78]:
#############################################
# Split into training and test set
#############################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [79]:
#############################################
# Standard scale
#############################################

scaler = StandardScaler()
scaler.fit(X_train)

X_test = scaler.transform(X_test)

In [81]:
#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################

paramgrid = {'n_estimators': [10, 50, 100], 'max_depth': [1, 5, 10, 15]}

rf_clf = RandomForestClassifier(random_state=42)

grid_search(X_train, y_train, rf_clf, paramgrid)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'n_estimators': 50, 'max_depth': 5}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.95      1.00      0.98       238
        1.0       0.00      0.00      0.00        12

avg / total       0.91      0.95      0.93       250



[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    3.7s finished


In [46]:
#############################################
# Random Forest
#############################################

# Replace labels (in case SVM was run)
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.

rf_clf = RandomForestClassifier(random_state=42, 
                                n_estimators=100, 
                                max_depth=15, 
#                                 class_weight={1.0: 1, -1.0: 150})
                                )

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_report(y_test, y_pred)

             precision    recall  f1-score   support

         -1       0.73      0.49      0.58      2280
          1       0.98      0.99      0.99     53553

avg / total       0.97      0.97      0.97     55833



In [40]:
#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

top_n = get_top_n(10, rf_clf.feature_importances_, get_columns())

for t in top_n:
    print t


('type1', 0.085836245627072427)
('last3', 0.057453894299753429)
('close2', 0.013785569650024256)
('close3', 0.013742677538063049)
('diss0promerdummy', 0.0065234843229324416)
('unanimous', 0.0057345487383386635)
('dissent', 0.0047420628379996922)
('din', 0.0043813667302849586)
('concprodummy', 0.0036449179471683664)
('keytotal', 0.0035560335123791609)


In [97]:
#############################################
# [OPTIONAL]
# SVM Grid Search
#############################################

paramgrid = {'kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 
             'degree': [1, 3, 5, 7, 9], 
             'coef0': [1e-3, 1e-1, 1e1, 1e3], 
             'max_iter': [1000], 
             'class_weight': [{1.0: 1, -1.0: 150}]}

svm_clf = SVC()

grid_search(X_train, y_train, svm_clf, paramgrid)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:   15.5s


Fitting 3 folds for each of 80 candidates, totalling 240 fits
{'kernel': 'poly', 'max_iter': 1000, 'coef0': 1000.0, 'degree': 1, 'class_weight': {1.0: 1, -1.0: 150}}
SVC(C=1.0, cache_size=200, class_weight={1.0: 1, -1.0: 150}, coef0=1000.0,
  decision_function_shape=None, degree=1, gamma='auto', kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        11
          1       0.98      1.00      0.99       489

avg / total       0.96      0.98      0.97       500



[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   18.5s finished


In [65]:
#############################################
# SVM
#############################################

# Replace labels
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.

svm_clf = SVC(kernel='rbf', max_iter=1000, coef0=1e-3, degree=2, class_weight={1.0: 1, -1.0: 150})

svm_clf.fit(X_train, y_train)

y_pred = svm_clf.predict(X_test)

print_report(y_test, y_pred)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        11
          1       0.98      1.00      0.99       489

avg / total       0.96      0.98      0.97       500



In [13]:
# tdf = pd.get_dummies(pd.read_csv('final_outs.csv', low_memory=False))

# pd.unique(tdf.ix[:,1].values)

array([ 1, -1])