In [13]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score,classification_report
from sklearn.preprocessing import Normalizer,MinMaxScaler,StandardScaler,normalize
from sklearn.cross_validation import train_test_split

In [14]:
pd.set_option('max_columns',1000)

In [39]:
def remove_column_from_data_frame(col_to_remove, data_frame):

    if col_to_remove in list(data_frame.columns):
        data_frame.drop(col_to_remove, axis=1, inplace=True)

        
def remove_columns_from_data_frame(cols_to_remove, data_frame):

    column_dict = {x: None for x in list(data_frame.columns)}

    cols_to_remove = [x for x in cols_to_remove if x in column_dict]

    data_frame.drop(labels=cols_to_remove, axis=1, inplace=True)
    

def remove_columns_like(column_pattern, data_frame):
    
    for column in list(data_frame.columns):
        if column_pattern in column:
            data_frame.drop(column, axis=1, inplace=True)


def fill_nas(value, data_frame):
    
    data_frame.fillna(0, inplace=True)

In [64]:
def get_data():

    df = pd.get_dummies(pd.read_csv('final_feats_without_dummies.csv', low_memory=False))
    
    y_label_column = 'dissentdummy'
    
    # Extract labels
    df_y = df[y_label_column]
    
    # Drop labels and a redundant column
    remove_columns_from_data_frame(['Unnamed: 0', 'dissent', y_label_column], df)
    
    # Extras -- for testing
    remove_columns_from_data_frame(['unanimous', 'unan'], df)
    remove_columns_like('diss', df)
    
    return df, df_y


def get_x_y():
    
    df, df_y = get_data()

    fill_nas(0, df)
    
    return df.values, df_y.values


def print_report(y, y_pred):
    
    print accuracy_score(y, y_pred)

    print classification_report(y, y_pred)
    


In [65]:
#############################################
# Read data into X and y
#############################################

X, y = get_x_y()

In [66]:
#############################################
# Split into training and test set
#############################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [67]:
#############################################
# Normalize / Scale data
#############################################

scaler = StandardScaler()
scaler.fit(X_train)

X_test = scaler.transform(X_test)

In [163]:
#############################################
# [OPTIONAL]
# Grid Search
#############################################

paramgrid = {'n_estimators':[10, 50, 100],'max_depth':[1, 5, 10, 15]}

rf_clf = RandomForestClassifier(random_state=42)

scorer = make_scorer(accuracy_score)

gridclf = GridSearchCV(rf_clf,paramgrid,scoring=scorer,cv=3,verbose=1)

gridclf.fit(X_train, y_train)

print_report(y_test, gridclf.predict(X_test))

In [68]:
#############################################
# Random Forest
#############################################

rf_clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)

rf_clf.fit(X_train, y_train)

print_report(y_test, rf_clf.predict(X_test))

0.921229380474
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96     51435
        1.0       0.00      0.00      0.00      4398

avg / total       0.85      0.92      0.88     55833



  'precision', 'predicted', average, warn_for)


In [69]:
#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

def get_top_n(n, arr, col_names, prev_list=[]):
    
    if n <= 0:
        return []
    
    most_imp = -1
    most_imp_index = -1

    for i in range(len(arr)):

        if i in prev_list:
            continue

        if arr[i] > most_imp:
            most_imp = arr[i]
            most_imp_index = i

    prev_list.append(most_imp_index)

    return [ (col_names[most_imp_index], most_imp) ] + get_top_n(n - 1, arr, col_names, prev_list)



top_n = get_top_n(10, rf_clf.feature_importances_, list(df.columns))

for t in top_n:
    print t


('Wlengthopin', 0.019636283250984456)
('judrev', 0.017182957587283217)
('opp_wins', 0.016535501959704511)
('Wopinionlenght', 0.015455500062324837)
('sumappress', 0.011438626888425275)
('d13', 0.011411582414060347)
('Wtotalcites', 0.0098933474016059206)
('appel1', 0.0095519397479721466)
('month911_t0', 0.0095426932354118251)
('preg2', 0.0093337355678511941)
