In [12]:
import argparse
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression

In [6]:
# %load ../runner/run_learning.py
# Thanks to https://github.com/rayidghani/magicloops/blob/master/magicloops.py
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import random
import pylab as pl
import matplotlib.pyplot as plt
from scipy import optimize
import time
import seaborn as sns

# for jupyter notebooks
# %matplotlib inline

# if you're running this in a jupyter notebook, print out the graphs
NOTEBOOK = 1

import time                                                

def timeit(method):

    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        print '%r (%r, %r) %2.2f sec' % \
              (method.__name__, args, kw, te-ts)
        return result

    return timed


def define_clfs_params(grid_size):

    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3)
            }

    large_grid = {
    'RF':{'n_estimators': [1, 10, 100, 1000, 10000], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'LR': { 'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
    'SGD': { 'loss': ['hinge', 'log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
    'ET': { 'n_estimators': [1, 10, 100, 1000, 10000], 'criterion' : ['gini', 'entropy'] , 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
    'GB': {'n_estimators': [1, 10, 100, 1000, 10000], 'learning_rate' : [0.001, 0.01, 0.05, 0.1, 0.5], 'subsample' : [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'SVM' :{'C' :[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel':['linear']},
    'KNN' :{'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
           }
    
    semi_large_grid = {
    'RF':{'n_estimators': [1, 10, 100, 1000], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'LR': { 'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
    'SGD': { 'loss': ['hinge', 'log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
    'ET': { 'n_estimators': [1, 10, 100, 1000], 'criterion' : ['gini', 'entropy'] , 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
    'GB': {'n_estimators': [1, 10, 100, 1000], 'learning_rate' : [0.001, 0.01, 0.05, 0.1, 0.5], 'subsample' : [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'SVM' :{'C' :[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel':['linear']},
    'KNN' :{'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    

    small_grid = {
    'RF':{'n_estimators': [10, 100], 'max_depth': [5, 50], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 10]},
    'LR': { 'penalty': ['l1', 'l2'], 'C': [0.00001, 0.001, 0.1, 1, 10]},
    'SGD': { 'loss': ['hinge', 'log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
    'ET': { 'n_estimators': [10, 100], 'criterion' : ['gini', 'entropy'] , 'max_depth': [5, 50], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 10]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
    'GB': {'n_estimators': [10, 100], 'learning_rate' : [0.001, 0.1, 0.5], 'subsample' : [0.1, 0.5, 1.0], 'max_depth': [5, 50]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
    'SVM' :{'C' :[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel':['linear']},
    'KNN' :{'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
           }

    test_grid = {
    'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
    'LR': { 'penalty': ['l1'], 'C': [0.01]},
    'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
    'ET': { 'n_estimators': [1], 'criterion' : ['gini'] , 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
    'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
    'GB': {'n_estimators': [1], 'learning_rate' : [0.1], 'subsample' : [0.5], 'max_depth': [1]},
    'NB' : {},
    'DT': {'criterion': ['gini'], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
    'SVM' :{'C' :[0.01], 'kernel':['linear']},
    'KNN' :{'n_neighbors': [5], 'weights': ['uniform'], 'algorithm': ['auto']}
           }

    if (grid_size == 'large'):
        return clfs, large_grid
    elif (grid_size == 'semi_large'):
        return clfs, semi_large_grid
    elif (grid_size == 'small'):
        return clfs, small_grid
    elif (grid_size == 'test'):
        return clfs, test_grid
    else:
        return 0, 0

def generate_binary_at_k(y_scores, k):
    cutoff_index = int(len(y_scores) * (k / 100.0))
    test_predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
    return test_predictions_binary

def precision_at_k(y_true, y_scores, k):
    preds_at_k = generate_binary_at_k(y_scores, k)
    #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
    #precision = precision[1]  # only interested in precision for label 1
    precision = precision_score(y_true, preds_at_k)
    return precision

def plot_precision_recall_n(y_true, y_prob, model_name):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score >= value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)

    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax1.set_ylim([0, 1])
    ax1.set_ylim([0, 1])
    ax2.set_xlim([0, 1])

    name = model_name
    plt.title(name)
    # plt.savefig(name)
    plt.show()


@timeit
def clf_loop(models_to_run, clfs, grid, X, y):
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'auc-roc', 'p_at_5', 'p_at_10', 'p_at_100'))
    for n in range(1, 2):
        # create training and valdation sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            model_name = models_to_run[index]
            print model_name
            ts = time.time()
            parameter_values = grid[model_name]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [model_name, clf, p,
                                                       roc_auc_score(y_test, y_pred_probs),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 100.0)]
                   # if NOTEBOOK == 1:
                        #plot_precision_recall_n(y_test, y_pred_probs, clf)
                except IndexError, e:
                    print 'Error:', e
                    continue
            te = time.time()
            print '%r  %2.2f sec' % (model_name, te-ts)
    return results_df



def start_from_text():

    grid_size = 'test'
    clfs, grid = define_clfs_params(grid_size)
    #models_to_run = ['RF', 'DT', 'KNN', 'ET', 'AB', 'GB', 'LR', 'NB']

    df = pd.read_csv("../data/train_data_features.csv")
    df.head(1)
    df['fake_news_score_binary'] =  df['fake_news_score'] == 3
    features  =  ['title_number_char', 'title_number_stopwords', 
                  'body_number_char', 'body_number_stopwords', ]

    df.head()
    X = df[features]
    X.head()
    y = df.fake_news_score_binary
    results_df = clf_loop(models_to_run, clfs, grid, X, y)
    if NOTEBOOK == 1:
        results_df
    results_df.head()
    results_df.to_csv('results.csv', index=False)
    print "The End"

def start_from_vector():
    pass
    
if __name__ == '__main__':
    #start_from_text()
    #start_from_vector()
    pass


SyntaxError: invalid syntax (<ipython-input-6-ef9178ad4956>, line 84)

In [4]:
grid_size = 'small'
clfs, grid = define_clfs_params(grid_size)
models_to_run = ['RF', 'DT', 'KNN', 'ET', 'GB', 'LR', 'NB']

df = pd.read_csv("../data/train_data_features.csv")
 

df['fake_news_score_binary'] =  df['fake_news_score'] == 3
features  =  ['title_number_char', 'title_number_stopwords', 'title_number_words', 
              'title_number_symbols',
              'body_number_char', 'body_number_stopwords', 'body_number_words', 'body_avg_char_per_word',
              'body_number_symbols']

df.head(2)
df.fillna(value=0, inplace=True)
X = df[features] 
y = df.fake_news_score_binary

NameError: name 'define_clfs_params' is not defined

In [None]:
results_df = clf_loop(models_to_run, clfs, grid, X, y)

NameError: name 'df' is not defined

In [8]:
data = pd.read_csv('../data/FN_Training_Set.csv', encoding='windows-1251')


In [14]:
data.shape

(2815, 6)

In [15]:
import numpy as np


In [16]:
x = np.zeros(10,15,5)

TypeError: data type not understood