In [1]:
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_regression
from unbalanced_dataset.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn import decomposition
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import csv
import math
import random
import pickle
import numpy as np

# Arguments

# Positional command-line arguments, in order:
#   1: CSV used for training (history), 2: CSV to score (master),
#   3: name of the history column that is combined with '__changed' to build
#      the label, 4: path of the CSV to write.
(history_file,
 master_file,
 label,
 out_file) = [sys.argv[position] for position in (1, 2, 3, 4)]

# Helpers

def dropColumns(data) :
    """Return a copy of ``data`` without the non-feature columns.

    Removed columns:
      * every column whose name ends with ':raw',
      * every column whose name contains 'author',
      * the fixed bookkeeping columns listed below.

    Names that are not present in ``data`` are skipped silently
    (``errors='ignore'``). ``data`` itself is not modified.
    """
    toDrop = [
        x for x in data.columns.values
        if x.endswith(':raw') or 'author' in x
    ]
    # Further filters that were left disabled in the original:
    # toDrop += [x for x in data.columns.values if 'date' in x and 'weighted' in x]
    # toDrop += [x for x in data.columns.values if 'others' in x]
    toDrop += [
        '__date',
        '__filename',
        '__changed',
        '_mostChanged',
        '_mostChanged25',
        '_mostChanged50',
        '_mostChanged75',
    ]

    # `axis` must be passed by keyword: the positional form was deprecated and
    # later removed from pandas, while `axis=1` works on every version.
    return data.drop(toDrop, axis=1, errors='ignore')

def predictProbs(clf, test_data) :
    """Return, for each row of ``test_data``, the probability in column 1 of
    ``clf.predict_proba(test_data)`` as a plain Python list.

    Column 1 is the second entry of the classifier's class ordering; the
    callers in this file treat it as the probability of the '2' label.
    """
    class_probabilities = clf.predict_proba(test_data)
    second_class = class_probabilities[:, 1]
    return second_class.tolist()

def testMean(clf, data) :
    """Mean column-1 probability that ``clf`` assigns to the rows of ``data``,
    expressed as a percentage (mean * 100).

    ``data`` must carry a 'label' column; it is removed before the rows are
    handed to the classifier.
    """
    # Keyword `axis`: the positional form `drop('label', 1)` is rejected by
    # current pandas.
    features = data.drop('label', axis=1)
    return np.mean(predictProbs(clf, features)) * 100

def testMeanDiff(clf, data) :
    """Score ``clf`` by how far apart it places the two label groups.

    Returns ``testMean`` over the rows labelled '2' minus ``testMean`` over the
    rows labelled '0' (both in percentage points); larger means the classifier
    separates the two groups better.
    """
    rows_labelled_2 = data[data.label == '2']
    rows_labelled_0 = data[data.label == '0']
    return testMean(clf, rows_labelled_2) - testMean(clf, rows_labelled_0)
    
def train(train, test) :
    """Fit one random forest on ``train`` and score it on ``test``.

    Both arguments are DataFrames with a 'label' column. The training features
    and labels are first passed through SMOTE (``kind='regular'``); the
    resampled matrix and labels it returns are what the forest is fitted on.

    Returns a two-element list ``[score, clf]`` where ``score`` is
    ``testMeanDiff(clf, test)`` and ``clf`` is the fitted classifier.

    Note: the first parameter shadows this function's own name inside the body.
    """
    smote = SMOTE(kind='regular', verbose=False)
    # Keyword `axis`: the positional form `drop('label', 1)` is rejected by
    # current pandas.
    train_features = train.drop('label', axis=1)
    train_matrix, train_labels = smote.fit_transform(train_features, train.label)

    # No random_state is set, so repeated calls on the same data give
    # different forests.
    clf = RandomForestClassifier(n_estimators=5, n_jobs=3, criterion='entropy')
    clf.fit(train_matrix, train_labels)

    return [testMeanDiff(clf, test), clf]

# Get File

history_data = pd.read_csv(history_file)
master_data = pd.read_csv(master_file)

# Add Label

# The label is the string form of `__changed` + the column named on the
# command line. Rows labelled '0' are used as the clean class and rows
# labelled '2' as the buggy class below; any other value is left out.
# NOTE(review): if either column is parsed as float the strings become
# '0.0' / '2.0' and both filters below select nothing - confirm the CSV
# columns are integer-typed.
history_data['label'] = history_data['__changed'] + history_data[label]
history_data['label'] = history_data['label'].astype(str)

# Remove Columns

history_data = dropColumns(history_data)
dropped_master_data = dropColumns(master_data)

# Train set cut

clean_history = history_data[history_data.label == '0']
buggy_history = history_data[history_data.label == '2']

# Keep every buggy row and at most five clean rows per buggy row. The clean
# rows are the leading ones in file order, not a random sample. A slice bound
# past the end is clamped, so no explicit min()/floor() is needed.
buggy_train_data = buggy_history
clean_train_data = clean_history.iloc[:len(buggy_train_data) * 5]

# Shuffle the rows, then renumber them 0..n-1 so that the positional indices
# produced by the cross-validation splitter line up with the frame.
train_data = pd.concat([buggy_train_data, clean_train_data])
train_data = train_data.reindex(np.random.permutation(train_data.index))
train_data = train_data.reset_index(drop=True)

# Train

# Ten repetitions of stratified 10-fold cross-validation: 100 [score, clf]
# pairs in total.
models = []
for i in range(0, 10) :
    # Build the splitter inside the loop: a sklearn.cross_validation
    # StratifiedKFold assigns samples to folds once, when it is constructed,
    # so re-iterating a single instance would replay the same ten splits.
    skf = StratifiedKFold(train_data.label, n_folds=10, shuffle=True)
    for train_index, test_index in skf:
        # The splitter yields positions; train_data has a fresh 0..n-1 index,
        # so positional .iloc selects the same rows the removed .ix did.
        models.append(train(train_data.iloc[train_index], train_data.iloc[test_index]))

# Highest testMeanDiff score first; the top 20 classifiers form the ensemble.
models.sort(key=lambda x: x[0], reverse=True)
best_models = models[:20]

# Report the mean score of the ten best models.
print(np.mean([x[0] for x in models[:10]]))

# Predict using all models

# One list of column-1 probabilities per selected model, each aligned with the
# rows of dropped_master_data.
results = []
for best_model in best_models :
    results.append(predictProbs(best_model[1], dropped_master_data))

# Average across models to get one score per master row.
master_data['result'] = np.mean(results, axis=0)

# Predict

# [filename, score] pairs. .tolist() yields plain Python str/float values,
# which is what the row-by-row iteration produced, so the CSV text is the same.
predictions = [
    [filename, result]
    for filename, result in zip(master_data['__filename'].tolist(),
                                master_data['result'].tolist())
]

# CSV

# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it some platforms emit blank lines
# between rows.
with open(out_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(predictions)


68.1846405229


AttributeError: 'list' object has no attribute 'predict_proba'

AttributeError: 'str' object has no attribute '__filename'

In [15]:
# Print each master file name followed by its averaged score, one per line.
# Plain column access replaces the removed `.ix` indexer; .tolist() keeps the
# values as Python str/float so they print exactly as before.
for filename, result in zip(master_data['__filename'].tolist(),
                            master_data['result'].tolist()) :
    print(filename)
    print(result)

src/main/java/org/apache/commons/math3/Field.java
0.0
src/main/java/org/apache/commons/math3/RealFieldElement.java
0.04
src/main/java/org/apache/commons/math3/FieldElement.java
0.05
src/main/java/org/apache/commons/math3/package-info.java
0.0
src/main/java/org/apache/commons/math3/analysis/DifferentiableMultivariateFunction.java
0.04
src/main/java/org/apache/commons/math3/analysis/DifferentiableMultivariateVectorFunction.java
0.030000000000000006
src/main/java/org/apache/commons/math3/analysis/DifferentiableUnivariateMatrixFunction.java
0.030000000000000006
src/main/java/org/apache/commons/math3/analysis/DifferentiableUnivariateVectorFunction.java
0.030000000000000006
src/main/java/org/apache/commons/math3/analysis/DifferentiableUnivariateFunction.java
0.02
src/main/java/org/apache/commons/math3/analysis/MultivariateFunction.java
0.0
src/main/java/org/apache/commons/math3/analysis/MultivariateVectorFunction.java
0.02
src/main/java/org/apache/commons/math3/analysis/FunctionUtils.java
0.

In [9]:
# Preview the first rows only instead of rendering the whole frame.
master_data.head(10)

Unnamed: 0,__changed,__date,__filename,_bytes,_lines,_mostChanged,_mostChanged25,_mostChanged50,_mostChanged75,authorChanges::bayard@apache.org:normalized,authorChanges::bayard@apache.org:raw,authorChanges::billbarker@apache.org:normalized,authorChanges::billbarker@apache.org:raw,authorChanges::brentworden@apache.org:normalized,authorChanges::brentworden@apache.org:raw,authorChanges::celestin@apache.org:normalized,authorChanges::celestin@apache.org:raw,authorChanges::danielsh@apache.org:normalized,authorChanges::danielsh@apache.org:raw,authorChanges::dbrosius@apache.org:normalized,Unnamed: 21
0.0,0,1377964716,src/main/java/org/apache/commons/math3/Field.java,2079,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
1.0,0,1377964716,src/main/java/org/apache/commons/math3/RealFie...,12749,403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
2.0,0,1377964716,src/main/java/org/apache/commons/math3/FieldEl...,3155,88,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...
3.0,0,1377964716,src/main/java/org/apache/commons/math3/package...,905,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
4.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,2249,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
5.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,1389,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
6.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,1394,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
7.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,1396,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
8.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,1347,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
9.0,0,1377964716,src/main/java/org/apache/commons/math3/analysi...,1824,43,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
