In [63]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
import csv

## Read train features jahnavi (with author features)

In [15]:
train_features = np.genfromtxt('features_train_unscaled_authors.csv', delimiter=',')

In [16]:
print train_features.shape
train_features = train_features[1:] # Ignoring the header
train_features[0]

(615513, 17)


array([  9.51012300e+06,   9.50211400e+06,   1.00000000e+00,
         2.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         2.00000000e+00,   5.88235294e-02,   5.13898342e-01,
         1.26084128e-01,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         2.44000000e+02,   2.00000000e+00])

## Adding citation features:

In [17]:
avg_citation_in_source = np.genfromtxt('avg_citation_in_source.txt')
avg_citation_in_target = np.genfromtxt('avg_citation_in_target.txt')
avg_citation_out_source = np.genfromtxt('avg_citation_out_source.txt')
avg_citation_out_target = np.genfromtxt('avg_citation_out_target.txt')

In [18]:
train_features_new = np.array([avg_citation_in_source, avg_citation_in_target, avg_citation_out_source, avg_citation_out_target])

In [19]:
train_features_new.shape

(4, 615512)

In [20]:
train_features = np.concatenate((train_features, train_features_new.T), axis=1)

In [21]:
train_features.shape

(615512, 21)

In [22]:
X = train_features[:,3:]
y = train_features[:,2]
y = np.array(y, dtype=int)

In [23]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

In [24]:
# selection = SelectKBest(mutual_info_classif, k=4)
# X_new = selection.fit_transform(X, y)
# print selection.get_support()
# print selection.scores_

In [25]:
# np.argsort(selection.scores_)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=42)

## RandomForest

In [58]:
clf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features=7, n_jobs=4)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=7, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=4, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [59]:
y_train_prediction=clf.predict(X_train)

In [60]:
f1score = f1_score(y_train, y_train_prediction)
acc = accuracy_score(y_train, y_train_prediction)
print 'F1-score: ', f1score
print 'Accuracy: ', acc

F1-score:  0.977038714761
Accuracy:  0.975195990233


In [61]:
y_val_pred = clf.predict(X_val)
f1score = f1_score(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
print 'F1-score: ', f1score
print 'Accuracy: ', acc

F1-score:  0.975585564271
Accuracy:  0.973581988883


## AdaBoost

In [74]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),
                         algorithm="SAMME.R",
                         n_estimators=200,learning_rate=0.2,random_state=42)

In [75]:
%time
bdt.fit(X_train, y_train)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.06 µs


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=0.2, n_estimators=200, random_state=42)

In [76]:
y_train_prediction=bdt.predict(X_train)

In [77]:
f1score = f1_score(y_train, y_train_prediction)
acc = accuracy_score(y_train, y_train_prediction)
print 'F1-score: ', f1score
print 'Accuracy: ', acc

F1-score:  0.977340864702
Accuracy:  0.975496674289


In [78]:
y_val_pred = bdt.predict(X_val)
f1score = f1_score(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
print 'F1-score: ', f1score
print 'Accuracy: ', acc

F1-score:  0.976445999364
Accuracy:  0.974478015351


In [88]:
n_estimators = [300,500]
max_depth = [10,12]
max_features=[8,10]
best_random = None
best_params = {}
best_f1 = 0.0

for n_e in n_estimators:
    for d in max_depth:
        for n in max_features:
            
            print "Parameters: n_estimator: ",n_e,  " Max depth of tree: ",d , " Max features: ",n
            
            random_clf = RandomForestClassifier(n_estimators = n_e, max_depth = d , max_features=n)
            
            temp_clf = random_clf.fit(X_train, y_train)
            y_train_prediction = temp_clf.predict(X_train)
            f1score_train = f1_score(y_train, y_train_prediction)
            print 'Train F1-score: ', f1score_train,
            
            y_val_pred = temp_clf.predict(X_val)
            f1score_validation = f1_score(y_val, y_val_pred)
            print '\tValidation F1-score: ', f1score_validation
            
            if f1score_validation > best_f1:
                best_f1 = f1score_validation
                best_random = temp_clf
                best_params['n_estimator'] = n_e
                best_params['max_depth'] = d
                best_params['max_features'] = n
                print "Classifier is: ", best_random
                print "Score improved with params: ", best_params

print best_random
print best_params
print best_f1

Parameters: n_estimator:  300  Max depth of tree:  10  Max features:  8
Train F1-score:  0.977081589183 	Validation F1-score:  0.975597994705
Classifier is:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=8, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Score improved with params:  {'max_features': 8, 'max_depth': 10, 'n_estimator': 300}
Parameters: n_estimator:  300  Max depth of tree:  10  Max features:  10
Train F1-score:  0.977402905213 	Validation F1-score:  0.975602432933
Classifier is:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=10, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_sample

### Running for test data 

In [89]:
test_features = np.genfromtxt('features_test_unscaled_authors.csv', delimiter=',')
print test_features.shape

(32649, 16)


In [90]:
test_features = test_features[1:] # Ignoring the header
print test_features.shape

(32648, 16)


In [91]:
avg_citation_in_source_test = np.genfromtxt('avg_citation_in_source_test.txt')
avg_citation_in_target_test = np.genfromtxt('avg_citation_in_target_test.txt')
avg_citation_out_source_test = np.genfromtxt('avg_citation_out_source_test.txt')
avg_citation_out_target_test = np.genfromtxt('avg_citation_out_target_test.txt')

In [92]:
test_features_new = np.array([avg_citation_in_source_test, avg_citation_in_target_test, avg_citation_out_source_test, avg_citation_out_target_test])

In [93]:
test_features_new.shape

(4, 32648)

In [94]:
test_features = np.concatenate((test_features, test_features_new.T), axis=1)

In [95]:
print test_features.shape

(32648, 20)


In [96]:
X_test = test_features[:,2:]
y_test_pred = best_random.predict(X_test)

In [97]:
test_predictions_random = zip(range(len(y_test_pred)), y_test_pred)

with open("rf_authors_extended_grid.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('ID','category'))
    i = -1
    for row in test_predictions_random:
        i += 1
        if X_test[i][1] < 0:
            hardcode = (i, str(0))
            csv_out.writerow(hardcode)
            continue
        csv_out.writerow(row)

In [None]:
test_predictions_random = zip(range(len(y_test_pred)), y_test_pred)

with open("adaboost_authors.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('ID','category'))
    i = -1
    for row in test_predictions_random:
        i += 1
        if X_test[i][1] < 0:
            hardcode = (i, str(0))
            csv_out.writerow(hardcode)
            continue
        csv_out.writerow(row)

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [27]:
training_features_scaled = preprocessing.scale(X)
X_train, X_val, y_train, y_val = train_test_split(
    training_features_scaled, y, test_size=0.33, random_state=42)


In [29]:
bdt.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=0.2, n_estimators=400, random_state=42)

In [30]:
y_train_prediction=bdt.predict(X_train)

In [31]:
f1score = f1_score(y_train, y_train_prediction)
acc = accuracy_score(y_train, y_train_prediction)
print 'F1-score: ', f1score
print 'Accuracy: ', acc
y_val_pred = bdt.predict(X_val)
f1score = f1_score(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
print 'F1-score: ', f1score
print 'Accuracy: ', acc

F1-score:  0.974799945261
Accuracy:  0.972761419326
F1-score:  0.974871494212
Accuracy:  0.972779503641


In [14]:
test_features = np.genfromtxt('features_test_unscaled_authors.csv', delimiter=',')

In [15]:
print test_features.shape
test_features = test_features[1:] # Ignoring the header
test_features[10]
print test_features.shape

(32649, 16)
(32648, 16)


In [16]:
X_test = test_features[:,2:]

In [17]:
y_test_pred = bdt.predict(X_test)

In [18]:
test_predictions_random = zip(range(len(y_test_pred)), y_test_pred)

with open("adaboost_authors.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(('ID','category'))
    i = -1
    for row in test_predictions_random:
        i += 1
        if X_test[i][1] < 0:
            hardcode = (i, str(0))
            csv_out.writerow(hardcode)
            continue
        csv_out.writerow(row)

In [None]:
print X_test.shape