In [1]:
import pickle
import pandas
import random
import math
import numpy as np


from functools import reduce
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
def data_unpickle(path):
    with open(path, "rb") as reader:
        feature_data = pickle.load(reader)
    return feature_data

def data_pickle(path, data):
    with open(path, "wb") as writer:
        pickle.dump(data, writer)

class Maker_XY():
    def __init__(self):
        self.scaler = None
    
    def __call__(self, working_df, feature_lists):
        Y = working_df["BUNumber"].tolist()
        jiras = working_df["JiraID"].tolist()
        X = [feature_lists[str(jira)] for jira in jiras]
        if self.scaler is None:
            self.scaler = MinMaxScaler()
            X = self.scaler.fit_transform(X)
        else:
            self.scaler.partial_fit(X)
            self.scaler.transform(X)
        return np.array(X), np.array(Y), np.array(jiras)

def add_role(X, working_df):
    roles = working_df["Role"].tolist()
    unique_roles = list(set(roles))
    num_roles = np.array([unique_roles.index(role) for role in roles])
    return np.concatenate((X, num_roles.reshape(-1, 1)), axis=1)

def test_classification(X_test, Y_test, clf, threshold):
    right_guess = 0
    res_str = ""
    for i in range(X_test.shape[0]):
        prediction = clf.predict_proba(X_test[i, :].reshape(1, -1))[0]
        pred_class = sorted(zip(clf.classes_, prediction), key=lambda item: item[1], reverse=True)
        if Y_test[i] in [item[0] for item in pred_class[:threshold]]:
            right_guess += 1
        res_str += "Pred_class:{}\nReal class:{}\n".format(pred_class[:threshold], Y_test[i])
    print("Right guess {} from {}".format(right_guess, X_test.shape[0]))
    print("Accuracy:{}".format(right_guess/X_test.shape[0]))
    
class cut_log_loss_scorer:
    def __call__(self, estimator, X, y):
        loss = 0
        predictions = estimator.predict_proba(X)
        pred_class = dict(zip(estimator.classes_, range(predictions.shape[1])))
        for i in range(X.shape[0]):
            if y[i] in pred_class:
                p = predictions[i, pred_class[y[i]]]
            else:
                p = 0
            loss = math.log(p + 0.0001)
        return loss/X.shape[0]
    
def get_split_indices(arr, n_splits):
    split_indices = [[] for j in range(n_splits)]
    for i in range(arr.shape[0]):
        r = random.randint(0, n_splits - 1)
        split_indices[r].append(i)
    return split_indices
    
    
class WeightedRoleMetric:
    def __init__(self, w, length):
        self.w = w
        self.null = np.array([0 for i in range(length)])
    
    def __call__(self, a, b):
        c = a[:-1] - b[:-1]
        if not np.array_equal(a[-1], b[-1]):
            c *= self.w
        return minkowski(c, self.null, 2)
    
    
class BaseLineClassifier:
    def fit(self, Y_train):
        classes_probs = sorted([(item[0], item[1]/Y_train.shape[0]) for item in Counter(Y_train).items()], 
                                 key = lambda item: item[1], reverse=True)
        self.classes_ = [item[0] for item in classes_probs]
        self.probs = [[item[1] for item in classes_probs]]
        return self
    
    def predict_proba(self, x):
        return self.probs
    
# checked    
def prepare_matrix(probs, jiras, classes):
    res_jiras = reduce(lambda x, y: x + y, [j.tolist() for j in jiras])
    res_classes = sorted(list(set(reduce(lambda x, y: x + y, [c.tolist() for c in classes]))))
    res = []
    for i in range(len(probs)):
        for j in range(probs[i].shape[0]):
            loc_res = []
            for k in range(len(res_classes)):
                if res_classes[k] in classes[i].tolist():
                    loc_res.append(probs[i][j, classes[i].tolist().index(res_classes[k])])
                else:
                    loc_res.append(-1)
            res.append(loc_res)
    return res_jiras, res_classes, np.array(res)

In [3]:
df = pandas.read_csv("data/full_JiraID_BUNumber.csv")
df.shape

(307682, 10)

In [4]:
CUR_ROLE = "Developer"
SPLIT_NUM = 10
N_COMPONENTS = 500
N_NEIGHBORS = 10
TOP = 5
GRID_SEARCH_PARAMETERS = {"pca__n_components": [100, 200],
                          "knn__n_neighbors": [10, 20, 30], 
                          "knn__weights": ["distance"], 
                          "knn__metric": ["minkowski", "manhattan", "chebyshev"],
                          "knn__p": [1, 2]}

In [5]:
developer_feature_lists = data_unpickle("data/res8/developers_feature_lists.pickle")

In [6]:
df = df[df["JiraID"].isin(developer_feature_lists.keys())]
df.shape

(72171, 10)

In [7]:
cur_df = df[df["Timeline"] == "2017-07-01 00:00:00"][df["Role"] == CUR_ROLE][["JiraID", "Role", "BUNumber"]].drop_duplicates()
cur_df.shape

  """Entry point for launching an IPython kernel.


(2281, 3)

In [8]:
sup_df = df[df["Role"] == CUR_ROLE][df["BUNumber"].isin(set(cur_df["BUNumber"]))][["JiraID", "Role", "BUNumber"]].drop_duplicates()
sup_df.shape

  """Entry point for launching an IPython kernel.


(2896, 3)

In [9]:
split_indices = get_split_indices(cur_df, SPLIT_NUM)

In [10]:
maker_XY = Maker_XY()

In [11]:
cur_X, cur_Y, cur_jiras = maker_XY(cur_df, developer_feature_lists)
print(cur_X.shape)
print(cur_Y.shape)
print(cur_jiras.shape)

(2281, 1645)
(2281,)
(2281,)


In [12]:
res_probs = []
res_real = []
res_jiras = []
res_classes = []
for i in range(SPLIT_NUM):
    test_X = cur_X[split_indices[i], :]
    test_Y = cur_Y[split_indices[i]]
    test_jiras = cur_jiras[split_indices[i]]
    
    other_index = reduce(lambda x, y: x + y, [split_indices[j] for j in range(SPLIT_NUM) if j != i])
    
    train_X = cur_X[other_index, :]
    train_Y = cur_Y[other_index]
    
    sub_sup_df = sup_df[~sup_df["JiraID"].isin(test_jiras)]
    sub_sup_X, sub_sup_Y, _ = maker_XY(sub_sup_df, developer_feature_lists)
    
    train_joined_X = np.concatenate((train_X, sub_sup_X), axis=0)
    train_joined_Y = np.concatenate((train_Y, sub_sup_Y), axis=0)
    
    pipeline = Pipeline([("pca", PCA()), ("knn", KNeighborsClassifier())])
    
    cv = GridSearchCV(pipeline, param_grid=GRID_SEARCH_PARAMETERS, 
                      scoring=cut_log_loss_scorer(), n_jobs=8)
    
    cv.fit(train_joined_X, train_joined_Y)
    
    print("CV parameters: ", cv.best_params_)

    baseline = BaseLineClassifier()
    baseline.fit(train_joined_Y)
    
    res_probs.append(cv.predict_proba(test_X))
    res_real.append(test_Y)
    res_jiras.append(test_jiras)
    res_classes.append(cv.classes_)
    
    print("Classifier:")
    test_classification(test_X, test_Y, cv, TOP)
    
    print("Baseline:")
    test_classification(test_X, test_Y, baseline, TOP)
    
    print("-----")

print(len(res_probs))
print(len(res_jiras))



CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 30, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 25 from 206
Accuracy:0.12135922330097088
Baseline:
Right guess 11 from 206
Accuracy:0.05339805825242718
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 16 from 232
Accuracy:0.06896551724137931
Baseline:
Right guess 16 from 232
Accuracy:0.06896551724137931
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 31 from 231
Accuracy:0.1341991341991342
Baseline:
Right guess 16 from 231
Accuracy:0.06926406926406926
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 30, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 28 from 237
Accuracy:0.11814345991561181
Baseline:
Right guess 21 from 237
Accuracy:0.08860759493670886
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 25 from 209
Accuracy:0.11961722488038277
Baseline:
Right guess 17 from 209
Accuracy:0.08133971291866028
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 15 from 231
Accuracy:0.06493506493506493
Baseline:
Right guess 19 from 231
Accuracy:0.08225108225108226
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 20, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 27 from 253
Accuracy:0.1067193675889328
Baseline:
Right guess 21 from 253
Accuracy:0.08300395256916997
-----




CV parameters:  {'knn__metric': 'minkowski', 'knn__n_neighbors': 20, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 28 from 221
Accuracy:0.12669683257918551
Baseline:
Right guess 18 from 221
Accuracy:0.08144796380090498
-----




CV parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 30, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 27 from 226
Accuracy:0.11946902654867257
Baseline:
Right guess 14 from 226
Accuracy:0.061946902654867256
-----




CV parameters:  {'knn__metric': 'chebyshev', 'knn__n_neighbors': 20, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 200}
Classifier:
Right guess 24 from 235
Accuracy:0.10212765957446808
Baseline:
Right guess 9 from 235
Accuracy:0.03829787234042553
-----
10
10


In [21]:
fin_res_jiras, fin_res_classes, fin_res = prepare_matrix(res_probs, res_jiras, res_classes)
print(len(fin_res_jiras))
print(len(fin_res_classes))
print(fin_res.shape)

2281
335
(2281, 335)


In [22]:
data_pickle("data/res8/knn_pca/fin_res_jiras1.pickle", fin_res_jiras)
data_pickle("data/res8/knn_pca/fin_res_classes1.pickle", fin_res_classes)
data_pickle("data/res8/knn_pca/fin_res1.pickle", fin_res)

In [23]:
fin_res_real = reduce(lambda x, y: x + y, [r_r.tolist() for r_r in res_real])

In [24]:
len(fin_res_real)

2281

In [25]:
data_pickle("data/res8/knn_pca/fin_res_real1.pickle", fin_res_real)

In [26]:
fin_res_df = pandas.DataFrame.from_records(data=fin_res, index=fin_res_jiras, columns=fin_res_classes)

In [27]:
fin_res_df.to_csv("data/res8/knn_pca/fin_res_df1.csv")

In [28]:
fin_res_df.shape

(2281, 335)

In [None]:
# checking
tmp_res_probs = [
                    np.array([
                        [1, 2, 3],
                        [4, 5, 6],
                        [7, 8, 9]
                    ]),
                    np.array([
                        [10, 11, 12, 13],
                        [14, 15, 16, 17],
                        [18, 19, 20, 21]
                    ]),
                    np.array([
                        [19, 20, 21],
                        [22, 23, 24]
                    ])
                ]
tmp_res_jiras = [
                    np.array([1, 2, 3]),
                    np.array([4, 5, 6]),
                    np.array([7, 8])
                ]
tmp_res_classes = [
                     np.array([101, 103, 102]),
                     np.array([103, 105, 101, 106]),
                     np.array([100, 101, 102])
                 ]
fin_res_jiras, fin_res_classes, fin_res = prepare_matrix(tmp_res_probs, tmp_res_jiras, tmp_res_classes)
print(fin_res_jiras)
print(fin_res_classes)
print(fin_res)

In [None]:
len(fin_res_classes)