In [1]:
import pickle
import pandas
import random
import numpy as np
import math


from functools import reduce
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
def data_unpickle(path):
    with open(path, "rb") as reader:
        feature_data = pickle.load(reader)
    return feature_data

def data_pickle(path, data):
    with open(path, "wb") as writer:
        pickle.dump(data, writer)

class Maker_XY():
    def __init__(self):
        self.scaler = None
    
    def __call__(self, working_df, feature_lists):
        Y = working_df["BUNumber"].tolist()
        jiras = working_df["JiraID"].tolist()
        X = [feature_lists[str(jira)] for jira in jiras]
        if self.scaler is None:
            self.scaler = MinMaxScaler()
            X = self.scaler.fit_transform(X)
        else:
            self.scaler.partial_fit(X)
            self.scaler.transform(X)
        return np.array(X), np.array(Y), np.array(jiras)


def add_role(X, working_df):
    roles = working_df["Role"].tolist()
    unique_roles = list(set(roles))
    num_roles = np.array([unique_roles.index(role) for role in roles])
    return np.concatenate((X, num_roles.reshape(-1, 1)), axis=1)

def test_classification(X_test, Y_test, clf, threshold):
    right_guess = 0
    res_str = ""
    for i in range(X_test.shape[0]):
        prediction = clf.predict_proba(X_test[i, :].reshape(1, -1))[0]
        pred_class = sorted(zip(clf.classes_, prediction), key=lambda item: item[1], reverse=True)
        if Y_test[i] in [item[0] for item in pred_class[:threshold]]:
            right_guess += 1
        res_str += "Pred_class:{}\nReal class:{}\n".format(pred_class[:threshold], Y_test[i])
    print("Right guess {} from {}".format(right_guess, X_test.shape[0]))
    print("Accuracy:{}".format(right_guess/X_test.shape[0]))
    
    
class cut_log_loss_scorer:
    def __call__(self, estimator, X, y):
        loss = 0
        predictions = estimator.predict_proba(X)
        pred_class = dict(zip(estimator.classes_, range(predictions.shape[1])))
        for i in range(X.shape[0]):
            if y[i] in pred_class:
                p = predictions[i, pred_class[y[i]]]
            else:
                p = 0
            loss = math.log(p + 0.0001)
        return loss/X.shape[0]
    
def get_split_indices(arr, n_splits):
    split_indices = [[] for j in range(n_splits)]
    for i in range(arr.shape[0]):
        r = random.randint(0, n_splits - 1)
        split_indices[r].append(i)
    return split_indices
    
    
class WeightedRoleMetric:
    def __init__(self, w, length):
        self.w = w
        self.null = np.array([0 for i in range(length)])
    
    def __call__(self, a, b):
        c = a[:-1] - b[:-1]
        if not np.array_equal(a[-1], b[-1]):
            c *= self.w
        return minkowski(c, self.null, 2)
    
    
class BaseLineClassifier:
    def fit(self, Y_train):
        classes_probs = sorted([(item[0], item[1]/Y_train.shape[0]) for item in Counter(Y_train).items()], 
                                 key = lambda item: item[1], reverse=True)
        self.classes_ = [item[0] for item in classes_probs]
        self.probs = [[item[1] for item in classes_probs]]
        return self
    
    def predict_proba(self, x):
        return self.probs
    
    
def prepare_matrix(probs, jiras, classes):
    res_jiras = reduce(lambda x, y: x + y, [j.tolist() for j in jiras])
    res_classes = sorted(list(set(reduce(lambda x, y: x + y, [c.tolist() for c in classes]))))
    res = []
    for i in range(len(probs)):
        for j in range(probs[i].shape[0]):
            loc_res = []
            for k in range(len(res_classes)):
                if res_classes[k] in classes[i].tolist():
                    loc_res.append(probs[i][j, classes[i].tolist().index(res_classes[k])])
                else:
                    loc_res.append(-1)
            res.append(loc_res)
    return res_jiras, res_classes, np.array(res)

In [3]:
df = pandas.read_csv("data/full_JiraID_BUNumber.csv")
df.shape

(307682, 10)

In [4]:
CUR_ROLE = "Developer"
SPLIT_NUM = 10
N_NEIGHBORS = 10
TOP = 5
GRID_SEARCH_PARAMETERS = {"lda__n_components": [5, 10, 20, 30, 40, 50, 100],
                          "knn__n_neighbors": [10, 20], 
                          "knn__weights": ["distance"], 
                          "knn__metric": ["minkowski", "manhattan"],
                          "knn__p": [1, 2]}

In [5]:
developer_feature_lists = data_unpickle("data/res8/developers_feature_lists.pickle")

In [6]:
df = df[df["JiraID"].isin(developer_feature_lists.keys())]
df.shape

(72171, 10)

In [7]:
cur_df = df[df["Timeline"] == "2017-07-01 00:00:00"][df["Role"] == CUR_ROLE][["JiraID", "Role", "BUNumber"]].drop_duplicates()
cur_df.shape

  """Entry point for launching an IPython kernel.


(2281, 3)

In [8]:
sup_df = df[df["Role"] == CUR_ROLE][df["BUNumber"].isin(set(cur_df["BUNumber"]))][["JiraID", "Role", "BUNumber"]].drop_duplicates()
sup_df.shape

  """Entry point for launching an IPython kernel.


(2896, 3)

In [9]:
cur_df[["BUNumber", "JiraID"]].groupby("BUNumber").count().sort_values("JiraID", ascending=False)

Unnamed: 0_level_0,JiraID
BUNumber,Unnamed: 1_level_1
130,68
878,34
603,33
850,33
915,32
1219,30
852,30
267,30
858,29
857,29


In [10]:
DISBAND_BUNUMBER = 130

In [11]:
test_df = cur_df[cur_df["BUNumber"] == DISBAND_BUNUMBER]
test_df.shape

(68, 3)

In [12]:
train_df = cur_df[cur_df["BUNumber"] != DISBAND_BUNUMBER]
train_df.shape

(2213, 3)

In [13]:
maker_XY = Maker_XY()

In [14]:
train_X, train_Y, train_jiras = maker_XY(train_df, developer_feature_lists)
print(train_X.shape)
print(train_Y.shape)
print(train_jiras.shape)

(2213, 1645)
(2213,)
(2213,)


In [15]:
test_X, test_Y, test_jiras = maker_XY(test_df, developer_feature_lists)
print(test_X.shape)
print(test_Y.shape)
print(test_jiras.shape)

(68, 1645)
(68,)
(68,)


In [16]:
sub_sup_df = sup_df[~sup_df["JiraID"].isin(test_df["JiraID"])][sup_df["BUNumber"] != DISBAND_BUNUMBER]
sub_sup_df.shape

  """Entry point for launching an IPython kernel.


(2820, 3)

In [17]:
sub_sup_X, sub_sup_Y, _ = maker_XY(sub_sup_df, developer_feature_lists)
print(sub_sup_X.shape)
print(sub_sup_Y.shape)

(2820, 1645)
(2820,)


In [18]:
train_joined_X = np.concatenate((train_X, sub_sup_X), axis=0)
train_joined_Y = np.concatenate((train_Y, sub_sup_Y), axis=0)
print(train_joined_X.shape)
print(train_joined_Y.shape)

(5033, 1645)
(5033,)


In [19]:
pipeline = Pipeline([("lda", LatentDirichletAllocation()), ("knn", KNeighborsClassifier())])
cv = GridSearchCV(pipeline, param_grid=GRID_SEARCH_PARAMETERS, n_jobs=8,
                 scoring=cut_log_loss_scorer())    
cv.fit(train_joined_X, train_joined_Y)



GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('lda', LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
           ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'lda__n_components': [5, 10, 20, 30, 40, 50, 100], 'knn__n_neighbors': [10, 20], 'knn__weights': ['distance'], 'knn__metric': ['minkowski', 'manhattan'], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<__main__.cut_log_loss_scorer object at 0x7f8738d2cc18>,
       verbose=0)

In [20]:
print("CV parameters: ", cv.best_params_)

CV parameters:  {'knn__metric': 'minkowski', 'knn__n_neighbors': 10, 'knn__p': 2, 'knn__weights': 'distance', 'lda__n_components': 5}


In [21]:
res_probs = [cv.predict_proba(test_X)]
res_real = [test_Y]
res_jiras = [test_jiras]
res_classes = [cv.classes_]

In [22]:
print(res_probs[0].shape)

(68, 334)


In [23]:
print(res_real[0].shape)

(68,)


In [24]:
print(res_jiras[0].shape)

(68,)


In [25]:
print(res_classes[0].shape)

(334,)


In [26]:
fin_res_jiras, fin_res_classes, fin_res = prepare_matrix(res_probs, res_jiras, res_classes)
print(len(fin_res_jiras))
print(len(fin_res_classes))
print(fin_res.shape)

68
334
(68, 334)


In [27]:
data_pickle("data/res8/knn_lda/fin_res_jiras2.pickle", fin_res_jiras)
data_pickle("data/res8/knn_lda/fin_res_classes2.pickle", fin_res_classes)
data_pickle("data/res8/knn_lda/fin_res2.pickle", fin_res)

In [28]:
fin_res_real = reduce(lambda x, y: x + y, [r_r.tolist() for r_r in res_real])
print(len(fin_res_real))

68


In [29]:
data_pickle("data/res8/knn_lda/fin_res_real2.pickle", fin_res_real)

In [30]:
fin_res_df = pandas.DataFrame.from_records(data=fin_res, index=fin_res_jiras, columns=fin_res_classes)

In [31]:
fin_res_df.to_csv("data/res8/knn_lda/fin_res_df2.csv")

In [32]:
fin_res_df.shape

(68, 334)