In [1]:
import pandas
import pickle
import random
import numpy as np
import math

from custom_classifiers import LDAClassifier
from functools import reduce
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

In [2]:
def data_unpickle(path):
    with open(path, "rb") as reader:
        feature_data = pickle.load(reader)
    return feature_data

def data_pickle(path, data):
    with open(path, "wb") as writer:
        pickle.dump(data, writer)

class Maker_XY():
    def __init__(self):
        self.scaler = None
    
    def __call__(self, working_df, feature_lists):
        Y = working_df["BUNumber"].tolist()
        jiras = working_df["JiraID"].tolist()
        X = [feature_lists[str(jira)] for jira in jiras]
        if self.scaler is None:
            self.scaler = MinMaxScaler()
            X = self.scaler.fit_transform(X)
        else:
            self.scaler.partial_fit(X)
            self.scaler.transform(X)
        return np.array(X), np.array(Y), np.array(jiras)

def add_role(X, working_df):
    roles = working_df["Role"].tolist()
    unique_roles = list(set(roles))
    num_roles = np.array([unique_roles.index(role) for role in roles])
    return np.concatenate((X, num_roles.reshape(-1, 1)), axis=1)

def test_classification(X_test, Y_test, clf, threshold):
    right_guess = 0
    res_str = ""
    for i in range(X_test.shape[0]):
        prediction = clf.predict_proba(X_test[i, :].reshape(1, -1))[0]
        pred_class = sorted(zip(clf.classes_, prediction), key=lambda item: item[1], reverse=True)
        if Y_test[i] in [item[0] for item in pred_class[:threshold]]:
            right_guess += 1
        res_str += "Pred_class:{}\nReal class:{}\n".format(pred_class[:threshold], Y_test[i])
    print("Right guess {} from {}".format(right_guess, X_test.shape[0]))
    print("Accuracy:{}".format(right_guess/X_test.shape[0]))
    
    
class cut_log_loss_scorer:
    def __call__(self, estimator, X, y):
        loss = 0
        predictions = estimator.predict_proba(X)
        pred_class = dict(zip(estimator.classes_, range(predictions.shape[1])))
        for i in range(X.shape[0]):
            if y[i] in pred_class:
                p = predictions[i, pred_class[y[i]]]
            else:
                p = 0
            loss = math.log(p + 0.0001)
        return loss/X.shape[0]
    
def get_split_indices(arr, n_splits):
    split_indices = [[] for j in range(n_splits)]
    for i in range(arr.shape[0]):
        r = random.randint(0, n_splits - 1)
        split_indices[r].append(i)
    return split_indices
    
    
class WeightedRoleMetric:
    def __init__(self, w, length):
        self.w = w
        self.null = np.array([0 for i in range(length)])
    
    def __call__(self, a, b):
        c = a[:-1] - b[:-1]
        if not np.array_equal(a[-1], b[-1]):
            c *= self.w
        return minkowski(c, self.null, 2)
    
    
class BaseLineClassifier:
    def fit(self, Y_train):
        classes_probs = sorted([(item[0], item[1]/Y_train.shape[0]) for item in Counter(Y_train).items()], 
                                 key = lambda item: item[1], reverse=True)
        self.classes_ = [item[0] for item in classes_probs]
        self.probs = [[item[1] for item in classes_probs]]
        return self
    
    def predict_proba(self, x):
        return self.probs
    
# checked    
def prepare_matrix(probs, jiras, classes):
    res_jiras = reduce(lambda x, y: x + y, [j.tolist() for j in jiras])
    res_classes = sorted(list(set(reduce(lambda x, y: x + y, [c.tolist() for c in classes]))))
    res = []
    for i in range(len(probs)):
        for j in range(probs[i].shape[0]):
            loc_res = []
            for k in range(len(res_classes)):
                if res_classes[k] in classes[i].tolist():
                    loc_res.append(probs[i][j, classes[i].tolist().index(res_classes[k])])
                else:
                    loc_res.append(-1)
            res.append(loc_res)
    return res_jiras, res_classes, np.array(res)

In [3]:
df = pandas.read_csv("data/full_JiraID_BUNumber.csv")
df.shape

(307682, 10)

In [4]:
CUR_ROLE = "Developer"
SPLIT_NUM = 10
TOP = 5
GRID_SEARCH_PARAMETERS = {"lda__n_components": [5, 10, 20, 30]}

In [5]:
developer_feature_lists = data_unpickle("data/res8/developers_feature_lists.pickle")

In [6]:
df = df[df["JiraID"].isin(developer_feature_lists.keys())]
df.shape

(72171, 10)

In [7]:
cur_df = df[df["Timeline"] == "2017-07-01 00:00:00"][df["Role"] == CUR_ROLE][["JiraID", "Role", "BUNumber"]].drop_duplicates()
cur_df.shape

  """Entry point for launching an IPython kernel.


(2281, 3)

In [8]:
sup_df = df[df["Role"] == CUR_ROLE][df["BUNumber"].isin(set(cur_df["BUNumber"]))][["JiraID", "Role", "BUNumber"]].drop_duplicates()
sup_df.shape

  """Entry point for launching an IPython kernel.


(2896, 3)

In [9]:
split_indices = get_split_indices(cur_df, SPLIT_NUM)

In [10]:
maker_XY = Maker_XY()

In [11]:
cur_X, cur_Y, cur_jiras = maker_XY(cur_df, developer_feature_lists)
print(cur_X.shape)
print(cur_Y.shape)
print(cur_jiras.shape)

(2281, 1645)
(2281,)
(2281,)


In [12]:
res_probs = []
res_real = []
res_jiras = []
res_classes = []
for i in range(SPLIT_NUM):
    test_X = cur_X[split_indices[i], :]
    test_Y = cur_Y[split_indices[i]]
    test_jiras = cur_jiras[split_indices[i]]
    
    other_index = reduce(lambda x, y: x + y, [split_indices[j] for j in range(SPLIT_NUM) if j != i])
    
    train_X = cur_X[other_index, :]
    train_Y = cur_Y[other_index]
    
    sub_sup_df = sup_df[~sup_df["JiraID"].isin(test_jiras)]
    sub_sup_X, sub_sup_Y, _ = maker_XY(sub_sup_df, developer_feature_lists)
    
    train_joined_X = np.concatenate((train_X, sub_sup_X), axis=0)
    train_joined_Y = np.concatenate((train_Y, sub_sup_Y), axis=0)
    
    pipeline = Pipeline([("lda", LDAClassifier())])
    
    cv = GridSearchCV(pipeline, param_grid=GRID_SEARCH_PARAMETERS, n_jobs=8,
                     scoring=cut_log_loss_scorer())
    
    cv.fit(train_joined_X, train_joined_Y)
    
    print("CV parameters: ", cv.best_params_)

    baseline = BaseLineClassifier()
    baseline.fit(train_joined_Y)
    
    res_probs.append(cv.predict_proba(test_X))
    res_real.append(test_Y)
    res_jiras.append(test_jiras)
    res_classes.append(cv.classes_)
    
    print("Classifier:")
    test_classification(test_X, test_Y, cv, TOP)
    
    print("Baseline:")
    test_classification(test_X, test_Y, baseline, TOP)
    
    print("-----")

print(len(res_probs))
print(len(res_jiras))



CV parameters:  {'lda__n_components': 5}
Classifier:
Right guess 13 from 217
Accuracy:0.059907834101382486
Baseline:
Right guess 11 from 217
Accuracy:0.05069124423963134
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 11 from 228
Accuracy:0.04824561403508772
Baseline:
Right guess 17 from 228
Accuracy:0.07456140350877193
-----




CV parameters:  {'lda__n_components': 5}
Classifier:
Right guess 6 from 240
Accuracy:0.025
Baseline:
Right guess 23 from 240
Accuracy:0.09583333333333334
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 6 from 214
Accuracy:0.028037383177570093
Baseline:
Right guess 15 from 214
Accuracy:0.07009345794392523
-----




CV parameters:  {'lda__n_components': 5}
Classifier:
Right guess 9 from 224
Accuracy:0.04017857142857143
Baseline:
Right guess 16 from 224
Accuracy:0.07142857142857142
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 13 from 213
Accuracy:0.06103286384976526
Baseline:
Right guess 13 from 213
Accuracy:0.06103286384976526
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 10 from 236
Accuracy:0.0423728813559322
Baseline:
Right guess 17 from 236
Accuracy:0.07203389830508475
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 12 from 259
Accuracy:0.04633204633204633
Baseline:
Right guess 19 from 259
Accuracy:0.07335907335907337
-----




CV parameters:  {'lda__n_components': 10}
Classifier:
Right guess 8 from 227
Accuracy:0.03524229074889868
Baseline:
Right guess 9 from 227
Accuracy:0.039647577092511016
-----




CV parameters:  {'lda__n_components': 5}
Classifier:
Right guess 9 from 223
Accuracy:0.04035874439461883
Baseline:
Right guess 17 from 223
Accuracy:0.07623318385650224
-----
10
10


In [13]:
fin_res_jiras, fin_res_classes, fin_res = prepare_matrix(res_probs, res_jiras, res_classes)
print(len(fin_res_jiras))
print(len(fin_res_classes))
print(fin_res.shape)

2281
335
(2281, 335)


In [14]:
data_pickle("data/res8/lda/fin_res_jiras1.pickle", fin_res_jiras)
data_pickle("data/res8/lda/fin_res_classes1.pickle", fin_res_classes)
data_pickle("data/res8/lda/fin_res1.pickle", fin_res)

In [15]:
fin_res_real = reduce(lambda x, y: x + y, [r_r.tolist() for r_r in res_real])

In [16]:
len(fin_res_real)

2281

In [17]:
data_pickle("data/res8/lda/fin_res_real1.pickle", fin_res_real)

In [18]:
fin_res_df = pandas.DataFrame.from_records(data=fin_res, index=fin_res_jiras, columns=fin_res_classes)

In [19]:
fin_res_df.to_csv("data/res8/lda/fin_res_df1.csv")

In [20]:
fin_res_df.shape

(2281, 335)