In [7]:
import pandas as pd
from datetime import datetime

In [8]:
# Directory prefix for the '<project>_metrics.csv' files. load_change_metrics_df
# builds the file path by plain string concatenation, so keep the trailing '/'.
data_path = './dataset/'

def load_change_metrics_df(cur_proj):
    """Load one project's change-metrics CSV and split it into features and labels.

    The file ``data_path + cur_proj + '_metrics.csv'`` is read, restricted to
    the project's ``author_date`` window (both bounds inclusive), ordered by
    commit date and indexed by ``commit_id``.

    Parameters
    ----------
    cur_proj : str
        Project key; ``'qt'`` and ``'openstack'`` are the supported values.

    Returns
    -------
    change_metrics : pandas.DataFrame
        Feature table indexed by ``commit_id``, rows in ascending date order,
        with NaN replaced by 0. The columns ``bugcount``, ``fixcount``,
        ``revd``, ``tcmt``, ``oexp``, ``orexp``, ``osexp``, ``osawr`` and
        ``defect`` are removed; every other column is kept.
    bug_label : pandas.Series
        Boolean ``bugcount > 0`` per commit, indexed and ordered like
        ``change_metrics``.

    Raises
    ------
    ValueError
        If ``cur_proj`` is not a supported project key.
    """
    # Inclusive (start, end) bounds applied to the raw 'author_date' values.
    # NOTE(review): these look like Unix-epoch seconds delimiting each
    # project's study period -- confirm against the dataset documentation.
    date_windows = {
        'qt': (1308350292, 1395090476),
        'openstack': (1322599384, 1393590700),
    }
    # Fail early with a clear message instead of hitting unbound start/end
    # (or a confusing missing-file error) further down.
    if cur_proj not in date_windows:
        raise ValueError(
            "unsupported project {!r}; expected one of {}".format(
                cur_proj, sorted(date_windows)
            )
        )
    start, end = date_windows[cur_proj]

    change_metrics = pd.read_csv(data_path+cur_proj+'_metrics.csv')

    change_metrics = change_metrics[(change_metrics['author_date'] >= start) &
                                    (change_metrics['author_date'] <= end)]

    # Identity check on purpose: only a real boolean True becomes 1; any other
    # value (including 1 or the string 'True') becomes 0.
    change_metrics['self'] = [1 if s is True else 0 for s in change_metrics['self']]
    change_metrics['defect'] = change_metrics['bugcount'] > 0

    # Convert to datetime only to order the rows chronologically, then keep
    # the date as a '%m-%d-%Y' string.
    # NOTE(review): datetime.fromtimestamp uses the machine's local timezone,
    # so the date strings near midnight depend on where this runs.
    change_metrics['new_date'] = change_metrics['author_date'].apply(lambda x: datetime.fromtimestamp(x))
    change_metrics = change_metrics.sort_values(by='new_date')
    change_metrics['new_date'] = change_metrics['new_date'].apply(lambda x: x.strftime('%m-%d-%Y'))

    # Divide by 3600 and then by 24 (seconds -> days, if the raw values are
    # in seconds -- TODO confirm the unit).
    change_metrics['rtime'] = (change_metrics['rtime']/3600)/24
    change_metrics['age'] = (change_metrics['age']/3600)/24

    # reset_index() keeps the pre-sort row labels as an 'index' column, which
    # therefore stays in the returned feature table.
    # NOTE(review): 'index', 'author_date' and 'new_date' are all still part
    # of the features -- confirm that is intended before fitting a model.
    change_metrics = change_metrics.reset_index()
    change_metrics = change_metrics.set_index('commit_id')

    # Capture the label before the 'defect' column is dropped below.
    bug_label = change_metrics['defect']

    # use commit_id as index (so don't remove it)
    change_metrics = change_metrics.drop(['bugcount','fixcount','revd','tcmt','oexp','orexp','osexp','osawr','defect']
                                         ,axis=1)
    change_metrics = change_metrics.fillna(value=0)

    return change_metrics, bug_label

def split_train_test_data(feature_df, label, percent_split = 70):
    """Split features and labels positionally into a leading train part and a trailing test part.

    No shuffling is done: the first ``percent_split`` percent of rows (rounded
    down) form the training set and the remaining rows form the test set.
    ``label`` is cut at the same position as ``feature_df``.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Feature rows, already in the order the split should respect.
    label : pandas.Series
        Labels aligned positionally with ``feature_df``.
    percent_split : int or float, default 70
        Percentage of rows (0..100) assigned to the training set.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``percent_split`` is not within 0..100.
    """
    if not 0 <= percent_split <= 100:
        raise ValueError(
            "percent_split must be between 0 and 100, got {!r}".format(percent_split)
        )

    # Multiply before dividing so integer percentages stay exact; the float
    # form len * (percent_split / 100) can land just below the true value
    # (100 * (29 / 100) == 28.999999999999996) and truncate one row short.
    train_len = int(len(feature_df) * percent_split // 100)

    x_train = feature_df.iloc[:train_len]
    y_train = label.iloc[:train_len]

    x_test = feature_df.iloc[train_len:]
    y_test = label.iloc[train_len:]

    return x_train, x_test, y_train, y_test


In [11]:
# Load the Qt change metrics, then cut them positionally: the leading 70% of
# rows become the training set and the remaining rows the test set.
change_metrics, bug_label = load_change_metrics_df('qt')
x_train, x_test, y_train, y_test = split_train_test_data(
    change_metrics,
    bug_label,
    percent_split=70,
)

In [13]:
# Plain-text dump of the training features produced by split_train_test_data.
print(x_train)

                                          index  author_date   la  ld  nf  nd  \
commit_id                                                                       
8aa4ee14cecaa0dbcb3513f57bdc5ba8e22fe4db  17611   1308350292   16   0   1   1   
2cf935b43e41c6589159536652412dab443ff1f8   5764   1308385331   29  16   3   1   
be6b251b7ba443875a37ad77cb5a6dce7aa3b93d  24058   1308385435  182   0  11   1   
665d8a045455214d2cfca72076da6bc75d3058c7  13093   1308407972    0  17   3   2   
451f3b3785ea5d08f5092978b6ebe17f25ef7a88   8845   1308408833  154  13   7   2   
...                                         ...          ...  ...  ..  ..  ..   
dc88c92f7beaa4bf4252e5dc4d18e328c7b9e165  27839   1367991151   61  65   2   1   
9712236e07667c453cc88b45830bb692415c57ae  19177   1367991165   22  26   2   1   
929e08a3d6bbc4ea16be69b732a3df5191e66474  18596   1367994273    8   8   1   1   
ca67b311d7723fd045508c2d244ea83064bb15a8  25548   1367997717   28  28   1   1   
649fa0b004036281327ebce78e00

In [12]:
# Plain-text dump of the test features produced by split_train_test_data.
print(x_test)

                                          index  author_date   la   ld  nf  \
commit_id                                                                    
00556b20067c3a4adf6ff33a17d2a4232fdce6ee     38   1367999716    1    1   1   
c74e4a74ba97d32df7406fb684527d415dd8a6ba  25153   1368001455   47    7   4   
5af870b2459afc9cc934d9e79e80e2e49ff75049  11656   1368001757    2    2   1   
5a83f73ac92a73c76ab7e26e60deba905dc3f64a  11593   1368002155   14   10   2   
d9d0a944495208635688ca402fe04860e81490f6  27513   1368003082   12    2   1   
...                                         ...          ...  ...  ...  ..   
04edeafade9058bde6d6dd58e6b1a89bace2fd50    657   1395077890   30    4   1   
ff3dcc49c4a1912189091e35e87cb61af2f62d47  32071   1395080509    1    0   1   
13171e7e63bc5199d783e33decf7f402019d05cc   2504   1395083885    3    0   1   
fa24ef3d721a7b94d0c5abbc6c9558e74bdb0f3d  31441   1395087964  130  130   7   
33bd1e08d0043e9b1340898039562bdf595879b5   6604   1395090475   1