In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

In [3]:
file_path=r'C:\Users\bhavrang\Documents\Analytics vidhya\GlobalDataChallenge-II\global_data_science_challenge_2_public-master/data/gdsc2_public.csv'
df=pd.read_csv(file_path, sep=';', parse_dates=['timestamp'])
df=df.sort_values(by=['work_item','timestamp'])

In [4]:
df['from_phase'].fillna('Start',inplace=True)
df['to_phase'].fillna('End',inplace=True)
df.components.fillna('', inplace=True)

In [5]:
start_times=df[df['from_phase']=='Start'][['work_item','timestamp']]
end_times=df[df['to_phase']=='End'][['work_item','timestamp']]
times=pd.merge(start_times,end_times,on='work_item',how='left')
times['duration']=times['timestamp_y']-times['timestamp_x']
times['duration_in_days'] = times['duration'].apply(lambda x: round(x.total_seconds() / (24*3600), 2))
times.rename(columns={'timestamp_x': 'start', 'timestamp_y': 'end'}, inplace=True)
times.head()

Unnamed: 0,work_item,start,end,duration,duration_in_days
0,WI_000001,2015-01-02 14:39:14,2015-01-27 11:36:51,24 days 20:57:37,24.87
1,WI_000002,2015-01-02 15:04:20,2015-01-14 09:46:37,11 days 18:42:17,11.78
2,WI_000003,2015-01-02 15:28:22,2015-02-26 11:50:37,54 days 20:22:15,54.85
3,WI_000004,2015-01-02 15:33:54,2015-01-28 09:11:05,25 days 17:37:11,25.73
4,WI_000005,2015-01-02 16:32:11,2015-02-04 12:57:49,32 days 20:25:38,32.85


In [6]:
open_per_day=times.resample('D',on='start')['work_item'].count().rename('open_tickets_per_day')
is_closed = times['end'].notnull()
closed_per_day = times.loc[is_closed].resample('D', on='end')['work_item'].count().rename('closed_tickets_per_day')
tickets_df = (pd.concat([open_per_day, closed_per_day], axis=1) # Join the two dataframes
                .fillna(0)                                      # Replace NaNs by 0 for those days when no tickets are opened or closed
                .astype(int)                                    # While we're at it, all counts are integers
                .reset_index()                                  # timestamp_x is used as index, move it back to a column
                .rename(columns={'start': 'date'})        # and rename it to ‘date’
             )
tickets_df['open_tickets_total'] = tickets_df['open_tickets_per_day'].cumsum()
tickets_df['closed_tickets_total'] = tickets_df['closed_tickets_per_day'].cumsum()
tickets_df['wip_tickets_total'] = tickets_df['open_tickets_total'] - tickets_df['closed_tickets_total']

In [7]:
closed_times = times[times['duration'].notnull()]
open_times = times[times['duration'].isnull()]

In [8]:
sep_date_str = '01.01.2018'
sep_date = dt.datetime.strptime(sep_date_str, '%d.%m.%Y')
train_times = closed_times[closed_times.end <= sep_date]
test_times = closed_times[(closed_times.end > sep_date) & (closed_times.start <= sep_date)]

In [9]:
train_work_items = set(train_times.work_item)  
test_work_items = set(test_times.work_item)  
df_start_only = df[df.from_phase == 'Start']
train_df = df_start_only[df_start_only['work_item'].isin(train_work_items)]  
test_df = df_start_only[df_start_only['work_item'].isin(test_work_items)]

In [10]:
def rmsle(actuals: pd.DataFrame, predictions: pd.DataFrame) -> float:
    """
    Computes the root mean square log error between the actuals and predictions.
    Raises and error if there are multiple predictions for a single work item, or if there are missing predictions
    :param actuals: A DataFrame with the columns 'work_item' and 'duration_in_days'
    :param predictions: A DataFrame with the columns 'work_item' and 'predictions'
    :return: RMSLE between actuals and predictions
    """
    assert len(actuals) == len(predictions)
    assert set(actuals.work_item.values) == set(predictions.work_item.values)
    actuals_values = actuals.duration_in_days.values
    predictions_values = predictions.predictions.values
    rmsle = np.sqrt(sum(((np.log(actuals_values + 1) - np.log(predictions_values + 1)) ** 2)) / len(actuals_values))
    return rmsle

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [12]:
###KNeighbours regression######

In [13]:
## onehot encoding the features
cols_to_encode=['work_type','work_priority','domain','platform','components']
onehotencoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
train_features = onehotencoder.fit_transform(train_df[cols_to_encode])
test_features = onehotencoder.transform(test_df[cols_to_encode])
##applying knn algorithm
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(train_features, train_times.duration_in_days)
#calculating rmsle value
train_predictions = pd.DataFrame(train_times.work_item)  
train_predictions['predictions'] = knn.predict(train_features)
test_predictions = pd.DataFrame(test_times.work_item)
test_predictions['predictions'] = knn.predict(test_features)
print(rmsle(train_times, train_predictions))
print(rmsle(test_times, test_predictions))

1.670509639375778
1.4284558887117986


In [14]:
# All work items that are finished become our training data, all open work items our test data
closed_work_items = set(closed_times.work_item)  
open_work_items = set(open_times.work_item)  
closed_df = df_start_only[df_start_only.work_item.isin(closed_work_items)]  
open_df = df_start_only[df_start_only.work_item.isin(open_work_items)]

# Compute the features
onehotencoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
cols_to_encode = ['work_type', 'work_priority','domain', 'platform','components']
closed_features = onehotencoder.fit_transform(closed_df[cols_to_encode])
open_features = onehotencoder.transform(open_df[cols_to_encode])

model = KNeighborsRegressor()
model.fit(closed_features, closed_times.duration_in_days)
open_predictions = pd.DataFrame(open_times.work_item)
open_predictions['predictions'] = model.predict(open_features)

with open('open_predictions_knn_wt_wp1.csv', 'w') as f:
    open_predictions.to_csv(f, index=False)

In [15]:
#######linear regression#############

In [16]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(closed_features,closed_times.duration_in_days)
predictions = lm.predict(open_features)

In [17]:
open_predictions = pd.DataFrame(open_times.work_item)
open_predictions['predictions'] = lm.predict(open_features)
with open('open_predictions_lr_wt_wp1.csv', 'w') as f:
    open_predictions.to_csv(f, index=False)

In [18]:
def eval_model(model, train_features, train_times, test_features, test_times):
    model.fit(train_features, train_times.duration_in_days)
    train_predictions = pd.DataFrame(train_times.work_item)  
    train_predictions['predictions'] = model.predict(train_features)
    test_predictions = pd.DataFrame(test_times.work_item)
    test_predictions['predictions'] = model.predict(test_features)
    train_rmsle = rmsle(train_times, train_predictions)
    test_rmsle = rmsle(test_times, test_predictions)
    return train_rmsle, test_rmsle

In [19]:
def orig_model(model, closed_features, closed_times, open_features, open_times):
    model.fit(closed_features, closed_times.duration_in_days)
    closed_predictions = pd.DataFrame(closed_times.work_item)  
    closed_predictions['predictions'] = model.predict(closed_features)
    open_predictions = pd.DataFrame(open_times.work_item)
    open_predictions['predictions'] = model.predict(open_features)
    closed_rmsle = rmsle(closed_times, closed_predictions)
    open_rmsle = rmsle(open_times, open_predictions)
    with open('open_predictions_dtr.csv', 'w') as f:
        open_predictions.to_csv(f, index=False)
    return closed_rmsle, open_rmsle

In [20]:
model = XGBRegressor()
eval_model(model, train_features, train_times, test_features, test_times)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


(1.764577435636904, 1.250806711915415)

In [21]:
print(model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)


In [22]:
orig_model(model, closed_features, closed_times, open_features, open_times)

(1.8160385734183933, nan)

In [23]:
model = DecisionTreeRegressor(max_depth=5)
eval_model(model, train_features, train_times, test_features, test_times)
#orig_model(model, closed_features, closed_times, open_features, open_times)

(1.8105012372931064, 1.2973108103859685)

In [24]:
newdf=pr_df[pr_df['work_item'].isin(closed_work_items)]
newtestdf=pr_df[pr_df['work_item'].isin(open_work_items)]

NameError: name 'pr_df' is not defined

In [None]:
pr_df=pd.get_dummies(df,columns=['work_type','work_priority','domain','platform','components','to_phase','to_resource'])

In [None]:
len(closed_work_items) #10522
len(open_work_items) #1042

In [None]:
resdf=newdf.drop(['timestamp','from_phase','from_resource'],axis=1)
tresdf=newtestdf.drop(['timestamp','from_phase','from_resource'],axis=1)
resdf=resdf.groupby('work_item').sum()
tresdf=tresdf.groupby('work_item').sum()

In [None]:
resdf=pd.merge(resdf,closed_times,on='work_item',how='left')
resdf.drop(['start','end','duration'],axis=1,inplace=True)

In [None]:
tresdf=pd.merge(tresdf,open_times,on='work_item',how='left')
tresdf.drop(['start','end','duration'],axis=1,inplace=True)

In [None]:
x_train=resdf.drop('duration_in_days',axis=1)
y_train=resdf['duration_in_days']
x_test=tresdf.drop('duration_in_days',axis=1)

In [None]:
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(x_train.drop(['work_item'],axis=1), y_train)
#calculating rmsle value
#y_test = pd.DataFrame(tresdf.work_item)  
#y_test['predictions'] = knn.predict(x_test.drop(['work_item'],axis=1))

In [None]:
y_test = pd.DataFrame(tresdf.work_item)  
y_test['predictions'] = knn.predict(x_test.drop(['work_item'],axis=1))

In [None]:
lm = LinearRegression()
lm.fit(x_train.drop(['work_item'],axis=1), y_train)


In [None]:
model = XGBRegressor()
model.fit(x_train.drop(['work_item'],axis=1), y_train)

In [None]:
y_test = pd.DataFrame(tresdf.work_item)  
y_test['predictions'] =lm.predict(x_test.drop(['work_item'],axis=1))
y_test.to_csv('new_xgb.csv',index=False)

In [51]:
import sys
import imp

In [48]:
import source_func 

In [52]:
imp.reload(source_func)

<module 'source_func' from 'C:\\Users\\bhavrang\\Documents\\Analytics vidhya\\GlobalDataChallenge-II\\source_func.py'>

In [53]:
dir(source_func)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'func']

In [54]:
print(source_func.func())

inside function
3
