In [1]:
import pandas as pd
import json 
import re
import numpy as np

In [2]:
# Load the metrics table and display its number of rows.
metrics_csv_path = '../data/2d-03_10_2018.csv'
df_metrics = pd.read_csv(metrics_csv_path)
len(df_metrics)

516

In [3]:
# Load the nested {log name: {team: {counter: value}}} results and flatten them
# into one row per (log name, team).
with open('../data/logs_exracted.json') as f:
    a = json.load(f)
df_logs = pd.DataFrame.from_dict(
    {(log_name, team): stats
     for log_name, teams in a.items()
     for team, stats in teams.items()},
    orient='index').reset_index()

# Counters missing for a team are treated as zero.
df_logs = df_logs.fillna(0)

# Per-game ratios, computed on whole columns instead of row by row with apply().
games = df_logs['games']
df_logs['win_diff'] = (df_logs['win'] - df_logs['loss']) / games
df_logs['win_rate'] = df_logs['win'] / games
df_logs['goals_diff'] = (df_logs['S'] - df_logs['R']).astype(float) / games
df_logs['loss_rate'] = df_logs['loss'] / games

# Prefix every numeric column with 'results_'.
df_logs = df_logs.rename({k: 'results_' + k for k in df_logs.select_dtypes(include=np.number)}, axis=1)

# Split the log name (level_0) into a competition name and a trailing year.
# Raw string: '\-' and '\_' are not valid Python string escapes.
two_groups = r'(?P<competition>[a-zA-Z\-\_]+)(?P<year>[0-9]+)'
df_logs = pd.concat([df_logs, df_logs['level_0'].str.extract(two_groups, expand=True)], axis=1)
df_logs['year'] = df_logs['year'].astype('int64')
df_logs = df_logs.drop(['level_0'], axis=1)
df_logs = df_logs.rename({'level_1': 'team'}, axis=1)

df_logs.tail()

Unnamed: 0,team,results_loss,results_S,results_R,results_games,results_tie,results_win,results_win_diff,results_win_rate,results_goals_diff,results_loss_rate,competition,year
1288,nexus2d,4.0,8,14,6,0.0,2.0,-0.333333,0.333333,-1.0,0.666667,WorldCup,2017
1289,oxsy,0.0,19,3,6,0.0,6.0,1.0,1.0,2.666667,0.0,WorldCup,2017
1290,persiangulf2017,2.0,6,12,6,2.0,2.0,0.0,0.333333,-1.0,0.333333,WorldCup,2017
1291,wit,6.0,0,22,6,0.0,0.0,-1.0,0.0,-3.666667,1.0,WorldCup,2017
1292,ziziphus,4.0,4,18,6,1.0,1.0,-0.5,0.166667,-2.333333,0.666667,WorldCup,2017


In [4]:
# Trim leading/trailing '_' and '-' characters from every team name.
df_logs['team'] = df_logs['team'].map(lambda team_name: team_name.strip('_-'))

In [5]:
# Fold 'JapanAutumnCamp' into 'AutumnCamp'.
# regex=False makes this a literal substring replacement regardless of the
# default value of `regex` in the installed pandas version.
df_logs['competition'] = df_logs['competition'].str.replace('JapanAutumnCamp', 'AutumnCamp', regex=False)

In [7]:
# Checkpoint the flattened log results (written before the team names are
# lower-cased and the rows aggregated in the following cells).
logs_csv_path = '../data/logs_extracted.csv'
df_logs.to_csv(logs_csv_path)

In [8]:
# Normalise team names to lower case.
df_logs = df_logs.assign(team=df_logs['team'].str.lower())

In [9]:
# Collapse rows that share (year, competition, team) after the name
# normalisation by summing their counters.
df_logs = df_logs.groupby(['year','competition','team']).sum().reset_index()

# sum() also added up the per-game ratio columns of merged rows, which does not
# give a valid ratio; recompute them from the summed counters. Rows that were
# not merged keep the same values.
games = df_logs['results_games']
df_logs['results_win_diff'] = (df_logs['results_win'] - df_logs['results_loss']) / games
df_logs['results_win_rate'] = df_logs['results_win'] / games
df_logs['results_goals_diff'] = (df_logs['results_S'] - df_logs['results_R']) / games
df_logs['results_loss_rate'] = df_logs['results_loss'] / games

In [10]:
# Number of rows after aggregation.
len(df_logs)

1291

In [11]:
# Discard rows whose team name has fewer than three characters.
name_too_short = df_logs['team'].str.len() < 3
df_logs = df_logs[~name_too_short]

#### Load the manual mapping between log team names and program groups

In [12]:
# Read the manual team -> group mapping, keeping only the join columns,
# and print a summary of the frame.
mapping_columns = ['competition', 'year', 'team_manual', 'group_name']
df_mapped = pd.read_csv('../data/group_names_mapped.csv', usecols=mapping_columns)
df_mapped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 4 columns):
competition    528 non-null object
group_name     528 non-null object
year           528 non-null int64
team_manual    60 non-null object
dtypes: int64(1), object(3)
memory usage: 16.6+ KB


In [13]:
# Attach the manually mapped group name (if any) to each log row.
df_merged_mapping = df_logs.merge(
    df_mapped,
    left_on=['competition', 'year', 'team'],
    right_on=['competition', 'year', 'team_manual'],
    how='left',
)
# team_alt: the mapped group name where one exists, otherwise the team name itself.
# Vectorized replacement for the former row-wise apply(); also works on an empty frame.
df_merged_mapping['team_alt'] = df_merged_mapping['group_name'].fillna(df_merged_mapping['team'])

In [14]:
# Number of rows after the left merge with the manual mapping.
len(df_merged_mapping)

1257

In [15]:
# group_name has been folded into team_alt; remove it from the frame.
df_merged_mapping = df_merged_mapping.drop(columns=['group_name'])

#### Number of manually linked programs:

In [16]:
# Row count of the merged mapping frame.
len(df_merged_mapping)

1257

#### Number of groups not linked

### Automatically linking groups to logs

In [17]:
# join logic between team name in the logs and group name in the program repository folder
def is_in_group(x):
    """Check whether a row's team name occurs inside its group name.

    Parameters
    ----------
    x : row-like object with string attributes ``team_alt`` and ``group_name``.

    Returns
    -------
    tuple (exist, length)
        exist  -- True when the leading run of letters/hyphens of ``team_alt``
                  is a case-insensitive substring of ``group_name``.
        length -- length of the leading run of letters/digits/hyphens of
                  ``team_alt``.
        ``(False, 0)`` is returned when ``team_alt`` does not start with a
        letter or hyphen, so every call yields a 2-tuple and
        ``apply(..., result_type='expand')`` always gets two columns.
    """
    alpha_prefix = re.match(r'([a-zA-Z\-]+)', x.team_alt)
    if alpha_prefix is None:
        return False, 0
    # Cannot be None here: this character class is a superset of the one above.
    alnum_prefix = re.match(r'([a-zA-Z\-0-9]+)', x.team_alt)
    exist = alpha_prefix.group(1).lower() in x.group_name.lower()
    return exist, len(alnum_prefix.group(1))

In [18]:
# handles cases where the team name appears twice in different formats
def len_in_group_dup(x):
    """Score a row whose team occurs more than once.

    Returns True when ``x.duplicate_team`` is below 2, False when ``x.team_alt``
    does not start with a letter, digit or hyphen, the length of that leading
    run when it is a case-insensitive substring of ``x.group_name``, and None
    otherwise.
    """
    if x.duplicate_team < 2:
        return True
    match = re.match(r'([a-zA-Z\-0-9]+)', x.team_alt)
    if match is None:
        return False
    prefix = match.group(1)
    return len(prefix) if prefix.lower() in x.group_name.lower() else None

In [19]:
# Inner-join metrics rows and log rows that share competition and year,
# producing every candidate (group, team) pairing.
df_mix = pd.merge(df_metrics, df_merged_mapping, on=['competition', 'year'], how='inner')


In [20]:
# Evaluate is_in_group on every candidate pairing; its two results become the
# 'exist' and 'length' columns.
match_flags = df_mix.apply(is_in_group, axis=1, result_type='expand')
match_flags = match_flags.rename({0: 'exist', 1: 'length'}, axis=1)
df_mix = df_mix.join(match_flags)


In [21]:
# Keep only the pairings flagged as matching; copy so later column
# assignments do not touch df_mix.
matched_rows = df_mix['exist'] == True
df = df_mix[matched_rows].copy()

In [22]:
# 'max' = largest 'length' within each (competition, year, group_name).
# The string alias 'max' is used instead of the builtin max, which recent
# pandas versions deprecate as a transform callable.
df.loc[:,'max'] = df.groupby(['competition','year','group_name'])['length'].transform('max')

In [23]:
# Per group, keep only the rows whose 'length' equals the group maximum.
is_longest_match = df['length'] == df['max']
df = df[is_longest_match]

In [24]:
# For every row, the number of non-null Current_group_location values in its
# (competition, year, group_name) group.
locations_by_group = df.groupby(['competition', 'year', 'group_name'])['Current_group_location']
g = locations_by_group.transform(lambda locations: locations.count())

In [25]:
# Row count of the metrics table, for comparison with the merge results.
len(df_metrics)

516

In [None]:
# Left-join the matched pairings back onto the metrics table; the '_merge'
# indicator column records whether a metrics row found a match.
df1 = pd.merge(df_metrics, df, on=['competition', 'year', 'group_name'],
               how='left', indicator=True)
# Number of rows present on both sides.
len(df1[df1['_merge'] == 'both'])

In [52]:
# Export the merge status of every metrics row, sorted by year and group name.
report_columns = ['competition', 'group_name', 'year', 'team_manual', '_merge', 'is_source_x']
group_report = df1[report_columns].sort_values(['year', 'group_name'])
group_report.to_csv('../checkpoints/not_found_group_names.csv')


In [30]:
# Left-join the matched pairings onto the log table, so log rows without a
# matching group keep nulls in the joined columns.
df2 = pd.merge(df_logs, df, on=['competition', 'year', 'team'], how='left')

In [31]:
# Non-null count for each column of the log table.
df_logs.notnull().sum()

year                  1251
competition           1251
team                  1251
results_loss          1251
results_S             1251
results_R             1251
results_games         1251
results_tie           1251
results_win           1251
results_win_diff      1251
results_win_rate      1251
results_goals_diff    1251
results_loss_rate     1251
dtype: int64

In [34]:
# Export the log rows that received no group name from the merge.
team_key_columns = ['competition', 'year', 'team']
unmatched_teams = df2.loc[df2['group_name'].isnull(), team_key_columns]
unmatched_teams.sort_values(team_key_columns).to_csv('../checkpoints/not_found_team.csv')


In [35]:
# Scratch check: apply the letters/digits/hyphen prefix pattern to a sample team name.
sample_team = 'oxblue08'
r = re.match(r'([a-zA-Z\-0-9]+)', sample_team)

In [36]:
# Lower-cased text of the whole match.
r.group().lower()

'oxblue08'

In [37]:
# Lower-cased form of a sample group name, for comparison with the match above.
sample_group = 'OxBlue_rc08_release'
sample_group.lower()

'oxblue_rc08_release'

In [38]:
# Persist the matched (group, team) rows.
merged_csv_path = '../data/merged-logs_repo.csv'
df.to_csv(merged_csv_path)

In [None]:
# NOTE(review): `df_cp` is not assigned in any cell visible above, so this cell
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm or remove.
df_cp

In [51]:
# Metrics rows flagged is_source_x that found no match in the logs.
is_source_row = df1['is_source_x'] == True
only_in_metrics = df1['_merge'] == 'left_only'
df1[is_source_row & only_in_metrics]

Unnamed: 0,Current_group_location_x,Exe_files_x,IF4_x,IF4_per_module_x,IF4_visible_x,IF4_visible_per_module_x,McCabes_cyclomatic_complexity_x,McCabes_cyclomatic_complexity_per_line_of_comment_x,McCabes_cyclomatic_complexity_per_module_x,Number_exe_files_x,...,results_win_diff,results_win_rate,results_goals_diff,results_loss_rate,team_manual,team_alt,exist,length,max,_merge
27,2009_PRIMA_BKB_PRIMA2009,/home/se-metrics/root/2d-proccesed//2009_PRIMA...,1.0,0.013,1.0,0.013,2368.0,0.711,31.158,3,...,,,,,,,,,,left_only
55,2000_WorldCup_BS2K_Melbourne2000_CodeRelease,,558.0,5.264,558.0,5.264,4646.0,1.238,43.83,0,...,,,,,,,,,,left_only
58,2001_WorldCup_FuzzyFoo-2001-source,,,,,,,,,0,...,,,,,,,,,,left_only
68,1998_WorldCup_andhill98,,0.0,0.0,0.0,0.0,821.0,2.783,821.0,0,...,,,,,,,,,,left_only
85,2012_WorldCup_marlik,/home/se-metrics/root/2d-proccesed//2012_World...,518.0,4.544,518.0,4.544,6590.0,0.882,57.807,2,...,,,,,,,,,,left_only
90,1996_WorldCup_sekine_client,,0.0,0.0,0.0,0.0,969.0,2.893,969.0,0,...,,,,,,,,,,left_only
97,2000_WorldCup_RC2000-oz,/home/se-metrics/root/2d-proccesed//2000_World...,,,,,,,,117,...,,,,,,,,,,left_only
147,1998_WorldCup_ath97,,435.0,15.0,435.0,15.0,1166.0,1.565,40.207,0,...,,,,,,,,,,left_only
244,2007_WorldCup_kotug,,,,,,,,,0,...,,,,,,,,,,left_only
253,1997_WorldCup_andhill_RoboCup97,,0.0,0.0,0.0,0.0,331.0,2.149,331.0,0,...,,,,,,,,,,left_only


In [None]:
# Same filter as the previous cell: is_source_x rows with no match in the logs.
source_without_match = (df1['is_source_x'] == True) & (df1['_merge'] == 'left_only')
df1.loc[source_without_match]