# Pull Requests Dataset Preparation

## Load Libraries

In [34]:
from data_preparation import *
from datetime import datetime
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Load Dataset

In [2]:
# Paths to datasets
prs_vm1_path = '../data/prs-vm1.csv'
prs_vm2_path = '../data/prs-vm2.csv'

# Combine datasets from different sources
df_prs = pd.concat(map(pd.read_csv, [prs_vm1_path, prs_vm2_path]))
df_prs.head()

Unnamed: 0,cursor,owner,name,sshUrl,url,baseRepo,baseRef,baseRefPrefix,headRepo,headRef,...,files,createdAt,publishedAt,mergedAt,closedAt,groupId,artifactId,version,filePath,timestamp
0,Y3Vyc29yOjY=,javaparser,javaparser,ssh://git@github.com:javaparser/javaparser.git,https://github.com/javaparser/javaparser,javaparser/javaparser,master,refs/heads/,turrisxyz/javaparser,Pinned-Dependencies-GitHub,...,2,2022-06-26,2022-06-26,,,com.github.javaparser,javaparser-parent,3.24.3-SNAPSHOT,.github/workflows/create_github_release.yml,
1,Y3Vyc29yOjY=,javaparser,javaparser,ssh://git@github.com:javaparser/javaparser.git,https://github.com/javaparser/javaparser,javaparser/javaparser,master,refs/heads/,turrisxyz/javaparser,Pinned-Dependencies-GitHub,...,2,2022-06-26,2022-06-26,,,com.github.javaparser,javaparser-parent,3.24.3-SNAPSHOT,.github/workflows/maven_tests.yml,
2,Y3Vyc29yOjY=,javaparser,javaparser,ssh://git@github.com:javaparser/javaparser.git,https://github.com/javaparser/javaparser,javaparser/javaparser,master,refs/heads/,turrisxyz/javaparser,Pinned-Dependencies-GitHub,...,2,2022-06-26,2022-06-26,,,com.github.javaparser,javaparser-parent,3.24.3-SNAPSHOT,.github/workflows/create_github_release.yml,
3,Y3Vyc29yOjY=,javaparser,javaparser,ssh://git@github.com:javaparser/javaparser.git,https://github.com/javaparser/javaparser,javaparser/javaparser,master,refs/heads/,turrisxyz/javaparser,Pinned-Dependencies-GitHub,...,2,2022-06-26,2022-06-26,,,com.github.javaparser,javaparser-parent,3.24.3-SNAPSHOT,.github/workflows/maven_tests.yml,
4,Y3Vyc29yOjY=,javaparser,javaparser,ssh://git@github.com:javaparser/javaparser.git,https://github.com/javaparser/javaparser,javaparser/javaparser,master,refs/heads/,iChenLei/javaparser,patch-1,...,1,2022-06-17,2022-06-17,,,com.github.javaparser,javaparser-parent,3.24.3-SNAPSHOT,readme.md,


In [28]:
df_prs['state']

0         OPEN
1         OPEN
2         OPEN
3         OPEN
4         OPEN
          ... 
564661    OPEN
564662    OPEN
564663    OPEN
564664    OPEN
564665    OPEN
Name: state, Length: 997289, dtype: object

## Validate Data

In [3]:
rows, columns = df_prs.shape
df_prs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997289 entries, 0 to 564665
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   cursor         997289 non-null  object 
 1   owner          997289 non-null  object 
 2   name           997289 non-null  object 
 3   sshUrl         997289 non-null  object 
 4   url            997289 non-null  object 
 5   baseRepo       997289 non-null  object 
 6   baseRef        997289 non-null  object 
 7   baseRefPrefix  997289 non-null  object 
 8   headRepo       997289 non-null  object 
 9   headRef        997289 non-null  object 
 10  headRefPrefix  997289 non-null  object 
 11  title          997289 non-null  object 
 12  number         997289 non-null  int64  
 13  state          997289 non-null  object 
 14  draft          997289 non-null  bool   
 15  files          997289 non-null  int64  
 16  createdAt      997289 non-null  object 
 17  publishedAt    997289 non-nul

All columns seem to have non-null values.

### Sanity Checks

**Validate `sshUrl` column:** Verify that all SSH URLs are correctly build and match the expected repository owner/name.

In [4]:
ssh_url_check = validate_urls('sshUrl', df_prs)
assert ssh_url_check, 'sshUrl check'

Records have right scheme: True
Records have right authority: True
Records have right path: True


**Validate `url` column:** Verify that all HTTP URLs are correctly build and match the expected repository owner/name.

In [5]:
http_url_check = validate_urls('url', df_prs)
assert http_url_check, 'url check'

Records have right scheme: True
Records have right authority: True
Records have right path: True


**Validate `baseRepo` column:** Verify that all head repository values follow the format "owner/name" and match the expected repository owner/name values.

In [23]:
base_repo_check = validate_repo_name('baseRepo', df_prs)
assert base_repo_check, 'baseRepo check'

Records have right repository name: True


**Validate `headRepo` column:** Verify that all head repository values follow the format "owner/name".

In [26]:
head_repo_check = validate_repo_name_format('headRepo', df_prs)
assert head_repo_check, 'headRepo check'

Records have right repository name: True


**Validate `number` column:** Verify that all records are greater than 0.

In [None]:
number_check = validate_limit_num(1, 'number', df_prs)
assert number_check, 'number check'

**Validate `state` column:** Veridy that all pull requests are open.

In [30]:
state_check = validate_value('OPEN', 'state', df_prs)
assert state_check

Records have value OPEN: True


**Validate `files` column:** Verify that all records are greater than 0.

In [37]:
files_check = validate_limit_num(1, 'files', df_prs)
assert files_check, 'files check'

Records have at least 1: True


**Validate `createdAt` column:** Verify that all dates have the expected format "yyyy-MM-dd".

In [44]:
last_created_date = datetime(2022, 4, 22)
created_at_check = validate_date('createdAt', df_prs)
created_at_limit_check = validate_limit_date(last_created_date, 'createdAt', df_prs)
assert created_at_check and created_at_limit_check, 'createdAt check'

Records have right format: True
Records appear after 2022-04-22 00:00:00: False
                  cursor    owner     name  \
311606  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311607  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311608  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311609  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311610  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
...                  ...      ...      ...   
311895  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311896  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311897  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311898  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   
311899  Y3Vyc29yOjE1Mw==  xuxueli  xxl-job   

                                          sshUrl  \
311606  ssh://git@github.com:xuxueli/xxl-job.git   
311607  ssh://git@github.com:xuxueli/xxl-job.git   
311608  ssh://git@github.com:xuxueli/xxl-job.git   
311609  ssh://git@github.com:xuxueli/xxl-job.git   
311610  ssh://git@github.com:xuxueli/xxl-job.git   
...                                          ...   
311

AssertionError: createdAt check

**Validate `pushedAt` column:** Verify that all dates have the expected format "yyyy-MM-dd" and appear after a limit date.

In [40]:
last_pushed_date = datetime(2022, 4, 22)
pushed_at_format_check = validate_date('pushedAt', df_prs)
pushed_at_limit_check = validate_limit_date(last_pushed_date, 'pushedAt', df_prs)
assert pushed_at_format_check and pushed_at_limit_check, 'pushedAt check'

KeyError: 'pushedAt'

## Clean Dataset

In [31]:
# Remove non java files
# Shall we discard draft pull requests?

## Store Cleaned Dataset