### Read Pickled PR Data

* If the folder `Data/cleaned_PRs` isn't present, then extract `cleaned_PRs_2.zip` into `Data/`

In [2]:
import os
import string

import nltk
import numpy as np
import pandas as pd

In [3]:
os.listdir('Data/cleaned_PRs')

['AE9RB-ruby-redis_pr.pk1',
 'AlburIvan-SlickForm_pr.pk1',
 'AlexLiuSheng-CheckVersionLib_pr.pk1',
 'AllenDowney-ThinkJavaCode_pr.pk1',
 'Aufree-ESTMusicIndicator_pr.pk1',
 'Azure-azure-storage-fuse_pr.pk1',
 'BMIRDS-deepslide_pr.pk1',
 'BetterCloud-vault-java-driver_pr.pk1',
 'BurntSushi-quickcheck_pr.pk1',
 'BycorSanchez-resources_pr.pk1',
 'CMU-Perceptual-Computing-Lab-openpose_pr.pk1',
 'CVCalendar-CVCalendar_pr.pk1',
 'CaoZ-JD-Coin_pr.pk1',
 'ChrisXu-CXAlertView_pr.pk1',
 'ClickSimply-Nano-SQL_pr.pk1',
 'Coalfire-Research-Slackor_pr.pk1',
 'CollabCodeTech-forum-do-front-ao-end_pr.pk1',
 'CraneStation-wasmtime_pr.pk1',
 'CreateJS-EaselJS_pr.pk1',
 'Crizstian-cinema-microservice_pr.pk1',
 'Cue-hop_pr.pk1',
 'DHTMLX-message_pr.pk1',
 'Deadcows-MyBox_pr.pk1',
 'DexterInd-GoPiGo_pr.pk1',
 'EricSteinberger-PokerRL_pr.pk1',
 'Facepunch-garrysmod_pr.pk1',
 'FactoryBoy-factory_boy_pr.pk1',
 'FlingOS-FlingOS_pr.pk1',
 'FlowingMedia-TimeFlow_pr.pk1',
 'FooBarWidget-default_value_for_pr.pk1',

In [4]:
# Read all pk1 PR files from Data/cleaned_PRs
pickled_dirs = []
repo_names = []
for file in os.listdir('Data/cleaned_PRs'):
    try:
        pickled_dirs.append('Data/cleaned_PRs/'+file)
        repo_names.append(file.rstrip('_pr.pk1'))
    except:
        print('NO PULL REQUEST DATA IN: ', file)
        continue

print('\nFile Paths\n', pickled_dirs[:20])
print('\nRepo Names\n', repo_names[:20])


File Paths
 ['Data/cleaned_PRs/AE9RB-ruby-redis_pr.pk1', 'Data/cleaned_PRs/AlburIvan-SlickForm_pr.pk1', 'Data/cleaned_PRs/AlexLiuSheng-CheckVersionLib_pr.pk1', 'Data/cleaned_PRs/AllenDowney-ThinkJavaCode_pr.pk1', 'Data/cleaned_PRs/Aufree-ESTMusicIndicator_pr.pk1', 'Data/cleaned_PRs/Azure-azure-storage-fuse_pr.pk1', 'Data/cleaned_PRs/BMIRDS-deepslide_pr.pk1', 'Data/cleaned_PRs/BetterCloud-vault-java-driver_pr.pk1', 'Data/cleaned_PRs/BurntSushi-quickcheck_pr.pk1', 'Data/cleaned_PRs/BycorSanchez-resources_pr.pk1', 'Data/cleaned_PRs/CMU-Perceptual-Computing-Lab-openpose_pr.pk1', 'Data/cleaned_PRs/CVCalendar-CVCalendar_pr.pk1', 'Data/cleaned_PRs/CaoZ-JD-Coin_pr.pk1', 'Data/cleaned_PRs/ChrisXu-CXAlertView_pr.pk1', 'Data/cleaned_PRs/ClickSimply-Nano-SQL_pr.pk1', 'Data/cleaned_PRs/Coalfire-Research-Slackor_pr.pk1', 'Data/cleaned_PRs/CollabCodeTech-forum-do-front-ao-end_pr.pk1', 'Data/cleaned_PRs/CraneStation-wasmtime_pr.pk1', 'Data/cleaned_PRs/CreateJS-EaselJS_pr.pk1', 'Data/cleaned_PRs/Crizs

In [5]:
# Unpickle PR files
df_list = [pd.read_pickle(pickled_df) for pickled_df in pickled_dirs]

In [6]:
# View two PR DataFrames
df_list[6].head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,company,files,mergedBy
0,COLLABORATOR,…a single method.,2016-01-17 21:21:05+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 21:20:38+00:00,/BetterCloud/vault-java-driver/pull/1,MERGED,"Modifies REST client to use per-verb methods, ...",2016-01-17 21:21:05+00:00,steve-perkins,@BetterCloud,3,steve-perkins-bc
1,COLLABORATOR,,2016-01-17 23:40:39+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 23:40:20+00:00,/BetterCloud/vault-java-driver/pull/2,MERGED,Adds stubs for 'put' and 'delete' ops in REST ...,2016-01-17 23:40:39+00:00,steve-perkins,@BetterCloud,6,steve-perkins-bc
2,CONTRIBUTOR,The withRetries() method is on the Vault class...,2016-03-01 16:37:31+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:31:45+00:00,/BetterCloud/vault-java-driver/pull/3,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:37+00:00,happybob007,,1,steve-perkins-bc
3,CONTRIBUTOR,Missed a spot in my prior PR. :(,2016-03-01 16:37:46+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:35:43+00:00,/BetterCloud/vault-java-driver/pull/4,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:51+00:00,happybob007,,1,steve-perkins-bc
4,CONTRIBUTOR,Fixes #8,2016-03-20 01:07:26+00:00,"{'totalCount': 0, 'nodes': []}",2016-03-19 18:40:03+00:00,/BetterCloud/vault-java-driver/pull/9,MERGED,Keys with null values cause UnsupportedOperati...,2016-03-20 01:07:26+00:00,kunickiaj,@streamsets,2,steve-perkins


---------------
### Collect Text Data & Tokenize

In [7]:
# Columns of text data
text_cols = ['bodyText', 'comments', 'title']

# Sample from df_list
sample_df = df_list[6]

sample_df[text_cols].head()

Unnamed: 0,bodyText,comments,title
0,…a single method.,"{'totalCount': 0, 'nodes': []}","Modifies REST client to use per-verb methods, ..."
1,,"{'totalCount': 0, 'nodes': []}",Adds stubs for 'put' and 'delete' ops in REST ...
2,The withRetries() method is on the Vault class...,"{'totalCount': 1, 'nodes': [{'author': {'login...",Fix position of withRetries() in call chain
3,Missed a spot in my prior PR. :(,"{'totalCount': 1, 'nodes': [{'author': {'login...",Fix position of withRetries() in call chain
4,Fixes #8,"{'totalCount': 0, 'nodes': []}",Keys with null values cause UnsupportedOperati...


In [8]:
# Check NLTK's tokenization
bodyText_obs = sample_df['bodyText'].values.tolist()
bodyText_tokens = []

for obs in bodyText_obs:
    # Remove punctuation
    for punct in string.punctuation:
        obs = obs.replace(punct, '')

    sent = nltk.sent_tokenize(obs)
    
    for words in sent:
        bodyText_tokens.append(nltk.word_tokenize(words))

In [9]:
print(bodyText_tokens[0], bodyText_tokens[-1])

['…a', 'single', 'method'] ['Unclear', 'if', 'this', 'is', 'wanted', 'but', 'here', 'is', 'my', 'suggestion', 'of', 'a', 'zerodep', 'idiomatic', 'proxy', 'support', 'addition', 'to', 'your', 'API', 'NotesRationale', 'Use', 'of', 'things', 'like', 'proxy', 'properties', 'is', 'kind', 'of', 'bad', 'People', 'who', 'need', 'to', 'use', 'a', 'proxy', 'to', 'access', 'Vault', 'may', 'not', 'want', 'the', 'proxy', 'configuration', 'globally', 'set', 'for', 'the', 'JDK', 'Similarly', 'the', 'current', 'API', 'behaviour', 'of', 'implicitly', 'relying', 'on', 'the', 'ProxySelectorgetDefault', 'through', 'URLopenConnection', 'is', 'rather', 'evil', 'as', 'this', 'means', 'that', 'the', 'only', 'way', 'to', 'control', 'the', 'proxy', 'requires', 'race', 'conditions', 'with', 'other', 'JDK', 'code', 'nevermind', 'that', 'in', 'some', 'cases', 'the', 'JRE', 'permissions', 'may', 'have', 'blocked', 'setProxySelector', 'from', 'the', 'application', 'The', 'JREs', 'Proxy', 'class', 'does', 'not', 'enc

--------------------
### Analyze PR `bodyText` using NLTK's `vader.SentimentIntensityAnalyzer`

For future reference, instead of using Vader, a NaiveBayesClassifier could be trained and used for disparate types of text data.

NLTK docs on both methods: http://www.nltk.org/howto/sentiment.html

Code tutorial for both: https://opensourceforu.com/2016/12/analysing-sentiments-nltk/

In [10]:
# Perform S.A. on bodyText_tokens
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/joseph/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
def get_unpacked_comments(df):
    '''df is the dataframe you want to extract from. Column_name is what the new column will be named'''
    comment_list = []
    column_list = []

    for thing in df.comments:
        if (thing['totalCount'] != 0):
            for com in thing['nodes']:
                comment_list.append(com['bodyText'])
            column_list.append(comment_list)
            # Reset comments after each row
            comment_list = []
        else:
            column_list.append([])
            
    return column_list

def get_bodyText(df):
    bodyText_obs = df['bodyText'].values.tolist()
    bodyText_tokens = []
    bodyText_sentiment = []
    
    no_sentiment_data = {'neg': np.NaN, 'neu': np.NaN, 'pos': np.NaN, 'compound': np.NaN}
    sid = SentimentIntensityAnalyzer()

    # Vader expects a single string, not a list of tokenized strings
    for obs in bodyText_obs:
        # Remove punctuation
        for punct in string.punctuation:
            obs = obs.replace(punct, '')

        # Tokenize bodyText
        sent = nltk.sent_tokenize(obs)
        bodyText_tokens.append(sent)

        # If there's text: calcualte polarity score, else append NaNs
        if len(sent) > 0:
            polarity_score = sid.polarity_scores(sent[0])
            bodyText_sentiment.append(polarity_score)
        else:
            bodyText_sentiment.append(no_sentiment_data)
            
    # Convert bodyText_sent to pandas df
    sent_df = pd.DataFrame(bodyText_sentiment)

    # Add sentiment columns
    # text_cols = ['bodyText', 'comments', 'title']
    # sentiment_df = df[text_cols]
    sentiment_df = df.copy()
    sentiment_df['bodyText_compound'] = sent_df['compound']
    sentiment_df['bodyText_pos'] = sent_df['pos']
    sentiment_df['bodyText_neu'] = sent_df['neu']
    sentiment_df['bodyText_neg'] = sent_df['neg']

    # Unpack comments
    sentiment_df['comments_unpacked'] = get_unpacked_comments(df)
    sentiment_df['comments_unpacked'] = sentiment_df['comments_unpacked'].str.join(' ')
#     sentiment_df = sentiment_df[['bodyText', 'bodyText_compound', 
#                          'bodyText_pos','bodyText_neu', 
#                          'bodyText_neg', 'comments', 
#                          'comments_unpacked']]

    return sentiment_df

`sample_df` is from `df_list[6]` so the name of the repo should be the equivalent index in `repo_names`.

In [12]:
repo_names[6]

'BMIRDS-deepslide'

In [13]:
deepslide_df = get_bodyText(sample_df)

-----------
### Analyze PR `comments` Sentiment

1. Make a column containing list of comments per PR
2. Calculate sentiment for concatenated comments per PR
3. Append a column of a single sentiment value for each comment list per PR

In [14]:
deepslide_df.head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,company,files,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked
0,COLLABORATOR,…a single method.,2016-01-17 21:21:05+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 21:20:38+00:00,/BetterCloud/vault-java-driver/pull/1,MERGED,"Modifies REST client to use per-verb methods, ...",2016-01-17 21:21:05+00:00,steve-perkins,@BetterCloud,3,steve-perkins-bc,0.0,0.0,1.0,0.0,
1,COLLABORATOR,,2016-01-17 23:40:39+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 23:40:20+00:00,/BetterCloud/vault-java-driver/pull/2,MERGED,Adds stubs for 'put' and 'delete' ops in REST ...,2016-01-17 23:40:39+00:00,steve-perkins,@BetterCloud,6,steve-perkins-bc,,,,,
2,CONTRIBUTOR,The withRetries() method is on the Vault class...,2016-03-01 16:37:31+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:31:45+00:00,/BetterCloud/vault-java-driver/pull/3,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:37+00:00,happybob007,,1,steve-perkins-bc,0.0,0.0,1.0,0.0,Thanks!
3,CONTRIBUTOR,Missed a spot in my prior PR. :(,2016-03-01 16:37:46+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:35:43+00:00,/BetterCloud/vault-java-driver/pull/4,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:51+00:00,happybob007,,1,steve-perkins-bc,-0.296,0.0,0.694,0.306,Thanks!
4,CONTRIBUTOR,Fixes #8,2016-03-20 01:07:26+00:00,"{'totalCount': 0, 'nodes': []}",2016-03-19 18:40:03+00:00,/BetterCloud/vault-java-driver/pull/9,MERGED,Keys with null values cause UnsupportedOperati...,2016-03-20 01:07:26+00:00,kunickiaj,@streamsets,2,steve-perkins,0.0,0.0,1.0,0.0,


In [25]:
def score_comments(df):
    """ Given a DataFrame with `unpacked_comments`, return a df with sentiment scores for comments."""
    commentText_list = df['comments_unpacked'].values.tolist()
    commentText_sentiment = []
    no_sentiment_data = {'neg': np.NaN, 'neu': np.NaN, 'pos': np.NaN, 'compound': np.NaN}
    
    sid = SentimentIntensityAnalyzer()

    for obs in commentText_list:
        if len(obs) == 0:
            commentText_sentiment.append(no_sentiment_data)
        else:
            # Converts list of comments to a single `str` comment
            combined_text = ''.join(obs)

            # Remove punctuation
            combined_text = combined_text.replace('\n', '')
            for punct in string.punctuation:
                combined_text = combined_text.replace(punct, '')

            # Tokenize bodyText: this function returns a single-item list
            # This will catch comments that are only punctuation/empty and append NaNs
            try:
                sent = nltk.sent_tokenize(combined_text)[0]
                polarity_score = sid.polarity_scores(sent)
                commentText_sentiment.append(polarity_score)
            except:
                commentText_sentiment.append(no_sentiment_data)

            del combined_text
    
    sent_df = pd.DataFrame(commentText_sentiment)
    df['comments_compound'] = sent_df['compound']
    df['comments_pos'] = sent_df['pos']
    df['comments_neu'] = sent_df['neu']
    df['comments_neg'] = sent_df['neg']
    
    return df

In [16]:
deepslide_df = score_comments(deepslide_df)
deepslide_df.head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,...,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked,comments_compound,comments_pos,comments_neu,comments_neg
0,COLLABORATOR,…a single method.,2016-01-17 21:21:05+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 21:20:38+00:00,/BetterCloud/vault-java-driver/pull/1,MERGED,"Modifies REST client to use per-verb methods, ...",2016-01-17 21:21:05+00:00,steve-perkins,...,steve-perkins-bc,0.0,0.0,1.0,0.0,,,,,
1,COLLABORATOR,,2016-01-17 23:40:39+00:00,"{'totalCount': 0, 'nodes': []}",2016-01-17 23:40:20+00:00,/BetterCloud/vault-java-driver/pull/2,MERGED,Adds stubs for 'put' and 'delete' ops in REST ...,2016-01-17 23:40:39+00:00,steve-perkins,...,steve-perkins-bc,,,,,,,,,
2,CONTRIBUTOR,The withRetries() method is on the Vault class...,2016-03-01 16:37:31+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:31:45+00:00,/BetterCloud/vault-java-driver/pull/3,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:37+00:00,happybob007,...,steve-perkins-bc,0.0,0.0,1.0,0.0,Thanks!,0.4404,1.0,0.0,0.0
3,CONTRIBUTOR,Missed a spot in my prior PR. :(,2016-03-01 16:37:46+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2016-03-01 04:35:43+00:00,/BetterCloud/vault-java-driver/pull/4,MERGED,Fix position of withRetries() in call chain,2016-03-01 16:37:51+00:00,happybob007,...,steve-perkins-bc,-0.296,0.0,0.694,0.306,Thanks!,0.4404,1.0,0.0,0.0
4,CONTRIBUTOR,Fixes #8,2016-03-20 01:07:26+00:00,"{'totalCount': 0, 'nodes': []}",2016-03-19 18:40:03+00:00,/BetterCloud/vault-java-driver/pull/9,MERGED,Keys with null values cause UnsupportedOperati...,2016-03-20 01:07:26+00:00,kunickiaj,...,steve-perkins,0.0,0.0,1.0,0.0,,,,,


------------
### Perform Sentiment Analysis for All DataFrames

In [26]:
nlp_list = []
for df in df_list:
    df2 = df.copy()
    df2 = get_bodyText(df2)
    df2 = score_comments(df2)
    nlp_list.append(df2)
print(len(nlp_list), len(df_list))

575 575


In [23]:
nlp_list[6].head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,...,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked,comments_compound,comments_pos,comments_neu,comments_neg
0,CONTRIBUTOR,Allows for items to do shearing actions on she...,2012-02-14 21:15:24+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2012-02-10 16:21:34+00:00,/MinecraftForge/MinecraftForge/pull/1,CLOSED,Add a shears handler,2014-06-27 09:11:47+00:00,richardg867,...,,0.3612,0.167,0.76,0.072,Implemented in a more generic fashion f2b973e,0.0,0.0,1.0,0.0
1,CONTRIBUTOR,"Split ModLoader's ""modtick"" client profiler se...",2012-06-06 01:49:47+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2012-02-22 19:34:18+00:00,/MinecraftForge/MinecraftForge/pull/2,CLOSED,ModLoader profiling improvements,2014-06-14 14:25:04+00:00,richardg867,...,,0.0258,0.04,0.908,0.053,"I would find this useful yes, but I fear most ...",-0.3099,0.098,0.798,0.104
2,NONE,...t.,2012-02-24 20:45:41+00:00,"{'totalCount': 0, 'nodes': []}",2012-02-24 20:26:46+00:00,/MinecraftForge/MinecraftForge/pull/3,MERGED,People may think the server needs the mods to ...,2014-07-09 09:13:01+00:00,,...,MinecraftForge,0.0,0.0,0.0,0.0,,,,,
3,NONE,Fix of a small spelling error of 'frequency',2012-02-25 23:28:15+00:00,"{'totalCount': 0, 'nodes': []}",2012-02-25 18:50:08+00:00,/MinecraftForge/MinecraftForge/pull/4,MERGED,Fix spelling error,2014-08-15 22:02:56+00:00,,...,MinecraftForge,-0.4019,0.0,0.69,0.31,,,,,
4,CONTRIBUTOR,IChunkLoadHandler with register functions call...,2012-03-09 06:49:23+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2012-02-27 07:08:05+00:00,/MinecraftForge/MinecraftForge/pull/5,CLOSED,Add chunkload hooks.,2014-06-15 02:05:08+00:00,Chicken-Bones,...,,0.0258,0.068,0.932,0.0,Added,0.0,0.0,1.0,0.0


In [27]:
import pickle

with open('PR_nlp_analysis.pkl', 'wb') as f:
    pickle.dump(nlp_list, f)