### Read Pickled PR Data

In [1]:
import os
import pickle
import string

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import numpy as np
import pandas as pd

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/joseph/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Report how many entries the cleaned PR directory contains
pr_file_count = len(os.listdir('Data/cleaned_PRs'))
print('File count of Data/cleaned_PRs: ', pr_file_count, '\n')

File count of Data/cleaned_PRs:  750 



In [3]:
# Collect the path and the repo name of every file in Data/cleaned_PRs.
# Both lists are filled in the same loop, so index i of `pickled_dirs`
# and index i of `repo_names` always describe the same file.
PR_SUFFIX = '_pr.pk1'

pickled_dirs = []
repo_names = []
for file in os.listdir('Data/cleaned_PRs'):
    pickled_dirs.append(os.path.join('Data/cleaned_PRs', file))

    # Remove the exact suffix only. str.rstrip('_pr.pk1') treats its
    # argument as a *set of characters* and would also eat trailing
    # 'p', 'r', 'k', '1', '_' or '.' that belong to the repo name itself
    # (e.g. 'foo-parser_pr.pk1' would become 'foo-parse').
    if file.endswith(PR_SUFFIX):
        repo_names.append(file[:-len(PR_SUFFIX)])
    else:
        repo_names.append(file)

print('File path sample: ', pickled_dirs[0])
print('Repo name sample: ', repo_names[0])

File path sample:  Data/cleaned_PRs/AE9RB-ruby-redis_pr.pk1
Repo name sample:  AE9RB-ruby-redis


In [4]:
# Unpickle PR files: one object per path in pickled_dirs, in the same order
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
df_list = [pd.read_pickle(pickled_df) for pickled_df in pickled_dirs]

In [5]:
# Preview the first rows of the PR DataFrame stored at index 8 of df_list
df_list[8].head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,company,files,mergedBy
0,CONTRIBUTOR,Since i got used to just setting up ENV variab...,2011-08-05 23:01:47+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-05 17:00:03+00:00,/AssetSync/asset_sync/pull/1,CLOSED,Parse YAML file with ERB,2014-08-10 15:56:31+00:00,kyriacos,,1,
1,NONE,"Added setting of Cache-control header (1 year,...",2011-08-22 15:51:29+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-15 14:23:33+00:00,/AssetSync/asset_sync/pull/2,CLOSED,Added cache-control header,2014-07-28 17:56:21+00:00,arvida,"Oktavilla, http://oktavilla.se",1,
2,CONTRIBUTOR,[BUGFIX] Add support for 'existing_remote_file...,2011-08-24 15:24:55+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-24 02:03:26+00:00,/AssetSync/asset_sync/pull/4,MERGED,[BUGFIX] Add support for 'existing_remote_file...,2014-07-22 16:39:48+00:00,jsmestad,Senior Software Architect @ Keyp GmbH,2,hamstarr
3,CONTRIBUTOR,This should fix Issue #6. Try it out and let m...,2011-10-15 14:09:44+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-10-15 04:14:50+00:00,/AssetSync/asset_sync/pull/7,MERGED,Rails 3.1.1 Compatability,2011-10-15 14:18:31+00:00,hone,Heroku,3,davidjrice
4,NONE,This modifications are tested and used to work...,2011-11-05 23:12:57+00:00,"{'totalCount': 6, 'nodes': [{'author': {'login...",2011-10-20 18:19:52+00:00,/AssetSync/asset_sync/pull/11,CLOSED,Allow to use on Engine Yard server,2014-06-14 19:45:07+00:00,edison,TudoPass,14,


---------------
### Collect Text Data & Tokenize

In [6]:
# Columns of the PR DataFrames that hold text data
text_cols = ['bodyText', 'comments', 'title']

# Use the DataFrame at index 8 of df_list as the sample
sample_df = df_list[8]

# Preview only the text columns of the sample
sample_df[text_cols].head()

Unnamed: 0,bodyText,comments,title
0,Since i got used to just setting up ENV variab...,"{'totalCount': 2, 'nodes': [{'author': {'login...",Parse YAML file with ERB
1,"Added setting of Cache-control header (1 year,...","{'totalCount': 2, 'nodes': [{'author': {'login...",Added cache-control header
2,[BUGFIX] Add support for 'existing_remote_file...,"{'totalCount': 1, 'nodes': [{'author': {'login...",[BUGFIX] Add support for 'existing_remote_file...
3,This should fix Issue #6. Try it out and let m...,"{'totalCount': 2, 'nodes': [{'author': {'login...",Rails 3.1.1 Compatability
4,This modifications are tested and used to work...,"{'totalCount': 6, 'nodes': [{'author': {'login...",Allow to use on Engine Yard server


In [7]:
def tokenize_text(series: pd.Series):
    """Clean every text entry of a Series and return the cleaned strings.

    For each value, all characters in ``string.punctuation`` are removed and
    newlines are replaced by spaces. The text is then split with
    ``nltk.sent_tokenize`` and the pieces are re-joined with single spaces.

    Despite the function's name, each item of the result is one cleaned
    *string* per entry, not a list of word tokens.

    Parameters
    ----------
    series : pd.Series
        Series of text values. Non-string values (such as None or NaN) are
        treated as having no text.

    Returns
    -------
    list of str
        One cleaned string per entry of ``series``, in the same order. Entries
        without text yield ``''``.
    """
    # Built once: translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    text_tokens = []

    for obs in series.values.tolist():
        # Missing or non-text values cannot be cleaned; record them as empty
        if not isinstance(obs, str):
            text_tokens.append('')
            continue

        # Remove punctuation and flatten line breaks
        obs = obs.translate(strip_punct).replace('\n', ' ')

        # Split into sentences and concatenate; an empty split joins to ''
        sent = nltk.sent_tokenize(obs)
        text_tokens.append(' '.join(sent))

    return text_tokens

In [8]:
# Print the cleaned bodyText strings of the sample DataFrame (one string per row)
print(tokenize_text(sample_df['bodyText']))



--------------------
### Analyze PR `bodyText` using NLTK's `vader.SentimentIntensityAnalyzer`

For future reference, instead of using Vader, a NaiveBayesClassifier could be trained and used for disparate types of text data.

NLTK docs on both methods: http://www.nltk.org/howto/sentiment.html

Code tutorial for both: https://opensourceforu.com/2016/12/analysing-sentiments-nltk/

In [9]:
def get_unpacked_comments(df):
    '''Collect the comment texts of every row in ``df.comments``.

    Each entry of ``df.comments`` is read as a mapping with a 'totalCount'
    key and, when that count is non-zero, a 'nodes' key holding mappings
    that each carry a 'bodyText'.

    Returns a list with one item per row: the list of 'bodyText' values of
    that row's nodes, or an empty list when 'totalCount' is 0.
    '''
    def _bodies(comment_info):
        # A zero count means there is nothing to unpack for this row
        if comment_info['totalCount'] == 0:
            return []
        return [node['bodyText'] for node in comment_info['nodes']]

    return [_bodies(comment_info) for comment_info in df.comments]

def get_bodyText(df):
    """Return a copy of ``df`` with bodyText sentiment and joined comments.

    The ``bodyText`` column is cleaned with ``tokenize_text`` and each
    non-empty result is scored with VADER's ``SentimentIntensityAnalyzer``.
    Rows without text get NaN scores.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``bodyText`` column and a ``comments`` column in the
        layout expected by ``get_unpacked_comments``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (the input is not modified) with these new columns:
        ``bodyText_compound``, ``bodyText_pos``, ``bodyText_neu``,
        ``bodyText_neg`` and ``comments_unpacked`` (each row's comment texts
        joined with single spaces).
    """
    score_keys = ['neg', 'neu', 'pos', 'compound']
    # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0
    no_sentiment_data = dict.fromkeys(score_keys, np.nan)

    sid = SentimentIntensityAnalyzer()

    # If there's text: calculate polarity score, else use NaNs
    bodyText_sentiment = [
        sid.polarity_scores(bodyText) if len(bodyText) > 0 else no_sentiment_data
        for bodyText in tokenize_text(df['bodyText'])
    ]

    # Explicit columns so the score columns exist even if nothing was scored
    sent_df = pd.DataFrame(bodyText_sentiment, columns=score_keys)

    # Add sentiment columns. Scores are assigned by position (.values), not
    # by index label: sent_df has a fresh 0..n-1 index, so label alignment
    # would misplace or drop scores whenever df's index is not 0..n-1.
    sentiment_df = df.copy()
    for key in ('compound', 'pos', 'neu', 'neg'):
        sentiment_df['bodyText_' + key] = sent_df[key].values

    # Unpack comments, then collapse each row's list into one string
    sentiment_df['comments_unpacked'] = get_unpacked_comments(df)
    sentiment_df['comments_unpacked'] = sentiment_df['comments_unpacked'].str.join(' ')

    return sentiment_df

`sample_df` is from `df_list[8]`, so the name of the repo should be at the equivalent index in `repo_names`.

In [10]:
# Look up the repo name at index 8, the same index sample_df was taken from
repo_names[8]

'AssetSync_asset_sync'

In [11]:
# Score the bodyText of the sample DataFrame and unpack its comments
assetsync_df = get_bodyText(sample_df)

-----------
### Analyze PR `comments` Sentiment

1. Make a column containing list of comments per PR
2. Calculate sentiment for concatenated comments per PR
3. Append a column of a single sentiment value for each comment list per PR

In [12]:
# Preview the sample after the bodyText_* and comments_unpacked columns were added
assetsync_df.head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,company,files,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked
0,CONTRIBUTOR,Since i got used to just setting up ENV variab...,2011-08-05 23:01:47+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-05 17:00:03+00:00,/AssetSync/asset_sync/pull/1,CLOSED,Parse YAML file with ERB,2014-08-10 15:56:31+00:00,kyriacos,,1,,0.6808,0.131,0.869,0.0,"Cheers, merged it into the repo on github. Cha..."
1,NONE,"Added setting of Cache-control header (1 year,...",2011-08-22 15:51:29+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-15 14:23:33+00:00,/AssetSync/asset_sync/pull/2,CLOSED,Added cache-control header,2014-07-28 17:56:21+00:00,arvida,"Oktavilla, http://oktavilla.se",1,,0.5859,0.138,0.862,0.0,Thanks for this. We'll be pushing out a 0.1.0 ...
2,CONTRIBUTOR,[BUGFIX] Add support for 'existing_remote_file...,2011-08-24 15:24:55+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-24 02:03:26+00:00,/AssetSync/asset_sync/pull/4,MERGED,[BUGFIX] Add support for 'existing_remote_file...,2014-07-22 16:39:48+00:00,jsmestad,Senior Software Architect @ Keyp GmbH,2,hamstarr,0.4019,0.252,0.748,0.0,Thanks dude!
3,CONTRIBUTOR,This should fix Issue #6. Try it out and let m...,2011-10-15 14:09:44+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-10-15 04:14:50+00:00,/AssetSync/asset_sync/pull/7,MERGED,Rails 3.1.1 Compatability,2011-10-15 14:18:31+00:00,hone,Heroku,3,davidjrice,0.0,0.0,1.0,0.0,Have added it to a test rails 3.1.1 app. Looks...
4,NONE,This modifications are tested and used to work...,2011-11-05 23:12:57+00:00,"{'totalCount': 6, 'nodes': [{'author': {'login...",2011-10-20 18:19:52+00:00,/AssetSync/asset_sync/pull/11,CLOSED,Allow to use on Engine Yard server,2014-06-14 19:45:07+00:00,edison,TudoPass,14,,0.128,0.021,0.951,0.028,"Hey Edison, thanks for this. Would be happy to..."


In [13]:
def score_comments(df):
    """Add VADER sentiment scores for the ``comments_unpacked`` column.

    The column is cleaned with ``tokenize_text`` and each non-empty result is
    scored with ``SentimentIntensityAnalyzer``. Rows without comment text, and
    rows whose scoring raises, get NaN scores.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``comments_unpacked`` column of text.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, modified in place, with the new columns
        ``comments_compound``, ``comments_pos``, ``comments_neu`` and
        ``comments_neg``.

    Also prints the positional indices of the rows that had no comment text.
    """
    score_keys = ['neg', 'neu', 'pos', 'compound']
    # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0
    no_sentiment_data = dict.fromkeys(score_keys, np.nan)
    empty_comments = []
    commentText_sentiment = []

    sid = SentimentIntensityAnalyzer()

    for ind, text in enumerate(tokenize_text(df['comments_unpacked'])):
        if len(text) == 0:
            empty_comments.append(ind)
            commentText_sentiment.append(no_sentiment_data)
            continue

        try:
            polarity_score = sid.polarity_scores(text)
        except Exception:
            # Best effort: keep going with NaNs, but do not swallow
            # KeyboardInterrupt / SystemExit the way a bare `except:` would
            polarity_score = no_sentiment_data
            print('Could not calculate Polarity score for: ', text)
            print('Appended NaNs')
        commentText_sentiment.append(polarity_score)

    # Explicit columns so the score columns exist even if nothing was scored
    sent_df = pd.DataFrame(commentText_sentiment, columns=score_keys)

    # Assign by position (.values), not by index label: sent_df has a fresh
    # 0..n-1 index, so label alignment would misplace or drop scores whenever
    # df's index is not 0..n-1.
    for key in ('compound', 'pos', 'neu', 'neg'):
        df['comments_' + key] = sent_df[key].values

    print('No comments at indices: ', empty_comments)

    return df

In [14]:
# Add the comments_* sentiment columns to the sample; score_comments also
# prints the positions of rows that had no comment text
assetsync_df = score_comments(assetsync_df)

No comments at indices:  [5, 10, 12, 15, 17, 21, 32, 33, 39, 46, 49, 50, 54, 55, 57, 58, 62, 65, 69, 72, 76, 78, 79, 80, 82, 86, 91, 93, 95, 97, 98, 99, 100, 101, 102, 103, 104, 109, 142, 144, 152]


In [15]:
# Preview the sample with both bodyText_* and comments_* sentiment columns
assetsync_df.head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,...,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked,comments_compound,comments_pos,comments_neu,comments_neg
0,CONTRIBUTOR,Since i got used to just setting up ENV variab...,2011-08-05 23:01:47+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-05 17:00:03+00:00,/AssetSync/asset_sync/pull/1,CLOSED,Parse YAML file with ERB,2014-08-10 15:56:31+00:00,kyriacos,...,,0.6808,0.131,0.869,0.0,"Cheers, merged it into the repo on github. Cha...",0.9686,0.082,0.918,0.0
1,NONE,"Added setting of Cache-control header (1 year,...",2011-08-22 15:51:29+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-15 14:23:33+00:00,/AssetSync/asset_sync/pull/2,CLOSED,Added cache-control header,2014-07-28 17:56:21+00:00,arvida,...,,0.5859,0.138,0.862,0.0,Thanks for this. We'll be pushing out a 0.1.0 ...,0.8442,0.453,0.547,0.0
2,CONTRIBUTOR,[BUGFIX] Add support for 'existing_remote_file...,2011-08-24 15:24:55+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-24 02:03:26+00:00,/AssetSync/asset_sync/pull/4,MERGED,[BUGFIX] Add support for 'existing_remote_file...,2014-07-22 16:39:48+00:00,jsmestad,...,hamstarr,0.4019,0.252,0.748,0.0,Thanks dude!,0.4404,0.744,0.256,0.0
3,CONTRIBUTOR,This should fix Issue #6. Try it out and let m...,2011-10-15 14:09:44+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-10-15 04:14:50+00:00,/AssetSync/asset_sync/pull/7,MERGED,Rails 3.1.1 Compatability,2011-10-15 14:18:31+00:00,hone,...,davidjrice,0.0,0.0,1.0,0.0,Have added it to a test rails 3.1.1 app. Looks...,0.7906,0.178,0.822,0.0
4,NONE,This modifications are tested and used to work...,2011-11-05 23:12:57+00:00,"{'totalCount': 6, 'nodes': [{'author': {'login...",2011-10-20 18:19:52+00:00,/AssetSync/asset_sync/pull/11,CLOSED,Allow to use on Engine Yard server,2014-06-14 19:45:07+00:00,edison,...,,0.128,0.021,0.951,0.028,"Hey Edison, thanks for this. Would be happy to...",0.9945,0.145,0.82,0.036


------------
### Perform Sentiment Analysis for All DataFrames

In [16]:
# Run both scoring steps on a copy of every PR DataFrame, keeping the
# results in the same order as df_list
nlp_list = []
for df in df_list:
    df_copy = score_comments(get_bodyText(df.copy()))
    nlp_list.append(df_copy)

# Both counts should match: one scored frame per input frame
print(len(nlp_list), len(df_list))

No comments at indices:  [0]
No comments at indices:  [0, 3, 4, 6, 7, 9]
No comments at indices:  [2, 4, 7, 12, 16, 17, 22, 23, 25, 27, 28, 33, 37, 39, 48, 49, 53, 54, 56, 57, 59, 60, 61, 68, 69, 71, 79, 81, 82, 83, 90, 91, 93, 94, 95, 98, 99, 101, 104, 105, 106, 107, 108]
No comments at indices:  [2, 3, 4, 6, 9, 12]
No comments at indices:  [0, 2]
No comments at indices:  [0, 2]
No comments at indices:  [0, 2, 3, 5, 8]
No comments at indices:  [4]
No comments at indices:  [5, 10, 12, 15, 17, 21, 32, 33, 39, 46, 49, 50, 54, 55, 57, 58, 62, 65, 69, 72, 76, 78, 79, 80, 82, 86, 91, 93, 95, 97, 98, 99, 100, 101, 102, 103, 104, 109, 142, 144, 152]
No comments at indices:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 24, 26, 27, 28, 29, 31, 33, 36, 37, 38, 39, 40, 42, 45, 46, 48, 51, 52, 55, 57, 60, 61, 64, 67, 68, 70, 71, 72, 73, 77, 78, 79, 80, 82, 83, 86, 87, 88, 89, 90, 91, 96, 97, 99, 104, 107, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]
No comments at indices:  [26,

In [17]:
# Preview the scored DataFrame at index 8 (the index used for the sample above)
nlp_list[8].head()

Unnamed: 0,authorAssociation,bodyText,closedAt,comments,createdAt,resourcePath,state,title,updatedAt,author,...,mergedBy,bodyText_compound,bodyText_pos,bodyText_neu,bodyText_neg,comments_unpacked,comments_compound,comments_pos,comments_neu,comments_neg
0,CONTRIBUTOR,Since i got used to just setting up ENV variab...,2011-08-05 23:01:47+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-05 17:00:03+00:00,/AssetSync/asset_sync/pull/1,CLOSED,Parse YAML file with ERB,2014-08-10 15:56:31+00:00,kyriacos,...,,0.6808,0.131,0.869,0.0,"Cheers, merged it into the repo on github. Cha...",0.9686,0.082,0.918,0.0
1,NONE,"Added setting of Cache-control header (1 year,...",2011-08-22 15:51:29+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-08-15 14:23:33+00:00,/AssetSync/asset_sync/pull/2,CLOSED,Added cache-control header,2014-07-28 17:56:21+00:00,arvida,...,,0.5859,0.138,0.862,0.0,Thanks for this. We'll be pushing out a 0.1.0 ...,0.8442,0.453,0.547,0.0
2,CONTRIBUTOR,[BUGFIX] Add support for 'existing_remote_file...,2011-08-24 15:24:55+00:00,"{'totalCount': 1, 'nodes': [{'author': {'login...",2011-08-24 02:03:26+00:00,/AssetSync/asset_sync/pull/4,MERGED,[BUGFIX] Add support for 'existing_remote_file...,2014-07-22 16:39:48+00:00,jsmestad,...,hamstarr,0.4019,0.252,0.748,0.0,Thanks dude!,0.4404,0.744,0.256,0.0
3,CONTRIBUTOR,This should fix Issue #6. Try it out and let m...,2011-10-15 14:09:44+00:00,"{'totalCount': 2, 'nodes': [{'author': {'login...",2011-10-15 04:14:50+00:00,/AssetSync/asset_sync/pull/7,MERGED,Rails 3.1.1 Compatability,2011-10-15 14:18:31+00:00,hone,...,davidjrice,0.0,0.0,1.0,0.0,Have added it to a test rails 3.1.1 app. Looks...,0.7906,0.178,0.822,0.0
4,NONE,This modifications are tested and used to work...,2011-11-05 23:12:57+00:00,"{'totalCount': 6, 'nodes': [{'author': {'login...",2011-10-20 18:19:52+00:00,/AssetSync/asset_sync/pull/11,CLOSED,Allow to use on Engine Yard server,2014-06-14 19:45:07+00:00,edison,...,,0.128,0.021,0.951,0.028,"Hey Edison, thanks for this. Would be happy to...",0.9945,0.145,0.82,0.036


In [18]:
# Persist the whole list of scored DataFrames to a single pickle file
with open('PR_nlp_analysis.pkl', 'wb') as f:
    pickle.dump(nlp_list, f)