# Sentiment analysis of open-source software communities

This Jupyter notebook includes the data preparation and analysis
for our project exploring open-source software communities.

**Code last updated**: 6 November 2018

***

## Table of contents

* [Preliminaries](#Preliminaries)
* [Data preparation](#Data-preparation)

***

## Preliminaries

### Load libraries and functions

In [None]:
import os, glob, string

In [None]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
from utils import annotate

***

## Data preparation

Cycle through all GitHub project files to clean the data and prepare the datasets needed for analysis.
For a complete list of the downloaded variables and the newly created variables, see the `metadata.md` file.

In [None]:
# list all projects: every visible sub-directory of the raw-data folder is
# treated as one project. Plain files and hidden entries (e.g. `.DS_Store`,
# `.ipynb_checkpoints`) are skipped so they are not mistaken for projects,
# and the names are sorted because `os.listdir` returns them in arbitrary order.
raw_data_dir = '../../data/raw_data'
project_list = sorted(
    entry for entry in os.listdir(raw_data_dir)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(raw_data_dir, entry))
)

In [None]:
# load in the lists needed
bot_list = pd.read_csv('../bot_names.txt')['bot_name']
gratitude_list = set(pd.read_csv('./utils/gratitude.txt')['expressions_of_gratitude'])

# input and output locations
raw_data_dir = '../../data/raw_data'
processed_data_dir = '../../data/processed_data'
histogram_dir = '../../figures/emotion_histograms'

# make sure the output folders exist before anything is written to them
os.makedirs(processed_data_dir, exist_ok=True)
os.makedirs(histogram_dir, exist_ok=True)

# use the same histogram settings for every figure (set once, outside the loop)
bin_number = 50
fig_dpi = 150
y_label_text = 'Density'
density_choice = True
alpha_level = .5


def save_emotion_histogram(df, body_type, project, out_path):
    """Save overlapping emotion histograms for one dataframe to `out_path`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `negative_emotion`, `positive_emotion`
        and `neutral_emotion`.
    body_type : str
        Word inserted into the figure title ('comment' or 'issue').
    project : str
        Project name shown in the figure title.
    out_path : str
        File the figure is written to.
    """
    fig = plt.figure()
    try:
        # NOTE: each series is given a bin *count*, so matplotlib derives the
        # bin edges from that series' own value range; the bin widths of the
        # three overlaid histograms can therefore differ.
        for column, color in (('negative_emotion', 'r'),
                              ('positive_emotion', 'g'),
                              ('neutral_emotion', 'grey')):
            plt.hist(df[column], bin_number, density=density_choice,
                     facecolor=color, alpha=alpha_level)
        plt.title('Histogram of emotion proportions in '+body_type+' bodies\nfor '+project)
        plt.xlabel('Proportion of emotion words to total words')
        plt.ylabel(y_label_text)
        plt.grid(True)
        plt.savefig(out_path, dpi=fig_dpi)
    finally:
        # always release the figure, even if plotting or saving failed
        plt.close(fig)


# cycle through all projects
for project in project_list:

    # read in the next comments and issues files
    project_dir = os.path.join(raw_data_dir, project)
    temp_comments = pd.read_csv(os.path.join(project_dir, 'comments.tsv'),
                                sep='\t', index_col=0).sort_index()
    temp_issues = pd.read_csv(os.path.join(project_dir, 'issues.tsv'),
                              sep='\t', index_col=0).sort_index()

    # append the current project to each
    temp_comments['project'] = project
    temp_issues['project'] = project

    # annotate each file
    temp_comments, temp_issues = annotate.annotate_logs(temp_comments,
                                                        temp_issues)

    # drop columns we don't need
    temp_comments = temp_comments.drop(columns=['node_id', 'created_at',
                                                'updated_at', 'author_id'])
    temp_issues = temp_issues.drop(columns=['node_id', 'organization',
                                            'author_id', 'locked'])

    # clean up the text body
    temp_comments = annotate.body_cleanup(temp_comments, bot_list)
    temp_issues = annotate.body_cleanup(temp_issues, bot_list)

    # run sentiment analysis
    temp_comments = annotate.add_sentiment(temp_comments)
    temp_issues = annotate.add_sentiment(temp_issues)

    # add gratitude info
    temp_comments = annotate.add_gratitude(temp_comments, gratitude_list)
    temp_issues = annotate.add_gratitude(temp_issues, gratitude_list)

    # save cleaned data to intermediary folders
    temp_comments.to_csv(os.path.join(processed_data_dir,
                                      project+'-processed-comments.csv'),
                         index=False, header=True)
    temp_issues.to_csv(os.path.join(processed_data_dir,
                                    project+'-processed-issues.csv'),
                       index=False, header=True)

    # save the comment emotion histogram
    save_emotion_histogram(temp_comments, 'comment', project,
                           os.path.join(histogram_dir,
                                        project+'-comment_body.png'))

    # save the issue emotion histogram
    save_emotion_histogram(temp_issues, 'issue', project,
                           os.path.join(histogram_dir,
                                        project+'-issue_body.png'))

In [None]:
# concatenate all into master file
def concat_processed_files(pattern):
    """Read every CSV matching the glob `pattern` and stack them row-wise.

    Files are read in sorted path order so the result is reproducible.
    Returns an empty DataFrame when nothing matches (`pd.concat` would
    otherwise raise on an empty list).
    """
    frames = [pd.read_csv(path) for path in sorted(glob.glob(pattern))]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False)


# one master frame per file type, built from the per-project files written
# to the processed-data folder
comments_df = concat_processed_files('../../data/processed_data/*-processed-comments.csv')
issues_df = concat_processed_files('../../data/processed_data/*-processed-issues.csv')

***

## Code testing ground

### Data preparation

In [None]:
# single project used for the step-by-step checks in this section
project = "mayavi"

In [None]:
# read this project's raw comments table (tab-separated, first column used
# as the index) and order the rows by that index
comments_path = '../../data/raw_data/' + project + '/comments.tsv'
temp_comments = pd.read_csv(comments_path, sep='\t', index_col=0)
temp_comments = temp_comments.sort_index()

In [None]:
# read this project's raw issues table (tab-separated, first column used
# as the index) and order the rows by that index
issues_path = '../../data/raw_data/' + project + '/issues.tsv'
temp_issues = pd.read_csv(issues_path, sep='\t', index_col=0)
temp_issues = temp_issues.sort_index()

### Annotate the files with new columns

In [None]:
# pass both tables through the project-local `annotate.annotate_logs` helper;
# it receives the comments and issues together and returns the updated pair
temp_comments, temp_issues = annotate.annotate_logs(temp_comments,temp_issues)

### Remove unnecessary columns

In [None]:
# remove the comment columns that are not needed from here on
# NOTE(review): the batch loop in "Data preparation" also drops 'created_at'
# from the comments table -- confirm which column list is the intended one
unused_comment_columns = ['node_id', 'updated_at', 'author_id']
temp_comments = temp_comments.drop(columns=unused_comment_columns)

In [None]:
# remove the issue columns that are not needed from here on
unused_issue_columns = ['node_id', 'organization', 'author_id', 'locked']
temp_issues = temp_issues.drop(columns=unused_issue_columns)

### Clean up body

In [None]:
# read the bot-name file and keep only its `bot_name` column
bot_names = pd.read_csv('../bot_names.txt')
bot_list = bot_names['bot_name']

In [None]:
# run the project-local body clean-up on the comments, handing it the bot-name list
temp_comments = annotate.body_cleanup(temp_comments, bot_list)

In [None]:
# run the same body clean-up on the issues, with the same bot-name list
temp_issues = annotate.body_cleanup(temp_issues, bot_list)

### Sentiment analysis

In [None]:
# add the sentiment annotations to the comments; the plot cell below reads
# the `negative_emotion`, `positive_emotion` and `neutral_emotion` columns
# (presumably created here -- verify in `utils/annotate`)
temp_comments = annotate.add_sentiment(temp_comments)

In [None]:
# add the sentiment annotations to the issues as well
temp_issues = annotate.add_sentiment(temp_issues)

### Gratitude

In [None]:
# read the gratitude file and collect the unique entries of its
# `expressions_of_gratitude` column in a set
gratitude_table = pd.read_csv('./utils/gratitude.txt')
gratitude_list = set(gratitude_table['expressions_of_gratitude'])

In [None]:
# add the gratitude annotations to the comments, using the set of gratitude expressions
temp_comments = annotate.add_gratitude(temp_comments, gratitude_list)

In [None]:
# add the gratitude annotations to the issues, using the same set of expressions
temp_issues = annotate.add_gratitude(temp_issues, gratitude_list)

### Plot

In [None]:
# shared plot settings, so every histogram is drawn the same way
bin_number, fig_dpi = 50, 150                     # bins per histogram, output resolution
density_choice, y_label_text = True, 'Density'    # normalised histograms and their y-axis label
alpha_level = 0.5                                 # transparency of the overlaid bars

In [None]:
# create overlapping histograms for emotion in comment text
plt.figure()
for column, color in (('negative_emotion', 'r'),
                      ('positive_emotion', 'g'),
                      ('neutral_emotion', 'grey')):
    plt.hist(temp_comments[column],
             bin_number, density=density_choice, facecolor=color, alpha=alpha_level)
plt.title('Histogram of emotion proportions in comment bodies\nfor '+project)
plt.xlabel('Proportion of emotion words to total words')
# the histograms are normalised (`density_choice` is True), so the y-axis shows
# densities, not counts: use the shared label from the settings cell
# (update `y_label_text` there if `density_choice` is ever switched off)
plt.ylabel(y_label_text)
plt.grid(True)

# save it, using the shared resolution setting instead of a hard-coded value
plt.savefig('../../figures/emotion_histograms/'+project+'.png',
            dpi=fig_dpi)
plt.close()

***