https://www.kaggle.com/ash316/novice-to-grandmaster
# Introduction

In [96]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import squarify
import warnings
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import base64
import io
from scipy.misc import imread
import codecs
from IPython.display import HTML
from matplotlib_venn import venn2
from subprocess import check_output

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
py.init_notebook_mode(connected = True)

In [97]:
response = pd.read_csv('input/multipleChoiceResponses.csv', encoding = 'ISO-8859-1')

In [98]:
response.head()

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,LearningDataScience,CodeWriter,CareerSwitcher,CurrentJobTitleSelect,TitleFit,...,JobFactorExperienceLevel,JobFactorDepartment,JobFactorTitle,JobFactorCompanyFunding,JobFactorImpact,JobFactorRemote,JobFactorIndustry,JobFactorLeaderReputation,JobFactorDiversity,JobFactorPublishingOpportunity
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,,Yes,,DBA/Database Engineer,Fine,...,,,,,,,,,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,,,,...,,,,,,,,Somewhat important,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,,,,...,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,,Yes,,Operations Research Practitioner,Poorly,...,,,,,,,,,,
4,Male,Taiwan,38.0,Employed full-time,,,Yes,,Computer Scientist,Fine,...,,,,,,,,,,


# Some Basic Analysis

In [99]:
print('The total number of respondents:', response.shape[0])
print('Total number of Countries with respondents:', response['Country'].nunique())
response_counts_by_country = response['Country'].value_counts()
print('Country with highest respondents:', response_counts_by_country.index[0],'with', response_counts_by_country.values[0], 'respondents')
print('Youngest respondent:', response['Age'].min(), ' and Oldest respondent:', response['Age'].max())

The total number of respondents: 16716
Total number of Countries with respondents: 52
Country with highest respondents: United States with 4197 respondents
Youngest respondent: 0.0  and Oldest respondent: 100.0


# Gender Split

In [100]:
plt.subplots(figsize = (22, 12))
sns.countplot(y = response['GenderSelect'], order = response['GenderSelect'].value_counts().index)
plt.show()

# Respondents By Country

In [101]:
top15_countries = response_counts_by_country[:15].to_frame()
sns.barplot(top15_countries['Country'], top15_countries.index, palette = 'inferno')
plt.title('Top 15 Countries by # of responsdents')
plt.xlabel('')
fig = plt.gcf()
fig.set_size_inches(10, 10)
plt.show()

all_countries = response_counts_by_country.to_frame()
squarify.plot(sizes = all_countries['Country'].values, label = all_countries.index, color = sns.color_palette('RdYlGn_r', 52))
plt.rcParams.update({'font.size': 20})
plt.title('# of respondents from All Countries')
fig = plt.gcf()
fig.set_size_inches(40, 15)
plt.show()

# Compensation

In [102]:
response['CompensationAmount'] = response['CompensationAmount'].str.replace(',', '')
response['CompensationAmount'] = response['CompensationAmount'].str.replace('-', '')
rates = pd.read_csv('input/conversionRates.csv')
rates.drop('Unnamed: 0', axis = 1, inplace = True)

salary = response[['CompensationAmount', 'CompensationCurrency', 'GenderSelect', 'Country', 'CurrentJobTitleSelect']].dropna()
salary = salary.merge(rates, left_on = 'CompensationCurrency', right_on = 'originCountry', how = 'left')
salary['Salary'] = pd.to_numeric(salary['CompensationAmount']) * salary['exchangeRate']
valid_salary = salary['Salary'].dropna().astype(int)

print('Maximum Salary is USD $', valid_salary.max())
print('Minimum Salary is USD $', valid_salary.min())
print('Median Salary is USD $', valid_salary.median())

Maximum Salary is USD $ 208999999
Minimum Salary is USD $ -2147483648
Median Salary is USD $ 53812.0


In [103]:
plt.subplots(figsize = (15, 8))
valid_salary = salary[(salary['Salary'] < 1000000) & (salary['Salary'] >= 0)]
sns.distplot(valid_salary['Salary'])
plt.title('Salary Distribution', size = 15)
plt.show()

# Compensation by Country

In [104]:
f, ax = plt.subplots(1, 2, figsize = (18, 8))
median_salary_by_country = salary.groupby('Country')['Salary'].median()
top15_median_salary_by_country = median_salary_by_country.sort_values(ascending = False)[:15].to_frame()
sns.barplot('Salary', top15_median_salary_by_country.index, data = top15_median_salary_by_country, palette = 'RdYlGn', ax = ax[0])
ax[0].axvline(salary['Salary'].median(), linestyle = 'dashed')
ax[0].set_title('Top 15 Highest Salary Paying Countries')
ax[0].set_xlabel('')

median_salary_by_country_df = median_salary_by_country.to_frame()
max_respondents_countries_salary = median_salary_by_country_df[median_salary_by_country_df.index.isin(top15_countries.index)]
max_respondents_countries_salary.sort_values(by = 'Salary', ascending = True).plot.barh(width = 0.8, ax = ax[1], color = sns.color_palette('RdYlGn'))
ax[1].axvline(salary['Salary'].median(), linestyle='dashed')
ax[1].set_title('Compensation of Top 15 Respondent Countries')
ax[1].set_xlabel('')
ax[1].set_ylabel('')
plt.subplots_adjust(wspace = 0.8)
plt.show()

# Salary by Gender

In [105]:
plt.subplots(figsize = (10, 8))
sns.boxplot(y = 'GenderSelect', x = 'Salary', data = valid_salary)
plt.ylabel('')
plt.show()

# Age

In [106]:
plt.subplots(figsize = (15, 8))
response['Age'].hist(bins = 50, edgecolor = 'black')
plt.xticks(list(range(0, 80, 5)))
plt.title('Age Distribution')
plt.show()

# Profession & Major

In [107]:
f, ax = plt.subplots(1, 2, figsize = (25, 15))
sns.countplot(y = response['MajorSelect'], ax = ax[0], order = response['MajorSelect'].value_counts().index)
ax[0].set_title('Major')
ax[0].set_ylabel('')
sns.countplot(y = response['CurrentJobTitleSelect'], ax = ax[1], order = response['CurrentJobTitleSelect'].value_counts().index)
ax[1].set_title('Current Job')
ax[1].set_ylabel('')
plt.subplots_adjust(wspace = 0.8)
plt.show()

In [108]:
# Compensation by Job Title
salary_by_job = valid_salary.groupby('CurrentJobTitleSelect')['Salary'].median().to_frame().sort_values(by = 'Salary', ascending = False)
ax = sns.barplot(salary_by_job.Salary, salary_by_job.index, palette = sns.color_palette('inferno', 20))
plt.title('Compensation by Job Title', size = 15)
for i, v in enumerate(salary_by_job.Salary):
    ax.text(0.5, i, v, fontsize = 10, color = 'white', weight = 'bold')

fig = plt.gcf()
fig.set_size_inches(8, 8)
plt.show()

# Machine Learning

In [109]:
f, ax = plt.subplots(1, 2, figsize = (25, 12))
skills = response['MLSkillsSelect'].str.split(',')
skills_set = []
for i in skills.dropna():
    skills_set.extend(i)
    
skills_plot = pd.Series(skills_set).value_counts().sort_values(ascending = False).to_frame()
sns.barplot(skills_plot[0], skills_plot.index, ax = ax[0], palette = sns.color_palette('inferno_r', 15))
ax[0].set_title('ML Skills')
tech = response['MLTechniquesSelect'].str.split(',')
techniques = []
for i in tech.dropna():
    techniques.extend(i)
    
techniques_plot = pd.Series(techniques).value_counts().sort_values(ascending = False).to_frame()
sns.barplot(techniques_plot[0], techniques_plot.index, ax = ax[1], palette = sns.color_palette('inferno_r', 15))
ax[1].set_title('ML Techniques used')
plt.subplots_adjust(wspace = 0.8)
plt.show()

In [110]:
f, ax = plt.subplots(1, 2, figsize = (25, 12))
ml_nextyear = response['MLMethodNextYearSelect'].str.split(',')
method_nextyear = []
for i in ml_nextyear.dropna():
    method_nextyear.extend(i)
    
pd.Series(method_nextyear).value_counts()[:15].sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('winter_r', 15), ax = ax[0])

tool = response['MLToolNextYearSelect'].str.split(',')
tool_nextyear = []
for i in tool.dropna():
    tool_nextyear.extend(i)
pd.Series(tool_nextyear).value_counts()[:15].sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('winter_r', 15), ax = ax[1])
plt.subplots_adjust(wspace = 0.8)
ax[0].set_title('ML Method Next Year')
ax[1].set_title('ML Tool Next Year')
plt.show()

# Best Platforms to Learn

In [111]:
plt.subplots(figsize = (6, 8))
learn = response['LearningPlatformSelect'].str.split(',')
platforms = []
for i in learn.dropna():
    platforms.extend(i)
    
pd.Series(platforms).value_counts()[:15].sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('winter', 15))
plt.title('Best Platforms to Learn', size = 15)
plt.show()

# Hardware Used

In [112]:
plt.subplots(figsize = (10, 10))
hard = response['HardwarePersonalProjectsSelect'].str.split(',')
hardwares = []
for i in hard.dropna():
    hardwares.extend(i)
    
pd.Series(hardwares).value_counts().sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('inferno', 10))
plt.title('Machines Used')
plt.show()

# Where do I get Datasets from?

In [113]:
plt.subplots(figsize = (15, 15))
data = response['PublicDatasetsSelect'].str.split(',')
datasets = []
for i in data.dropna():
    datasets.extend(i)
    
pd.Series(datasets).value_counts().plot.pie(autopct = '%.1f%%', colors = sns.color_palette('Paired', 10), startangle = 90, wedgeprops = {'linewidth': 2, 'edgecolor': 'white'})
plt.title('Dataset Source')
my_circle = plt.Circle((0, 0), 0.7, color = 'white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.ylabel('')
plt.show()

# Code Sharing

In [114]:
plt.subplots(figsize = (15, 15))
code = response['WorkCodeSharing'].str.split(',')
code_shares = []
for i in code.dropna():
    code_shares.extend(i)
    
pd.Series(code_shares).value_counts().plot.pie(autopct = '%.1f%%', shadow = True, colors = sns.color_palette('Set3', 10), startangle = 90, wedgeprops = {'linewidth': 2, 'edgecolor': 'white'})
plt.title('Code Sharing Medium')
my_circle = plt.Circle((0, 0), 0.65, color = 'white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.ylabel('')
plt.show()

# Challenges in Data Science

In [115]:
plt.subplots(figsize = (15, 18))
challenge = response['WorkChallengesSelect'].str.split(',')
challenges = []
for i in challenge.dropna():
    challenges.extend(i)

challenges_df = pd.Series(challenges).value_counts().sort_values(ascending = False).to_frame()
sns.barplot(challenges_df[0], challenges_df.index, palette = sns.color_palette('inferno', 25))
plt.title('Challenges in Data Science')
plt.show()

# Job Satisfaction

In [116]:
satisfy = response.copy()
satisfy['JobSatisfaction'].replace({'10 - Highly Satisfied': '10', '1 - Highly Dissatisfied': '1', 'I prefer not to share': np.NaN}, inplace = True)
satisfy.dropna(subset = ['JobSatisfaction'], inplace = True)
satisfy['JobSatisfaction'] = satisfy['JobSatisfaction'].astype(int)
mean_satisfaction_by_job = satisfy.groupby(['CurrentJobTitleSelect'])['JobSatisfaction'].mean().sort_values(ascending = False).to_frame()
ax = sns.barplot(y = mean_satisfaction_by_job.index, x = mean_satisfaction_by_job.JobSatisfaction, palette = sns.color_palette('inferno', 20))
fig = plt.gcf()
fig.set_size_inches(8, 10)
for i, v in enumerate(mean_satisfaction_by_job.JobSatisfaction):
    ax.text(.1, i, v, fontsize = 10, color = 'white', weight = 'bold')
plt.title('Job Satisfaction 1 (Highly Dissatisfied) ~10 (Highly Satisfied)')
plt.show()

## Job Satisfication by Country

In [117]:
mean_satisfaction_by_country = satisfy.groupby(['Country'])['JobSatisfaction'].mean().sort_values(ascending = True).to_frame()
data = [
    dict(
        type = 'choropleth',
        autocolorscale = False,
        colorscale = 'Viridis',
        reversescale = True,
        showscale = True,
        locations = mean_satisfaction_by_country.index,
        z = mean_satisfaction_by_country['JobSatisfaction'],
        locationmode = 'country names',
        text = mean_satisfaction_by_country['JobSatisfaction'],
        marker = dict(line = dict(color = 'rgb(200, 200, 200)', width = 0.5)),
        colorbar = dict(autotick = True, tickprefix = '', title = 'Satisfaction')
    )
]

layout = dict(
    title = 'Job Satisfaction by Country',
    geo = dict(
        showframe = True,
        showocean = True,
        oceancolor = 'rgb(0, 0, 255)',
        projection = dict(
            type = 'chloropleth',
        ),
        lonaxis = dict(
            showgrid = False,
            gridcolor = 'rgb(102, 102, 102)'
        ),
        lataxis = dict(
            showgrid = False,
            gridcolor = 'rgb(102, 102, 102)'
        )
    ),
)

fig = dict(data = data, layout = layout)
py.iplot(fig, validate = False, filename = 'worldmap2010')

# Python vs R

In [118]:
resp = response.dropna(subset = ['WorkToolsSelect'])
resp = resp.merge(rates, left_on = 'CompensationCurrency', right_on = 'originCountry', how = 'left')
python = resp[(resp['WorkToolsSelect'].str.contains('Python')) & (~resp['WorkToolsSelect'].str.contains('R'))]
R = resp[(~resp['WorkToolsSelect'].str.contains('Python')) & (resp['WorkToolsSelect'].str.contains('R'))]
both = resp[(resp['WorkToolsSelect'].str.contains('Python')) & (resp['WorkToolsSelect'].str.contains('R'))]

# Recommended Language for Beginners

In [119]:
response['LanguageRecommendationSelect'].value_counts()[:2].plot.bar()
plt.show()

## Recommendation by Python and R users

In [120]:
labels_Python = python['LanguageRecommendationSelect'].value_counts()[:5].index
sizes_Python = python['LanguageRecommendationSelect'].value_counts()[:5].values

labels_R = R['LanguageRecommendationSelect'].value_counts()[:5].index
sizes_R = R['LanguageRecommendationSelect'].value_counts()[:5].values

fig = {
  "data": [
    {
      "values": sizes_Python,
      "labels": labels_Python,
      "domain": {"x": [0, .48]},
      "name": "Language",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    },     
    {
      "values": sizes_R,
      "labels": labels_R,
      "text":"CO2",
      "textposition":"inside",
      "domain": {"x": [.54, 1]},
      "name": "Language",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    }
  ],
  "layout": {
        "title":"Language Recommended By Python and R users",
        "annotations": [
            {
                "font": {
                    "size": 30
                },
                "showarrow": False,
                "text": "Python",
                "x": 0.17,
                "y": 0.5
            },
            {
                "font": {
                    "size": 30
                },
                "showarrow": False,
                "text": "R",
                "x": 0.79,
                "y": 0.5
            }
        ]
    }
}

py.iplot(fig, filename='donut')

## Necessary or Not?

In [121]:
f, ax = plt.subplots(1, 2, figsize = (18, 8))

response['JobSkillImportancePython'].value_counts().plot.pie(ax = ax[0], autopct = '%1.1f%%', explode = [0.1, 0, 0], shadow = True, colors = ['g', 'lightblue', 'r'])
ax[0].set_title('Python Necessity')
ax[0].set_ylabel('')

response['JobSkillImportanceR'].value_counts().plot.pie(ax = ax[1], autopct = '%1.1f%%', explode = [0, 0.1, 0], shadow = True, colors = ['lightblue', 'g', 'r'])
ax[1].set_title('R Necessity')
ax[1].set_ylabel('')

plt.show()

## Number of Users by Language

In [122]:
f, ax = plt.subplots(1, 2, figsize = (18, 8))
pd.Series([python.shape[0], R.shape[0], both.shape[0]], index = ['Python', 'R', 'Both']).plot.bar(ax = ax[0])
ax[0].set_title('# of Users')

venn2(subsets = (python.shape[0], R.shape[0], both.shape[0]), set_labels = ('Python Users', 'R Users'))
plt.title('Venn Diagram for Users')
plt.show()

# Compensation

In [123]:
py_salary = (pd.to_numeric(python['CompensationAmount'].dropna()) * python['exchangeRate']).dropna()
py_salary = py_salary[py_salary < 1000000]
R_salary = (pd.to_numeric(R['CompensationAmount'].dropna()) * R['exchangeRate']).dropna()
R_salary = R_salary[R_salary < 1000000]
both_salary = (pd.to_numeric(both['CompensationAmount'].dropna()) * both['exchangeRate']).dropna()
both_salary = both_salary[both_salary < 1000000]

all_salaries = pd.DataFrame([py_salary, R_salary, both_salary])
all_salaries = all_salaries.transpose()
all_salaries.columns = [ 'Python', 'R', 'Both' ]
print('Median Salary for Individual using Python only: ', all_salaries['Python'].median())
print('Median Salary for Individual using R only: ', all_salaries['R'].median())
print('Median Salary for Individual using both Python and R: ', all_salaries['Both'].median())

Median Salary for Individual using Python only:  48725.600000000006
Median Salary for Individual using R only:  48421.99999999999
Median Salary for Individual using both Python and R:  59791.3


In [124]:
all_salaries.plot.box()
plt.title('Compensation by Language')
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.show()

# Language Used by Professionals

In [125]:
py_copy = python.copy()
r_copy = R.copy()
both_copy = both.copy()
py_copy['WorkToolsSelect'] = 'Python'
r_copy['WorkToolsSelect'] = 'R'
both_copy['WorkToolsSelect'] = 'Both'
r_vs_py = pd.concat([ py_copy, r_copy, both_copy ])
r_vs_py = r_vs_py.groupby(['CurrentJobTitleSelect', 'WorkToolsSelect'])['Age'].count().to_frame().reset_index()
r_vs_py.pivot('CurrentJobTitleSelect', 'WorkToolsSelect', 'Age').plot.barh(width = 0.8)
fig = plt.gcf()
fig.set_size_inches(10, 15)
plt.title('Job Title vs Language Used', size = 15)
plt.show()

# Job Function vs Language

In [126]:
r_vs_py = pd.concat([ py_copy, r_copy, both_copy ])
r_vs_py = r_vs_py.groupby(['JobFunctionSelect', 'WorkToolsSelect'])['Age'].count().to_frame().reset_index()
r_vs_py.pivot('JobFunctionSelect', 'WorkToolsSelect', 'Age').plot.barh(width = 0.8)
fig = plt.gcf()
fig.set_size_inches(10, 15)
plt.title('Job Description vs Language Used')
plt.show()

# Tenure vs Language Used

In [127]:
r_vs_py = pd.concat([ py_copy, r_copy, both_copy ])
r_vs_py = r_vs_py.groupby(['Tenure', 'WorkToolsSelect'])['Age'].count().to_frame().reset_index()
r_vs_py.pivot('Tenure', 'WorkToolsSelect', 'Age').plot.barh(width = 0.8)
fig = plt.gcf()
fig.set_size_inches(10, 10)
plt.title('Job Tenure vs Language Used', size = 15)
plt.show()

# Industry vs Language Used

In [128]:
r_vs_py = pd.concat([ py_copy, r_copy, both_copy ])
r_vs_py = r_vs_py.groupby(['EmployerIndustry', 'WorkToolsSelect'])['Age'].count().to_frame().reset_index()
r_vs_py.pivot('EmployerIndustry', 'WorkToolsSelect', 'Age').plot.barh(width = 0.8)
fig = plt.gcf()
fig.set_size_inches(10, 10)
plt.title('Industry vs Language Used')
plt.show()

# Common Tools with Python and R

In [129]:
f, ax = plt.subplots(1, 2, figsize = (20, 15))
py_tools = python['WorkToolsSelect'].str.split(',')
py_tools_list = []
for i in py_tools:
    py_tools_list.extend(i)

py_plot = pd.Series(py_tools_list).value_counts()[1:15].sort_values(ascending = False).to_frame()
sns.barplot(py_plot[0], py_plot.index, ax = ax[0], palette = sns.color_palette('inferno_r', 15))
ax[0].set_title('Commonly Used Tools with Python')

R_tools = R['WorkToolsSelect'].str.split(',')
R_tools_list = []
for i in R_tools:
    R_tools_list.extend(i)

R_plot = pd.Series(R_tools_list).value_counts()[1:15].sort_values(ascending = False).to_frame()
sns.barplot(R_plot[0], R_plot.index, ax = ax[1], palette = sns.color_palette('inferno_r', 15))
ax[1].set_title('Commonly Used Tools with R')

plt.subplots_adjust(wspace = 0.8)
plt.show()

# Asking the Data Scientists

In [130]:
response['DataScienceIdentitySelect'].value_counts()

No                        5314
Yes                       4257
Sort of (Explain more)    3100
Name: DataScienceIdentitySelect, dtype: int64

## Current Job Titles

In [131]:
plt.subplots(figsize = (10, 8))
scientist = response[response['DataScienceIdentitySelect'] == 'Yes']
scientist['CurrentJobTitleSelect'].value_counts().sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('inferno', 15))
plt.title('Job Titles', size = 15)
plt.show()

In [132]:
true_scientists = response[response['CurrentJobTitleSelect'] == 'Data Scientist']
# True scientists haven't answered the question above
true_scientists['DataScienceIdentitySelect'].value_counts()

Series([], Name: DataScienceIdentitySelect, dtype: int64)

In [133]:
all_scientist = pd.concat([scientist, true_scientists])
all_scientist['CurrentJobTitleSelect'].shape[0] # About 40% of total respondents are data scientists or have skills for the same

6690

## Country-Wise Split

In [135]:
plt.subplots(figsize = (10, 0))
scientist_country = all_scientist['Country'].value_counts()[:15].sort_values(ascending = False).to_frame()
sns.barplot(scientist_country.Country, scientist_country.index, palette = 'inferno')
plt.title('Countries by # of Data Scientists', size = 15)
plt.show()

## Employment Status & Education

In [137]:
f, ax = plt.subplots(1, 2, figsize = (25, 10))
sns.countplot(y = all_scientist['EmploymentStatus'], ax = ax[0])
ax[0].set_title('Employment Status')
ax[0].set_ylabel('')

sns.countplot(y = all_scientist['FormalEducation'], order = scientist['FormalEducation'].value_counts().index, ax = ax[1], palette = sns.color_palette('viridis_r', 15))
ax[1].set_title('Formal Education')
ax[1].set_ylabel('')

plt.subplots_adjust(wspace = 0.8)
plt.show()

## Compensation by Formal Education

In [155]:
plt.subplots(figsize = (15, 8))
comp_edu = all_scientist.merge(salary, left_index = True, right_index = True, how = 'left')
comp_edu = comp_edu[comp_edu.Salary.notnull()][comp_edu.Salary <= 1000000][['FormalEducation', 'Salary']]
sns.boxplot(x = 'FormalEducation', y = 'Salary', data = comp_edu)
plt.title('Compensation vs Education')
plt.xticks(rotation = 90)
plt.ylim()
plt.show()

## Previous Job and Salary Change

In [175]:
f, ax = plt.subplots(1, 2, figsize = (24, 13))
past = all_scientist['PastJobTitlesSelect'].str.split(',')
past_job = []
for i in past.dropna():
    past_job.extend(i)
    
pd.Series(past_job).value_counts().sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('summer', 25), ax = ax[0])
ax[0].set_title('Previous Job')

salary = all_scientist['SalaryChange'].str.split(',')
salary_change = []
for i in salary.dropna():
    salary_change.extend(i)
    
pd.Series(salary_change).value_counts().sort_values(ascending = True).plot.barh(width = 0.8, color = sns.color_palette('summer', 10), ax = ax[1])
ax[1].set_title('Salary Change')

#plt.tight_layout()
plt.show()

## Tools used at Work

In [158]:
plt.subplots(figsize = (8, 8))
tools = all_scientist['WorkToolsSelect'].str.split(',')
tools_work = []
for i in tools.dropna():
    tools_work.extend(i)
    
pd.Series(tools_work).value_counts()[:15].sort_values(ascending = True).plot.barh(width = 0.9, color = sns.color_palette('RdYlGn', 15))
plt.show()

## Where did they Learn from?

In [161]:
course = all_scientist['CoursePlatformSelect'].str.split(',')
course_plat = []
for i in course.dropna():
    course_plat.extend(i)
    
course_plat = pd.Series(course_plat).value_counts()

blogs = all_scientist['BlogsPodcastsNewslettersSelect'].str.split(',')
blogs_fam = []
for i in blogs.dropna():
    blogs_fam.extend(i)
    
blogs_fam = pd.Series(blogs_fam).value_counts()
labels1 = course_plat.index
sizes1 = course_plat.values
labels2 = blogs_fam.index
sizes2 = blogs_fam.values

fig = {
  "data": [
    {
      "values": sizes1,
      "labels": labels1,
      "domain": {"x": [0, .48]},
      "name": "MOOC",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    },     
    {
      "values": sizes2,
      "labels": labels2,
      "text":"CO2",
      "textposition":"inside",
      "domain": {"x": [.54, 1]},
      "name": "Blog",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    }
  ],
  "layout": {
        "title":"Blogs and Online Platforms",
        "showlegend":True,
        "annotations": [
          {
            "font": {
              "size": 12
            },
            "showarrow": False,
            "text": "MOOC's",
            "x": 0.18,
            "y": 0.5
          },
          {
            "font": {
              "size": 12
            },
            "showarrow": False,
            "text": "BLOGS",
            "x": 0.83,
            "y": 0.5
          }
        ]
  }
}

py.iplot(fig, filename='donut')

## Time spent on Tasks

In [164]:
import itertools

plt.subplots(figsize = (22, 10))
time_spent = ['TimeFindingInsights', 'TimeVisualizing', 'TimeGatheringData', 'TimeModelBuilding']
length = len(time_spent)
for i, j in itertools.zip_longest(time_spent, range(length)):
    plt.subplot((length / 2), 2, j+1)
    plt.subplots_adjust(wspace = 0.2, hspace = 0.5)
    all_scientist[i].hist(bins = 10, edgecolor = 'black')
    plt.axvline(all_scientist[i].mean(), linestyle = 'dashed', color = 'r')
    plt.title(i, size = 20)
    plt.xlabel('% Time')
    
plt.show()

## Cloud Services

In [165]:
cloud=['WorkToolsFrequencyAmazonML', 'WorkToolsFrequencyAWS', 'WorkToolsFrequencyCloudera', 'WorkToolsFrequencyHadoop', 'WorkToolsFrequencyAzure']
plt.subplots(figsize = (30, 15))
length = len(cloud)

for i, j in itertools.zip_longest(cloud, range(length)):
    plt.subplot((length/2 + 1), 3, j+1)
    plt.subplots_adjust(wspace = 0.2, hspace = 0.5)
    sns.countplot(i, data = all_scientist)
    plt.title(i,size=20)
    plt.ylabel('')
    plt.xlabel('')
    
plt.show()

## Importance of Visualizations

In [178]:
f, ax = plt.subplots(1, 2, figsize = (25, 12))
sns.countplot(all_scientist['JobSkillImportanceVisualizations'], ax = ax[0])
ax[0].set_title('Job Importance for Visuals')
ax[0].set_xlabel('')
all_scientist['WorkDataVisualizations'].value_counts().plot.pie(autopct = '%2.0f%%', colors = sns.color_palette('Paired', 10), ax = ax[1])

ax[1].set_title('Use of Visualizations in Projects')
my_circle = plt.Circle((0, 0), 0.7, color = 'white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.ylabel('')
plt.show()

## BI Tools

In [179]:
BI = ['WorkToolsFrequencyQlik', 'WorkToolsFrequencySAPBusinessObjects', 'WorkToolsFrequencyTableau', 'WorkToolsFrequencyTIBCO', 'WorkToolsFrequencyAngoss', 'WorkToolsFrequencyIBMCognos', 'WorkToolsFrequencyKNIMECommercial', 'WorkToolsFrequencyExcel']
plt.subplots(figsize = (23, 12))
length = len(BI)
for i, j in itertools.zip_longest(BI, range(length)):
    plt.subplot((length/2), 3, j+1)
    plt.subplots_adjust(wspace = 0.2, hspace = 0.5)
    sns.countplot(i, data = scientist)
    plt.title(i, size = 20)
    plt.ylabel('')
    plt.xlabel('')
    
plt.show()

## Knowledge of Algorithms (Maths and Stats)

In [180]:
f, ax = plt.subplots(1, 2, figsize = (25, 12))
sns.countplot(y = all_scientist['AlgorithmUnderstandingLevel'], order = scientist['AlgorithmUnderstandingLevel'].value_counts().index, ax = ax[0], palette = sns.color_palette('summer', 15))
sns.countplot(scientist['JobSkillImportanceStats'], ax = ax[1])
ax[0].set_title('Algorithm Understanding')
ax[0].set_ylabel('')
ax[1].set_title('Knowledge of Stats')
ax[1].set_xlabel('')
plt.show()

## Learning Platform Usefulness

In [183]:
plt.subplots(figsize = (20, 35))
useful = ['LearningPlatformUsefulnessBlogs', 'LearningPlatformUsefulnessCollege', 'LearningPlatformUsefulnessCompany', 'LearningPlatformUsefulnessKaggle', 'LearningPlatformUsefulnessCourses', 'LearningPlatformUsefulnessProjects', 'LearningPlatformUsefulnessTextbook', 'LearningPlatformUsefulnessYouTube']
length = len(useful)
for i, j in itertools.zip_longest(useful, range(length)):
    plt.subplot((length/2), 3, j+1)
    plt.subplots_adjust(wspace = 0.2, hspace = 0.2)
    scientist[i].value_counts().plot.pie(autopct = '%1.1f%%', colors = ['g', 'lightblue', 'r'], wedgeprops = { 'linewidth' : 2, 'edgecolor' : 'white' })
    plt.title(i, size = 25)
    my_circle = plt.Circle((0, 0), 0.7, color = 'white')
    p = plt.gcf()
    p.gca().add_artist(my_circle)
    plt.xlabel('')
    plt.ylabel('')

plt.show()

## What should the Resume have?

In [184]:
f, ax = plt.subplots(1, 2, figsize = (22, 8))
sns.countplot(y = all_scientist['ProveKnowledgeSelect'], order = all_scientist['ProveKnowledgeSelect'].value_counts().index, ax = ax[0], palette = sns.color_palette('inferno', 15))
ax[0].set_title('How to prove my knowledge')
sns.countplot(all_scientist['JobSkillImportanceKaggleRanking'], ax = ax[1])
ax[1].set_title('Kaggle Rank')
plt.show()

## How did they search for Jobs?

In [185]:
plt.subplots(figsize = (10, 8))
scientist.groupby(['EmployerSearchMethod'])['Age'].count().sort_values(ascending = True).plot.barh(width = 0.8, color = sns.color_palette('winter', 10))
plt.title('Job Search Method', size = 15)
plt.ylabel('')
plt.show()

## Checking the free Responses

In [190]:
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords

free = pd.read_csv('input/freeformResponses.csv')
stop_words = set(stopwords.words('english'))
stop_words.update(',', ';', '!', '?', '.', '(', ')', '$', '#', '+', ':', '...')

In [193]:
#nltk.download()

In [194]:
motivation = free['KaggleMotivationFreeForm'].dropna().apply(nltk.word_tokenize)
motivate = []
for i in motivation:
    motivate.extend(i)
    
motivate = pd.Series(motivate)
motivate = ([i for i in motivate.str.lower() if i not in stop_words])
f1 = open('kaggle.png', 'wb')
f1.write(codecs.decode(kaggle, 'base64'))
f1.close()
img1 = imread('kaggle.png')
hcmask1 = img1
wc = WordCloud(background_color = 'black', max_words = 4000, mask = hcmask1, stopwords = STOPWORDS, max_font_size = 60, width = 1000, height = 1000)
wc.generate(' '.join(motivate))
plt.imshow(wc)
plt.axis('off')
fig = plt.gcf()
fig.set_size_inches(10, 10)
plt.show()

NameError: name 'kaggle' is not defined

## Most Frequently Used Libraries

In [195]:
library = free['WorkLibrariesFreeForm'].dropna().apply(nltk.word_tokenize)
lib = []
for i in library:
    lib.extend(i)

lib = pd.Series(lib)
lib = ([i for i in lib.str.lower() if i not in stop_words])
lib = pd.Series(lib)
lib = lib.value_counts().reset_index()
lib.loc[lib['index'].str.contains('Pandas|pandas|panda'), 'index'] = 'Pandas'
lib.loc[lib['index'].str.contains('Tensorflow|tensorflow|tf|tensor'), 'index'] = 'Tensorflow'
lib.loc[lib['index'].str.contains('Scikit|scikit|sklearn'), 'index'] = 'Sklearn'
lib = lib.groupby('index')[0].sum().sort_values(ascending = False).to_frame()
R_packages = ['dplyr', 'tidyr', 'ggplot2', 'caret', 'randomforest', 'shiny', 'R markdown', 'ggmap', 'leaflet', 'ggvis', 'stringr', 'tidyverse', 'plotly']
Py_packages = ['Pandas', 'Tensorflow', 'Sklearn', 'matplotlib', 'numpy', 'scipy', 'seaborn', 'keras', 'xgboost', 'nltk', 'plotly']

f, ax = plt.subplots(1, 2, figsize = (18,10))
lib[lib.index.isin(Py_packages)].sort_values(by = 0, ascending = True).plot.barh(ax = ax[0], width = 0.9, color = sns.color_palette('viridis', 15))
ax[0].set_title('Most Frequently Used Py Libraries')

lib[lib.index.isin(R_packages)].sort_values(by = 0, ascending = True).plot.barh(ax = ax[1], width = 0.9, color = sns.color_palette('viridis', 15))
ax[1].set_title('Most Frequently Used R Libraries')
ax[1].set_ylabel('')

plt.show()