In [None]:
'''
Ideas for Blogpost:
- Covid-19
- Survival Analysis
- Time Series with StackOverflow Data

How do recommendations change over time? 
How does income change over time?
Are there trends visible over the 5-year period?

Survival Analysis Blogposts: 
https://towardsdatascience.com/survival-analysis-intuition-implementation-in-python-504fde4fcf8e
https://pub.towardsai.net/survival-analysis-with-python-tutorial-how-what-when-and-why-19a5cfb3c312?sk=65c086e2b02ac43b2d577e02141fd56a&source=friends_link&gi=65238e0712d
https://www.kdnuggets.com/2020/07/complete-guide-survival-analysis-python-part1.html
'''

# setup
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from helper_functions import row_to_header

%matplotlib inline


# Blogpost: Analysing StackOverflow Data

StackOverflow:  
- community of software developers, coders, companies 
- public platform for coding questions and answers & other products
- by its own account, one of the 50 most popular websites in the world
- different products: Stack Overflow, Stack Overflow for Teams, Stack Overflow Advertising, Stack Overflow Talent
- supports companies when looking for new employees

https://stackoverflow.com/company

Data is available at: https://insights.stackoverflow.com/survey/

The survey aims to gather information on "all aspects of the developer experience".

In the course, the questions were:
- How to enter the field?
- What are job placement and salary rates for bootcamps?
- What relates to salary/job satisfaction?

Now, additionally look at StackOverflow Survey Data from 2015 to 2019

Suggestions: What languages were most popular in each year? What other changes can you observe over time?

Questions:

0. Why does StackOverflow collect the data? Which questions are included every year? 
- get to know users (e.g. educational and programming background, demographics etc.)
- get info to improve career services, other StackOverflow products for recruiters

specifically: 
- what is connected with higher job satisfaction or better payment? 
- role of remote work

Why is this interesting for recruiters?

1. 
2. 
3. 

## Load & Prepare Data

In [None]:
# Load the 2015 survey export. With header=None no line of the file is used
# as column names, so every line arrives as a data row.
df_raw_2015 = pd.read_csv(
    filepath_or_buffer='data/survey_results_public_2015.csv',
    header=None,
    low_memory=False,
)
# Peek at the first three rows: the column names are still sitting inside the
# data rows, so this frame needs cleaning before it can be used.
df_raw_2015.head(n=3)

In [None]:
# Cleaning step 1: discard the first row of the raw 2015 frame by keeping
# everything from the second row onwards (all columns are retained).
df_2015 = df_raw_2015.iloc[1:]
df_2015.head(n=2)

In [None]:
# Cleaning step 2: use the project helper `row_to_header` (imported from
# helper_functions) on the sliced 2015 frame.
# NOTE(review): the second argument presumably selects which row supplies the
# column names — confirm against helper_functions.row_to_header.
new_df_2015 = row_to_header(df_2015, 0)
# Check the result: show the first two rows of the cleaned frame.
new_df_2015.head(2)

In [None]:
# Load the 2016 survey export (default header handling: the first line of the
# file becomes the column names).
df_raw_2016 = pd.read_csv(
    filepath_or_buffer='data/survey_results_public_2016.csv',
    low_memory=False,
)
# Quick look at the first two rows.
df_raw_2016.head(n=2)


In [None]:
# Load the 2017 survey export (default header handling: the first line of the
# file becomes the column names).
df_raw_2017 = pd.read_csv(
    filepath_or_buffer='data/survey_results_public_2017.csv',
    low_memory=False,
)
# Quick look at the first two rows.
df_raw_2017.head(n=2)

In [None]:
# Load the 2018 survey export (default header handling: the first line of the
# file becomes the column names).
df_raw_2018 = pd.read_csv(
    filepath_or_buffer='data/survey_results_public_2018.csv',
    low_memory=False,
)
# Quick look at the first two rows.
df_raw_2018.head(n=2)

In [None]:
# Load the 2019 survey export (default header handling: the first line of the
# file becomes the column names).
df_raw_2019 = pd.read_csv(
    filepath_or_buffer='data/survey_results_public_2019.csv',
    low_memory=False,
)
# Quick look at the first two rows.
df_raw_2019.head(n=2)

In [None]:
# Inspect the column dtypes and non-null counts of the 2016 survey.
# `info()` prints each column's dtype, so the separate bare `.dtypes`
# expression (which was never displayed mid-cell) has been dropped.
df_raw_2016.info()

# Keep only the numeric columns.
df_numerics_only = df_raw_2016.select_dtypes(include=np.number)

# TODO: filter for categorical vars

# Show a short preview instead of dumping the whole frame.
df_numerics_only.head()


### Which variables appear in all years? 

- demographics: age, gender, country
- education: 
- occupation: 
- job satisfaction

In [None]:
# Columns noted for 2015:
#   'Country', 'Age', 'Gender', 'Occupation', 'Compensation',
#   'Compensation: midpoint', 'Employment Status', 'Job Satisfaction',
#   'Years IT / Programming Experience',
#   "How often are Stack Overflow's answers helpful"
# List every column name of the cleaned 2015 frame as a plain Python list.
new_df_2015.columns.tolist()


In [None]:
# Columns noted for 2016:
#   'country', 'gender', 'education', 'occupation', 'employment_status',
#   'salary_range', 'salary_midpoint', 'job_satisfaction', 'why_stack_overflow'
# List every column name of the 2016 frame as a plain Python list.
df_raw_2016.columns.tolist()

In [None]:
# Columns noted for 2017:
#   'Gender', 'Country', 'FormalEducation', 'Professional', 'Salary',
#   'JobSatisfaction'
# List every column name of the 2017 frame as a plain Python list.
df_raw_2017.columns.tolist()

In [None]:
# Columns noted for 2018:
#   'Age', 'Gender', 'Country', 'Employment', 'Salary', 'FormalEducation',
#   'JobSatisfaction'
# List every column name of the 2018 frame as a plain Python list.
df_raw_2018.columns.tolist()

In [None]:
# Columns noted for 2019:
#   'Country', 'Age', 'Gender', 'Employment', 'EdLevel', 'JobSat'
# List every column name of the 2019 frame as a plain Python list.
df_raw_2019.columns.tolist()

check for missing values

In [None]:
# Missing-value overview (useful?).
# NOTE(review): `df` was not defined in any earlier cell. The columns this
# and the following cells read ('Professional', 'Salary', 'JobSatisfaction')
# are the ones noted for the 2017 survey, so `df` is bound to that frame
# here — confirm this is the intended dataset.
df = df_raw_2017

num_rows, num_cols = df.shape  # number of rows / columns in the dataset

missing_share = df.isnull().mean()  # fraction of missing values per column
no_nulls = set(df.columns[missing_share == 0])  # columns with 0 missing values
most_missing_cols = df.columns[missing_share > .75].tolist()  # columns with more than 75% of the values missing

# drop rows, but only when all of their values are missing
all_row = df.dropna(axis=0, how='all')

# Display last so the list is actually rendered as the cell output.
most_missing_cols

check if columns have correct data type

explore data with bar charts, histograms and scatterplots

In [None]:
# Bar chart of the proportion of individuals in each professional category.
# NOTE(review): the original read from an undefined `df`; 'Professional' is a
# column noted for the 2017 survey, so that frame is used — confirm.
status_vals = df_raw_2017['Professional'].value_counts()  # counts for each Professional status

# Divide by the total number of rows (not only the non-missing ones), as the
# original did, so bars are shares of all respondents.
fig, ax = plt.subplots()
(status_vals / df_raw_2017.shape[0]).plot(kind="bar", ax=ax)
ax.set(
    title="What kind of developer are you?",
    xlabel="Professional",
    ylabel="Proportion of respondents",
);

In [None]:
# Which variables are of interest?
# Candidates picked from the 2015 column list:
#   'Country', 'Age', 'Gender', 'Years IT / Programming Experience',
#   'Occupation', 'Desktop Operating System', 'Employment Status', 'Industry',
#   'Job Satisfaction', 'Purchasing Power', 'Remote Status',
#   'Changed Jobs in last 12 Months', 'Open to new job opportunities',
#   'Why use Stack Overflow: Help for job',
#   'Why use Stack Overflow: To give help',
#   "Why use Stack Overflow: Can't do job without it",
#   'Why use Stack Overflow: Maintain online presence',
#   'Why use Stack Overflow: Demonstrate expertise',
#   'Why use Stack Overflow: Communicate with others',
#   'Why use Stack Overflow: Receive help on personal projects',
#   'Why use Stack Overflow: Love to learn',
#   "Why use Stack Overflow: I don't use Stack Overflow",
#   "How often are Stack Overflow's answers helpful",
#   'Why answer: Help a programmer in need',
#   'Why answer: Help future programmers',
#   'Why answer: Demonstrate expertise', 'Why answer: Self promotion',
#   'Why answer: Sense of responsibility to developers',
#   'Why answer: No idea',
#   "Why answer: I don't answer and I don't want to",
#   "Why answer: I don't answer but I want to"
# Full list of all columns in the cleaned 2015 frame:
new_df_2015.columns.tolist()


Questions: 

descriptive: Why do people use SO & why do they answer (or not)?
descriptive: 

1. Do people with more work experience in IT find StackOverflow less or more helpful than people with less work experience in IT? 
2. 

In [None]:
# tidy data

# transform data 

In [None]:
# visualise data

In [None]:
# Subset to only the quantitative variables.
# NOTE(review): the original read from an undefined `df`; 'Salary' and
# 'JobSatisfaction' are columns noted for the 2017 survey, so that frame is
# used here — confirm the remaining columns exist in it as well.
num_vars = df_raw_2017[
    ['Salary', 'CareerSatisfaction', 'HoursPerWeek', 'JobSatisfaction', 'StackOverflowSatisfaction']
]

In [None]:
# Proportion of individuals in the dataset with a salary reported:
# rows that survive dropping missing 'Salary' values, over all rows.
n_with_salary = len(num_vars.dropna(subset=['Salary']))
prop_sals = n_with_salary / len(num_vars)
prop_sals

In [None]:
# Try to fit a linear model of Salary on the other quantitative variables.
# NOTE(review): `sal_rm` was not defined in any earlier cell. Going by its name
# and the surrounding cells it is presumably `num_vars` with the rows lacking
# a Salary removed, so it is built that way here — confirm.
sal_rm = num_vars.dropna(subset=['Salary'])

# Explanatory variables and response variable, both taken from sal_rm.
X = sal_rm[['CareerSatisfaction', 'HoursPerWeek', 'JobSatisfaction', 'StackOverflowSatisfaction']]
y = sal_rm['Salary']

# Split data into training and test data, and set up a linear model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current versions; it is dropped.
lm_model = LinearRegression()

# If our model works, it should just fit our model to the data. Otherwise, it
# will let us know. Only rows without a Salary were removed above, so X may
# still hold missing values; the resulting ValueError is caught and shown
# instead of being silently swallowed by a bare `except`.
try:
    lm_model.fit(X_train, y_train)
except ValueError as err:
    print("Oh no! It doesn't work!!!")
    print(err)

In [None]:
# Drop every row of num_vars that has a NaN in any column (this was the
# removal process used in the screencast) and keep the result in all_rm.
all_rm = num_vars.dropna(axis=0, how='any')

In [None]:
# Visualising: box plot of 'dep_time' grouped by 'origin', with rotated tick
# labels and a 5x6 figure.
# FIXME(review): `df_flights` is not defined in any cell shown in this notebook,
# so this cell fails on a fresh kernel; it looks like a template carried over
# from another analysis — replace with a frame/columns from this notebook or
# remove. TODO confirm.
df_flights.boxplot('dep_time','origin',rot = 30,figsize=(5,6))

## Modeling

In [None]:
# model

# communicate 