In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
def one_hot_encoding(series):
    '''
    Customized one-hot encoding for handling multiple-value answers.

    Each entry of `series` may hold several answers separated by ';'.
    An entry gets a 1 in a category column only when that category is
    exactly one of its own ';'-separated answers (compared after
    stripping surrounding whitespace). Matching on whole answers rather
    than on substrings keeps 'Java' from being flagged for an entry
    that only lists 'JavaScript'.

    Input:
    series: pandas series containing the values of survey answers

    Output:
    oh_df: dataframe containing the one hot encoded values of the input
           series: one column per distinct answer (in order of first
           appearance) and one row per entry, on a fresh 0..n-1 index.
           Missing entries produce a row of zeros.

    '''
    # Split once and reuse the result for both the category list and the rows.
    answers = series.str.split(';')
    cat_list = answers.explode().str.strip().dropna().unique()

    one_hot = []
    for parts in answers.values:
        # Missing answers come back from .str.split as a scalar NaN/None
        # rather than a list of tokens, so they select no category.
        if pd.api.types.is_list_like(parts):
            chosen = {part.strip() for part in parts}
        else:
            chosen = set()
        one_hot.append([1 if cat in chosen else 0 for cat in cat_list])

    oh_df = pd.DataFrame(one_hot, columns=cat_list)
    return oh_df

In [None]:
# Load the survey CSV into a dataframe.
df_free = pd.read_csv('survey_data_17_to_19.csv', low_memory=False)

# Upper outlier fence from the IQR rule, using the quartiles of the
# unfiltered Salary column.
salary_summary = df_free['Salary'].describe()
Q1 = salary_summary['25%']
Q3 = salary_summary['75%']
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Keep only rows with a positive salary that does not exceed the fence,
# then renumber the rows from 0.
with_salary_no_outlier = df_free['Salary'].gt(0) & df_free['Salary'].le(upper_bound)
df_free = df_free.loc[with_salary_no_outlier].reset_index(drop=True)

df_free.head()

Combine all the one-hot encoded variables into `one_df`.

In [None]:
# Survey columns whose answers are ';'-separated multi-select values.
multi_value_cols = ['DeveloperType','LanguageWorkedWith','DatabaseWorkedWith',
                    'PlatformWorkedWith','WebFrameWorkedWith']

# Encode every column first and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on each pass.
encoded_frames = [one_hot_encoding(df_free[col]).add_prefix(col + ': ')
                  for col in multi_value_cols]
one_df = pd.concat(encoded_frames, axis=1)

# Attach the target column; assignment aligns on the row index.
one_df['Salary'] = df_free['Salary']
one_df.head()

Train and Test split

In [None]:
# Target column name; every other column of one_df is used as a feature.
y_col = 'Salary'
y = one_df[y_col]
Xs = one_df.drop(columns=[y_col])

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    y,
    test_size=0.3,
    random_state=42,
)

Modeling

In [None]:
# Fit an ordinary least squares model on the training split
# (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions; the cell displays (R^2, MSE).
preds = model.predict(X_test)
(r2_score(y_test, preds), mean_squared_error(y_test, preds))

Interpretation and Evaluation

In [None]:
# One row per training feature, indexed by feature name, holding the
# fitted linear-regression coefficient for that feature.
feat_coef = pd.DataFrame(
    {'Coefficient': model.coef_},
    index=pd.Index(X_train.columns, name='Feature'),
)

In [None]:
# Select the coefficient rows whose feature label mentions DeveloperType
contains_col = feat_coef.index.str.contains('DeveloperType')
sub_feat = feat_coef.loc[contains_col]

# Drop the column prefix so the bars are labelled by category only
short_labels = sub_feat.index.str.replace('DeveloperType'+': ','')
sub_feat.index = short_labels
coef_vals = sub_feat.sort_values(by='Coefficient')

# Horizontal bar chart of the sorted coefficients, with a title and no y label
coef_vals.plot.barh(color='b', figsize=(10, 10))
plt.ylabel('')
plt.title('DeveloperType', fontsize=20)

In [None]:
# One panel per technology column, laid out on a 2x2 grid.
fig, ax = plt.subplots(2, 2, figsize=(15, 20))

plot_cols = ['LanguageWorkedWith','DatabaseWorkedWith','PlatformWorkedWith','WebFrameWorkedWith']

# ax.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
# No plt.figure() call is made inside the loop: every panel draws on the
# grid created above, so an extra figure per column would only be left
# empty and open.
for col, panel in zip(plot_cols, ax.flat):
    prefix = col + ': '

    # Select the coefficient rows belonging to this column and strip the
    # prefix so the bars are labelled by category only.
    is_col_feature = feat_coef.index.str.startswith(prefix)
    sub_feat = feat_coef[is_col_feature].copy()
    sub_feat.index = sub_feat.index.str[len(prefix):]

    # Create the plot and add labels.
    sub_feat.sort_values('Coefficient').plot.barh(ax=panel)
    panel.spines['right'].set_visible(False)
    panel.spines['top'].set_visible(False)
    panel.set_ylabel('')
    panel.set_title(str(col), fontsize=20)

plt.show();